Home | History | Annotate | Download | only in phonenumbers
      1 // Copyright (C) 2011 The Libphonenumber Authors
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 //
     15 // Author: Lara Rennie
     16 // Author: Tao Huang
     17 //
     18 // Implementation of a stateful class that finds and extracts telephone numbers
     19 // from text.
     20 
     21 #include "phonenumbers/phonenumbermatcher.h"
     22 
     23 #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
     24 #error phonenumbermatcher depends on ICU \
     25     (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
     26 #endif  // I18N_PHONENUMBERS_USE_ICU_REGEXP
     27 
     28 #include <ctype.h>
     29 #include <stddef.h>
     30 #include <limits>
     31 #include <map>
     32 #include <string>
     33 #include <utility>
     34 #include <vector>
     35 
     36 #include <unicode/uchar.h>
     37 
     38 #include "phonenumbers/alternate_format.h"
     39 #include "phonenumbers/base/logging.h"
     40 #include "phonenumbers/base/memory/scoped_ptr.h"
     41 #include "phonenumbers/base/memory/singleton.h"
     42 #include "phonenumbers/callback.h"
     43 #include "phonenumbers/default_logger.h"
     44 #include "phonenumbers/encoding_utils.h"
     45 #include "phonenumbers/normalize_utf8.h"
     46 #include "phonenumbers/phonemetadata.pb.h"
     47 #include "phonenumbers/phonenumber.pb.h"
     48 #include "phonenumbers/phonenumbermatch.h"
     49 #include "phonenumbers/phonenumberutil.h"
     50 #include "phonenumbers/regexp_adapter.h"
     51 #include "phonenumbers/regexp_adapter_icu.h"
     52 #include "phonenumbers/stringutil.h"
     53 
     54 #ifdef I18N_PHONENUMBERS_USE_RE2
     55 #include "phonenumbers/regexp_adapter_re2.h"
     56 #endif  // I18N_PHONENUMBERS_USE_RE2
     57 
     58 using std::make_pair;
     59 using std::map;
     60 using std::numeric_limits;
     61 using std::string;
     62 using std::vector;
     63 
     64 namespace i18n {
     65 namespace phonenumbers {
     66 
     67 namespace {
     68 // Returns a regular expression quantifier with an upper and lower limit.
     69 string Limit(int lower, int upper) {
     70   DCHECK_GE(lower, 0);
     71   DCHECK_GT(upper, 0);
     72   DCHECK_LT(lower, upper);
     73   return StrCat("{", lower, ",", upper, "}");
     74 }
     75 
     76 bool IsInvalidPunctuationSymbol(char32 character) {
     77   return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
     78 }
     79 
     80 bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
     81                              const PhoneNumberUtil& util) {
     82   // The characters 'x' and 'X' can be (1) a carrier code, in which case they
     83   // always precede the national significant number or (2) an extension sign,
     84   // in which case they always precede the extension number. We assume a
     85   // carrier code is more than 1 digit, so the first case has to have more than
     86   // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
     87   // 'x' or 'X'.
     88   size_t found;
     89   found = candidate.find_first_of("xX");
     90   // We ignore the character if 'x' or 'X' appears as the last character of
     91   // the string.
     92   while (found != string::npos && found < candidate.length() - 1) {
     93     // We only look for 'x' or 'X' in ASCII form.
     94     char next_char = candidate[found + 1];
     95     if (next_char == 'x' || next_char == 'X') {
     96       // This is the carrier code case, in which the 'X's always precede the
     97       // national significant number.
     98       ++found;
     99       if (util.IsNumberMatchWithOneString(
    100               number, candidate.substr(found, candidate.length() - found))
    101           != PhoneNumberUtil::NSN_MATCH) {
    102         return false;
    103       }
    104     } else {
    105       string normalized_extension(candidate.substr(found,
    106                                                    candidate.length() - found));
    107       util.NormalizeDigitsOnly(&normalized_extension);
    108       if (normalized_extension != number.extension()) {
    109         return false;
    110       }
    111     }
    112     found = candidate.find_first_of("xX", found + 1);
    113   }
    114   return true;
    115 }
    116 
    117 bool AllNumberGroupsRemainGrouped(
    118     const PhoneNumberUtil& util,
    119     const PhoneNumber& phone_number,
    120     const string& normalized_candidate,
    121     const vector<string>& formatted_number_groups) {
    122   size_t from_index = 0;
    123   // Check each group of consecutive digits are not broken into separate
    124   // groupings in the normalized_candidate string.
    125   for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
    126     // Fails if the substring of normalized_candidate starting from from_index
    127     // doesn't contain the consecutive digits in formatted_number_groups.at(i).
    128     from_index = normalized_candidate.find(formatted_number_groups.at(i),
    129                                            from_index);
    130     if (from_index == string::npos) {
    131       return false;
    132     }
    133     // Moves from_index forward.
    134     from_index += formatted_number_groups.at(i).length();
    135     if (i == 0 && from_index < normalized_candidate.length()) {
    136       // We are at the position right after the NDC. We get the region used for
    137       // formatting information based on the country code in the phone number,
    138       // rather than the number itself, as we do not need to distinguish between
    139       // different countries with the same country calling code and this is
    140       // faster.
    141       string region;
    142       util.GetRegionCodeForCountryCode(phone_number.country_code(), &region);
    143       string ndd_prefix;
    144       util.GetNddPrefixForRegion(region, true, &ndd_prefix);
    145       // Note although normalized_candidate might contain non-ASCII formatting
    146       // characters, they won't be treated as ASCII digits when converted to a
    147       // char.
    148       if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) {
    149         // This means there is no formatting symbol after the NDC. In this case,
    150         // we only accept the number if there is no formatting symbol at all in
    151         // the number, except for extensions. This is only important for
    152         // countries with national prefixes.
    153         string national_significant_number;
    154         util.GetNationalSignificantNumber(
    155             phone_number, &national_significant_number);
    156         return HasPrefixString(normalized_candidate.substr(
    157             from_index - formatted_number_groups.at(i).length()),
    158             national_significant_number);
    159         }
    160       }
    161     }
    162     // The check here makes sure that we haven't mistakenly already used the
    163     // extension to match the last group of the subscriber number. Note the
    164     // extension cannot have formatting in-between digits.
    165     return normalized_candidate.substr(from_index)
    166         .find(phone_number.extension()) != string::npos;
    167 }
    168 
    169 bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
    170 #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
    171   if (!alternate_formats->ParseFromArray(alternate_format_get(),
    172                                          alternate_format_size())) {
    173     LOG(ERROR) << "Could not parse binary data.";
    174     return false;
    175   }
    176   return true;
    177 #else
    178   return false;
    179 #endif
    180 }
    181 
    182 }  // namespace
    183 
// Singleton that owns the regular-expression factories and the compiled
// patterns used by PhoneNumberMatcher. The private string members are the
// building blocks the constructor concatenates into those patterns. Members
// are initialised in declaration order, and later members are built from
// earlier ones, so the declaration order below must be kept in step with the
// constructor's initializer list.
class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
 private:
  friend class Singleton<PhoneNumberMatcherRegExps>;

  // Opening bracket characters (ASCII and fullwidth), for use inside a
  // character class.
  string opening_parens_;
  // Closing bracket characters (ASCII and fullwidth), for use inside a
  // character class.
  string closing_parens_;
  // Character class matching anything that is neither an opening nor a
  // closing bracket.
  string non_parens_;
  // Limit on the number of pairs of brackets in a phone number.
  string bracket_pair_limit_;
  // Helper strings for the matching_brackets_ pattern.
  // An opening bracket at the beginning may not be closed, but subsequent ones
  // should be. It's also possible that the leading bracket was dropped, so we
  // shouldn't be surprised if we see a closing bracket first.
  string leading_maybe_matched_bracket_;
  string bracket_pairs_;
  // Limit on the number of leading (plus) characters.
  string lead_limit_;
  // Limit on the number of consecutive punctuation characters.
  string punctuation_limit_;
  // The maximum number of digits allowed in a digit-separated block. As we
  // allow all digits in a single block, this should be set high enough to
  // accommodate the entire national number and the international country code.
  int digit_block_limit_;
  // Limit on the number of blocks separated by punctuation. Uses
  // digit_block_limit_ since some formats use spaces to separate each digit.
  string block_limit_;
  // A punctuation sequence allowing white space.
  string punctuation_;
  // A digits block without punctuation.
  string digit_sequence_;
  // Punctuation that may be at the start of a phone number - brackets and plus
  // signs.
  string lead_class_chars_;
  // Same as lead_class_chars_, but enclosed as a character class.
  string lead_class_;
  // Extra helper strings that form part of pattern_. These are stored
  // separately since StrCat has a limit of 12 args.
  string opening_punctuation_;
  string optional_extn_pattern_;

 public:
  // We use two different reg-ex factories here for performance reasons. RE2 is
  // much faster for smaller reg-ex patterns, but the main pattern cannot be
  // handled by RE2 in an efficient way.
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_;

  // Matches strings that look like publication pages. Example:
  // Computing Complete Answers to Queries in the Presence of Limited Access
  // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
  //
  // The string "211-227 (2003)" is not a telephone number.
  scoped_ptr<const RegExp> pub_pages_;
  // Matches strings that look like dates using "/" as a separator. Examples:
  // 3/10/2011, 31/10/96 or 08/31/95.
  scoped_ptr<const RegExp> slash_separated_dates_;
  // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
  // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
  scoped_ptr<const RegExp> time_stamps_;
  scoped_ptr<const RegExp> time_stamps_suffix_;
  // Pattern to check that brackets match. Opening brackets should be closed
  // within a phone number. This also checks that there is something inside the
  // brackets. Having no brackets at all is also fine.
  scoped_ptr<const RegExp> matching_brackets_;
  // Matches white-space, which may indicate the end of a phone number and the
  // start of something else (such as a neighbouring zip-code). If white-space
  // is found, continues to match all characters that are not typically used to
  // start a phone number.
  scoped_ptr<const RegExp> group_separator_;
  // Compiled from PhoneNumberUtil::kCaptureUpToSecondNumberStart.
  scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
  // Compiled from "(\\d+)": captures a run of one or more digits.
  scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
  // Compiled reg-ex representing lead_class_;
  scoped_ptr<const RegExp> lead_class_pattern_;
  // Phone number pattern allowing optional punctuation.
  scoped_ptr<const RegExp> pattern_;

  // Builds every helper string and compiles every pattern. pattern_ is
  // compiled with regexp_factory_for_pattern_ (always ICU); all other patterns
  // use regexp_factory_, which is RE2 when I18N_PHONENUMBERS_USE_RE2 is
  // defined and ICU otherwise.
  PhoneNumberMatcherRegExps()
      : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(", "[", U+FF08, U+FF3B */),
        closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")", "]", U+FF09, U+FF3D */),
        non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
        bracket_pair_limit_(Limit(0, 3)),
        leading_maybe_matched_bracket_(StrCat(
            "(?:[", opening_parens_, "])?",
            "(?:", non_parens_, "+[", closing_parens_, "])?")),
        bracket_pairs_(StrCat(
            "(?:[", opening_parens_, "]", non_parens_, "+",
            "[", closing_parens_, "])", bracket_pair_limit_)),
        lead_limit_(Limit(0, 2)),
        punctuation_limit_(Limit(0, 4)),
        digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
                           PhoneNumberUtil::kMaxLengthCountryCode),
        block_limit_(Limit(0, digit_block_limit_)),
        punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
                            punctuation_limit_)),
        digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
        lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
        lead_class_(StrCat("[", lead_class_chars_, "]")),
        opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")),
        optional_extn_pattern_(StrCat(
            "(?i)(?:",
            PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
            ")?")),
        regexp_factory_for_pattern_(new ICURegExpFactory()),
#ifdef I18N_PHONENUMBERS_USE_RE2
        regexp_factory_(new RE2RegExpFactory()),
#else
        regexp_factory_(new ICURegExpFactory()),
#endif  // I18N_PHONENUMBERS_USE_RE2
        pub_pages_(regexp_factory_->CreateRegExp(
            "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
        slash_separated_dates_(regexp_factory_->CreateRegExp(
            "(?:(?:[0-3]?\\d/[01]?\\d)|"
            "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
        time_stamps_(regexp_factory_->CreateRegExp(
            "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")),
        time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
        matching_brackets_(regexp_factory_->CreateRegExp(
            StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
                   bracket_pairs_, non_parens_, "*"))),
        group_separator_(regexp_factory_->CreateRegExp(
            StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))),
        capture_up_to_second_number_start_pattern_(
            regexp_factory_->CreateRegExp(
                PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
        capturing_ascii_digits_pattern_(
            regexp_factory_->CreateRegExp("(\\d+)")),
        lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
        pattern_(regexp_factory_for_pattern_->CreateRegExp(
            StrCat("(", opening_punctuation_, lead_limit_,
                   digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
                   block_limit_, optional_extn_pattern_, ")"))) {
  }

 private:
  DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
};
    320 
    321 class AlternateFormats : public Singleton<AlternateFormats> {
    322  public:
    323   PhoneMetadataCollection format_data_;
    324 
    325   map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
    326 
    327   AlternateFormats()
    328       : format_data_(),
    329         calling_code_to_alternate_formats_map_() {
    330     if (!LoadAlternateFormats(&format_data_)) {
    331       LOG(DFATAL) << "Could not parse compiled-in metadata.";
    332       return;
    333     }
    334     for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
    335              format_data_.metadata().begin();
    336          it != format_data_.metadata().end();
    337          ++it) {
    338       calling_code_to_alternate_formats_map_.insert(
    339           make_pair(it->country_code(), &*it));
    340     }
    341   }
    342 
    343   const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
    344       const {
    345     map<int, const PhoneMetadata*>::const_iterator it =
    346         calling_code_to_alternate_formats_map_.find(country_calling_code);
    347     if (it != calling_code_to_alternate_formats_map_.end()) {
    348       return it->second;
    349     }
    350     return NULL;
    351   }
    352 
    353  private:
    354   DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
    355 };
    356 
// Constructs a matcher that searches |text| using the supplied |util|.
// |region_code| is stored as the preferred region handed to the parser,
// |leniency| selects the verification applied to each candidate and
// |max_tries| is the budget that is decremented for each failed attempt.
// The matcher starts in the NOT_READY state with no cached match and the
// search index at 0.
PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
                                       const string& text,
                                       const string& region_code,
                                       PhoneNumberMatcher::Leniency leniency,
                                       int max_tries)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(AlternateFormats::GetInstance()),
      phone_util_(util),
      text_(text),
      preferred_region_(region_code),
      leniency_(leniency),
      max_tries_(max_tries),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
    373 
// Convenience constructor: searches |text| with the PhoneNumberUtil singleton,
// VALID leniency and numeric_limits<int>::max() tries. |region_code| is stored
// as the preferred region. alternate_formats_ is left NULL because it is only
// dereferenced by CheckNumberGroupingIsValid, which VerifyAccordingToLeniency
// calls only for the STRICT_GROUPING and EXACT_GROUPING leniencies.
PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
                                       const string& region_code)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(NULL),  // Not used.
      phone_util_(*PhoneNumberUtil::GetInstance()),
      text_(text),
      preferred_region_(region_code),
      leniency_(VALID),
      max_tries_(numeric_limits<int>::max()),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
    387 
    388 PhoneNumberMatcher::~PhoneNumberMatcher() {
    389 }
    390 
    391 // static
    392 bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
    393   // Combining marks are a subset of non-spacing-mark.
    394   if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
    395     return false;
    396   }
    397   UBlockCode block = ublock_getCode(letter);
    398   return ((block == UBLOCK_BASIC_LATIN) ||
    399       (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
    400       (block == UBLOCK_LATIN_EXTENDED_A) ||
    401       (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
    402       (block == UBLOCK_LATIN_EXTENDED_B) ||
    403       (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
    404 }
    405 
    406 bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
    407                                         PhoneNumberMatch* match) {
    408   DCHECK(match);
    409   // Check the candidate doesn't contain any formatting which would indicate
    410   // that it really isn't a phone number.
    411   if (!reg_exps_->matching_brackets_->FullMatch(candidate)) {
    412     return false;
    413   }
    414 
    415   // If leniency is set to VALID or stricter, we also want to skip numbers that
    416   // are surrounded by Latin alphabetic characters, to skip cases like
    417   // abc8005001234 or 8005001234def.
    418   if (leniency_ >= VALID) {
    419     // If the candidate is not at the start of the text, and does not start with
    420     // phone-number punctuation, check the previous character.
    421     scoped_ptr<RegExpInput> candidate_input(
    422         reg_exps_->regexp_factory_->CreateInput(candidate));
    423     if (offset > 0 &&
    424         !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
    425       char32 previous_char;
    426       const char* previous_char_ptr =
    427           EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
    428                                                 text_.c_str() + offset);
    429       EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
    430       // We return false if it is a latin letter or an invalid punctuation
    431       // symbol.
    432       if (IsInvalidPunctuationSymbol(previous_char) ||
    433           IsLatinLetter(previous_char)) {
    434         return false;
    435       }
    436     }
    437     size_t lastCharIndex = offset + candidate.length();
    438     if (lastCharIndex < text_.length()) {
    439       char32 next_char;
    440       const char* next_char_ptr =
    441           EncodingUtils::AdvanceOneUTF8Character(
    442               text_.c_str() + lastCharIndex - 1);
    443       EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
    444       if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
    445         return false;
    446       }
    447     }
    448   }
    449 
    450   PhoneNumber number;
    451   if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
    452       PhoneNumberUtil::NO_PARSING_ERROR) {
    453     return false;
    454   }
    455   if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
    456     match->set_start(offset);
    457     match->set_raw_string(candidate);
    458     // We used ParseAndKeepRawInput to create this number, but for now we don't
    459     // return the extra values parsed. TODO: stop clearing all values here and
    460     // switch all users over to using raw_input() rather than the raw_string()
    461     // of PhoneNumberMatch.
    462     number.clear_country_code_source();
    463     number.clear_preferred_domestic_carrier_code();
    464     number.clear_raw_input();
    465     match->set_number(number);
    466     return true;
    467   }
    468   return false;
    469 }
    470 
    471 // Helper method to replace the verification method for each enum in the Java
    472 // version.
    473 bool PhoneNumberMatcher::VerifyAccordingToLeniency(
    474     Leniency leniency, const PhoneNumber& number,
    475     const string& candidate) const {
    476   switch (leniency) {
    477     case PhoneNumberMatcher::POSSIBLE:
    478       return phone_util_.IsPossibleNumber(number);
    479     case PhoneNumberMatcher::VALID:
    480       if (!phone_util_.IsValidNumber(number) ||
    481           !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
    482         return false;
    483       }
    484       return IsNationalPrefixPresentIfRequired(number);
    485     case PhoneNumberMatcher::STRICT_GROUPING: {
    486       if (!phone_util_.IsValidNumber(number) ||
    487           !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
    488           ContainsMoreThanOneSlashInNationalNumber(
    489               number, candidate, phone_util_) ||
    490           !IsNationalPrefixPresentIfRequired(number)) {
    491         return false;
    492       }
    493       ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
    494                       const string&, const vector<string>&>* callback =
    495           NewPermanentCallback(&AllNumberGroupsRemainGrouped);
    496       bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
    497       delete(callback);
    498       return is_valid;
    499     }
    500     case PhoneNumberMatcher::EXACT_GROUPING: {
    501       if (!phone_util_.IsValidNumber(number) ||
    502           !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
    503           ContainsMoreThanOneSlashInNationalNumber(
    504               number, candidate, phone_util_) ||
    505           !IsNationalPrefixPresentIfRequired(number)) {
    506         return false;
    507       }
    508       ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
    509                       const string&, const vector<string>&>* callback =
    510           NewPermanentCallback(
    511               this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
    512       bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
    513       delete(callback);
    514       return is_valid;
    515     }
    516     default:
    517       LOG(ERROR) << "No implementation defined for verification for leniency "
    518                  << static_cast<int>(leniency);
    519       return false;
    520   }
    521 }
    522 
    523 bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
    524                                            PhoneNumberMatch* match) {
    525   DCHECK(match);
    526   // Try removing either the first or last "group" in the number and see if this
    527   // gives a result. We consider white space to be a possible indication of
    528   // the start or end of the phone number.
    529   scoped_ptr<RegExpInput> candidate_input(
    530       reg_exps_->regexp_factory_->CreateInput(candidate));
    531   if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
    532                                                   NULL)) {
    533     // Try the first group by itself.
    534     int group_start_index =
    535         candidate.length() - candidate_input->ToString().length();
    536     string first_group_only = candidate.substr(0, group_start_index);
    537     phone_util_.TrimUnwantedEndChars(&first_group_only);
    538     bool success = ParseAndVerify(first_group_only, offset, match);
    539     if (success) {
    540       return true;
    541     }
    542     --max_tries_;
    543 
    544     // Try the rest of the candidate without the first group.
    545     string without_first_group(candidate_input->ToString());
    546     phone_util_.TrimUnwantedEndChars(&without_first_group);
    547     success =
    548         ParseAndVerify(without_first_group, offset + group_start_index, match);
    549     if (success) {
    550       return true;
    551     }
    552     --max_tries_;
    553 
    554     if (max_tries_ > 0) {
    555       while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
    556                                                          NULL)) {
    557         // Find the last group.
    558       }
    559       int last_group_start =
    560           candidate.length() - candidate_input->ToString().length();
    561       string without_last_group = candidate.substr(0, last_group_start);
    562       phone_util_.TrimUnwantedEndChars(&without_last_group);
    563       if (without_last_group == first_group_only) {
    564         // If there are only two groups, then the group "without the last group"
    565         // is the same as the first group. In these cases, we don't want to
    566         // re-check the number group, so we exit already.
    567         return false;
    568       }
    569       success = ParseAndVerify(without_last_group, offset, match);
    570       if (success) {
    571         return true;
    572       }
    573       --max_tries_;
    574     }
    575   }
    576   return false;
    577 }
    578 
    579 bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
    580                                       PhoneNumberMatch* match) {
    581   DCHECK(match);
    582   // Skip a match that is more likely a publication page reference or a date.
    583   if (reg_exps_->pub_pages_->PartialMatch(candidate) ||
    584       reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
    585     return false;
    586   }
    587   // Skip potential time-stamps.
    588   if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
    589     scoped_ptr<RegExpInput> following_text(
    590         reg_exps_->regexp_factory_->CreateInput(
    591             text_.substr(offset + candidate.size())));
    592     if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
    593       return false;
    594     }
    595   }
    596 
    597   // Try to come up with a valid match given the entire candidate.
    598   if (ParseAndVerify(candidate, offset, match)) {
    599     return true;
    600   }
    601 
    602   // If that failed, try to find an "inner match" - there might be a phone
    603   // number within this candidate.
    604   return ExtractInnerMatch(candidate, offset, match);
    605 }
    606 
    607 bool PhoneNumberMatcher::HasNext() {
    608   if (state_ == NOT_READY) {
    609     PhoneNumberMatch temp_match;
    610     if (!Find(search_index_, &temp_match)) {
    611       state_ = DONE;
    612     } else {
    613       last_match_.reset(new PhoneNumberMatch(temp_match.start(),
    614                                              temp_match.raw_string(),
    615                                              temp_match.number()));
    616       search_index_ = last_match_->end();
    617       state_ = READY;
    618     }
    619   }
    620   return state_ == READY;
    621 }
    622 
    623 bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
    624   DCHECK(match);
    625   // Check the state and find the next match as a side-effect if necessary.
    626   if (!HasNext()) {
    627     return false;
    628   }
    629   match->CopyFrom(*last_match_);
    630   state_ = NOT_READY;
    631   last_match_.reset(NULL);
    632   return true;
    633 }
    634 
    635 bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
    636   DCHECK(match);
    637 
    638   scoped_ptr<RegExpInput> text(
    639       reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
    640   string candidate;
    641   while ((max_tries_ > 0) &&
    642          reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
    643     int start = text_.length() - text->ToString().length() - candidate.length();
    644     // Check for extra numbers at the end.
    645     reg_exps_->capture_up_to_second_number_start_pattern_->
    646         PartialMatch(candidate, &candidate);
    647     if (ExtractMatch(candidate, start, match)) {
    648       return true;
    649     }
    650 
    651     index = start + candidate.length();
    652     --max_tries_;
    653   }
    654   return false;
    655 }
    656 
    657 bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
    658     const PhoneNumber& phone_number,
    659     const string& candidate,
    660     ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
    661                     const string&, const vector<string>&>* checker) const {
    662   DCHECK(checker);
    663   // TODO: Evaluate how this works for other locales (testing has been limited
    664   // to NANPA regions) and optimise if necessary.
    665   string normalized_candidate =
    666       NormalizeUTF8::NormalizeDecimalDigits(candidate);
    667   vector<string> formatted_number_groups;
    668   GetNationalNumberGroups(phone_number, NULL,  // Use default formatting pattern
    669                           &formatted_number_groups);
    670   if (checker->Run(phone_util_, phone_number, normalized_candidate,
    671                    formatted_number_groups)) {
    672     return true;
    673   }
    674   // If this didn't pass, see if there are any alternate formats, and try them
    675   // instead.
    676   const PhoneMetadata* alternate_formats =
    677     alternate_formats_->GetAlternateFormatsForCountry(
    678         phone_number.country_code());
    679   if (alternate_formats) {
    680     for (RepeatedPtrField<NumberFormat>::const_iterator it =
    681              alternate_formats->number_format().begin();
    682          it != alternate_formats->number_format().end(); ++it) {
    683       formatted_number_groups.clear();
    684       GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups);
    685       if (checker->Run(phone_util_, phone_number, normalized_candidate,
    686                        formatted_number_groups)) {
    687         return true;
    688       }
    689     }
    690   }
    691   return false;
    692 }
    693 
    694 // Helper method to get the national-number part of a number, formatted without
    695 // any national prefix, and return it as a set of digit blocks that would be
    696 // formatted together.
    697 void PhoneNumberMatcher::GetNationalNumberGroups(
    698     const PhoneNumber& number,
    699     const NumberFormat* formatting_pattern,
    700     vector<string>* digit_blocks) const {
    701   string rfc3966_format;
    702   if (!formatting_pattern) {
    703     // This will be in the format +CC-DG;ext=EXT where DG represents groups of
    704     // digits.
    705     phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
    706     // We remove the extension part from the formatted string before splitting
    707     // it into different groups.
    708     size_t end_index = rfc3966_format.find(';');
    709     if (end_index == string::npos) {
    710       end_index = rfc3966_format.length();
    711     }
    712     // The country-code will have a '-' following it.
    713     size_t start_index = rfc3966_format.find('-') + 1;
    714     SplitStringUsing(rfc3966_format.substr(start_index,
    715                                            end_index - start_index),
    716                      "-", digit_blocks);
    717   } else {
    718     // We format the NSN only, and split that according to the separator.
    719     string national_significant_number;
    720     phone_util_.GetNationalSignificantNumber(number,
    721                                              &national_significant_number);
    722     phone_util_.FormatNsnUsingPattern(national_significant_number,
    723                                       *formatting_pattern,
    724                                       PhoneNumberUtil::RFC3966,
    725                                       &rfc3966_format);
    726     SplitStringUsing(rfc3966_format, "-", digit_blocks);
    727   }
    728 }
    729 
    730 bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
    731     const PhoneNumber& number) const {
    732   // First, check how we deduced the country code. If it was written in
    733   // international format, then the national prefix is not required.
    734   if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
    735     return true;
    736   }
    737   string phone_number_region;
    738   phone_util_.GetRegionCodeForCountryCode(
    739       number.country_code(), &phone_number_region);
    740   const PhoneMetadata* metadata =
    741       phone_util_.GetMetadataForRegion(phone_number_region);
    742   if (!metadata) {
    743     return true;
    744   }
    745   // Check if a national prefix should be present when formatting this number.
    746   string national_number;
    747   phone_util_.GetNationalSignificantNumber(number, &national_number);
    748   const NumberFormat* format_rule =
    749       phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
    750                                                    national_number);
    751   // To do this, we check that a national prefix formatting rule was present and
    752   // that it wasn't just the first-group symbol ($1) with punctuation.
    753   if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
    754     if (format_rule->national_prefix_optional_when_formatting()) {
    755       // The national-prefix is optional in these cases, so we don't need to
    756       // check if it was present.
    757       return true;
    758     }
    759     if (phone_util_.FormattingRuleHasFirstGroupOnly(
    760         format_rule->national_prefix_formatting_rule())) {
    761       // National Prefix not needed for this number.
    762       return true;
    763     }
    764     // Normalize the remainder.
    765     string raw_input_copy(number.raw_input());
    766     // Check if we found a national prefix and/or carrier code at the start of
    767     // the raw input, and return the result.
    768     phone_util_.NormalizeDigitsOnly(&raw_input_copy);
    769     return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
    770         *metadata,
    771         &raw_input_copy,
    772         NULL);  // Don't need to keep the stripped carrier code.
    773   }
    774   return true;
    775 }
    776 
    777 bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
    778     const PhoneNumberUtil& util,
    779     const PhoneNumber& phone_number,
    780     const string& normalized_candidate,
    781     const vector<string>& formatted_number_groups) const {
    782     const scoped_ptr<RegExpInput> candidate_number(
    783         reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
    784   vector<string> candidate_groups;
    785   string digit_block;
    786   while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
    787              candidate_number.get(),
    788              &digit_block)) {
    789     candidate_groups.push_back(digit_block);
    790   }
    791 
    792   // Set this to the last group, skipping it if the number has an extension.
    793   int candidate_number_group_index =
    794       phone_number.has_extension() ? candidate_groups.size() - 2
    795                                    : candidate_groups.size() - 1;
    796   // First we check if the national significant number is formatted as a block.
    797   // We use find and not equals, since the national significant number may be
    798   // present with a prefix such as a national number prefix, or the country code
    799   // itself.
    800   string national_significant_number;
    801   util.GetNationalSignificantNumber(phone_number,
    802                                     &national_significant_number);
    803   if (candidate_groups.size() == 1 ||
    804       candidate_groups.at(candidate_number_group_index).find(
    805           national_significant_number) != string::npos) {
    806     return true;
    807   }
    808   // Starting from the end, go through in reverse, excluding the first group,
    809   // and check the candidate and number groups are the same.
    810   for (int formatted_number_group_index =
    811            (formatted_number_groups.size() - 1);
    812        formatted_number_group_index > 0 &&
    813        candidate_number_group_index >= 0;
    814        --formatted_number_group_index, --candidate_number_group_index) {
    815     if (candidate_groups.at(candidate_number_group_index) !=
    816         formatted_number_groups.at(formatted_number_group_index)) {
    817       return false;
    818     }
    819   }
    820   // Now check the first group. There may be a national prefix at the start, so
    821   // we only check that the candidate group ends with the formatted number
    822   // group.
    823   return (candidate_number_group_index >= 0 &&
    824           HasSuffixString(candidate_groups.at(candidate_number_group_index),
    825                           formatted_number_groups.at(0)));
    826 }
    827 
    828 // static
    829 bool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber(
    830     const PhoneNumber& number,
    831     const string& candidate,
    832     const PhoneNumberUtil& util) {
    833   size_t first_slash_in_body = candidate.find('/');
    834   if (first_slash_in_body == string::npos) {
    835     // No slashes, this is okay.
    836     return false;
    837   }
    838   // Now look for a second one.
    839   size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1);
    840   if (second_slash_in_body == string::npos) {
    841     // Only one slash, this is okay.
    842     return false;
    843   }
    844 
    845   // If the first slash is after the country calling code, this is permitted.
    846   if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN ||
    847       number.country_code_source() ==
    848           PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) {
    849     string normalized_country_code =
    850         candidate.substr(0, first_slash_in_body);
    851     util.NormalizeDigitsOnly(&normalized_country_code);
    852     if (normalized_country_code == SimpleItoa(number.country_code())) {
    853       // Any more slashes and this is illegal.
    854       return candidate.find('/', second_slash_in_body + 1) != string::npos;
    855     }
    856   }
    857   return true;
    858 }
    859 
    860 }  // namespace phonenumbers
    861 }  // namespace i18n
    862