Home | History | Annotate | Download | only in phonenumbers
      1 // Copyright (C) 2011 The Libphonenumber Authors
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 //
     15 // Author: Lara Rennie
     16 // Author: Tao Huang
     17 //
     18 // Implementation of a stateful class that finds and extracts telephone numbers
     19 // from text.
     20 
     21 #include "phonenumbers/phonenumbermatcher.h"
     22 
     23 #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
     24 #error phonenumbermatcher depends on ICU \
     25     (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
     26 #endif  // I18N_PHONENUMBERS_USE_ICU_REGEXP
     27 
     28 #include <ctype.h>
     29 #include <iostream>
     30 #include <limits>
     31 #include <map>
     32 #include <stddef.h>
     33 #include <string>
     34 #include <utility>
     35 #include <vector>
     36 
     37 #include <unicode/uchar.h>
     38 
     39 #include "phonenumbers/alternate_format.h"
     40 #include "phonenumbers/base/logging.h"
     41 #include "phonenumbers/base/memory/scoped_ptr.h"
     42 #include "phonenumbers/base/memory/singleton.h"
     43 #include "phonenumbers/callback.h"
     44 #include "phonenumbers/default_logger.h"
     45 #include "phonenumbers/encoding_utils.h"
     46 #include "phonenumbers/normalize_utf8.h"
     47 #include "phonenumbers/phonemetadata.pb.h"
     48 #include "phonenumbers/phonenumber.pb.h"
     49 #include "phonenumbers/phonenumbermatch.h"
     50 #include "phonenumbers/phonenumberutil.h"
     51 #include "phonenumbers/regexp_adapter.h"
     52 #include "phonenumbers/regexp_adapter_icu.h"
     53 #include "phonenumbers/stringutil.h"
     54 
     55 #ifdef I18N_PHONENUMBERS_USE_RE2
     56 #include "phonenumbers/regexp_adapter_re2.h"
     57 #endif  // I18N_PHONENUMBERS_USE_RE2
     58 
     59 using std::cerr;
     60 using std::endl;
     61 using std::make_pair;
     62 using std::map;
     63 using std::numeric_limits;
     64 using std::string;
     65 using std::vector;
     66 
     67 namespace i18n {
     68 namespace phonenumbers {
     69 
     70 namespace {
     71 // Returns a regular expression quantifier with an upper and lower limit.
     72 string Limit(int lower, int upper) {
     73   DCHECK_GE(lower, 0);
     74   DCHECK_GT(upper, 0);
     75   DCHECK_LT(lower, upper);
     76   return StrCat("{", lower, ",", upper, "}");
     77 }
     78 
     79 bool IsInvalidPunctuationSymbol(char32 character) {
     80   return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
     81 }
     82 
     83 bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
     84                              const PhoneNumberUtil& util) {
     85   // The characters 'x' and 'X' can be (1) a carrier code, in which case they
     86   // always precede the national significant number or (2) an extension sign,
     87   // in which case they always precede the extension number. We assume a
     88   // carrier code is more than 1 digit, so the first case has to have more than
     89   // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
     90   // 'x' or 'X'.
     91   size_t found;
     92   found = candidate.find_first_of("xX");
     93   // We ignore the character if 'x' or 'X' appears as the last character of
     94   // the string.
     95   while (found != string::npos && found < candidate.length() - 1) {
     96     // We only look for 'x' or 'X' in ASCII form.
     97     char next_char = candidate[found + 1];
     98     if (next_char == 'x' || next_char == 'X') {
     99       // This is the carrier code case, in which the 'X's always precede the
    100       // national significant number.
    101       ++found;
    102       if (util.IsNumberMatchWithOneString(
    103               number, candidate.substr(found, candidate.length() - found))
    104           != PhoneNumberUtil::NSN_MATCH) {
    105         return false;
    106       }
    107     } else {
    108       string normalized_extension(candidate.substr(found,
    109                                                    candidate.length() - found));
    110       util.NormalizeDigitsOnly(&normalized_extension);
    111       if (normalized_extension != number.extension()) {
    112         return false;
    113       }
    114     }
    115     found = candidate.find_first_of("xX", found + 1);
    116   }
    117   return true;
    118 }
    119 
    120 bool AllNumberGroupsRemainGrouped(
    121     const PhoneNumberUtil& util,
    122     const PhoneNumber& phone_number,
    123     const string& normalized_candidate,
    124     const vector<string>& formatted_number_groups) {
    125   size_t from_index = 0;
    126   // Check each group of consecutive digits are not broken into separate
    127   // groupings in the normalized_candidate string.
    128   for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
    129     // Fails if the substring of normalized_candidate starting from from_index
    130     // doesn't contain the consecutive digits in formatted_number_groups.at(i).
    131     from_index = normalized_candidate.find(formatted_number_groups.at(i),
    132                                            from_index);
    133     if (from_index == string::npos) {
    134       return false;
    135     }
    136     // Moves from_index forward.
    137     from_index += formatted_number_groups.at(i).length();
    138     if (i == 0 && from_index < normalized_candidate.length()) {
    139       // We are at the position right after the NDC. Note although
    140       // normalized_candidate might contain non-ASCII formatting characters,
    141       // they won't be treated as ASCII digits when converted to a char.
    142       if (isdigit(normalized_candidate.at(from_index))) {
    143         // This means there is no formatting symbol after the NDC. In this case,
    144         // we only accept the number if there is no formatting symbol at all in
    145         // the number, except for extensions.
    146         string national_significant_number;
    147         util.GetNationalSignificantNumber(
    148             phone_number, &national_significant_number);
    149         return HasPrefixString(normalized_candidate.substr(
    150             from_index - formatted_number_groups.at(i).length()),
    151             national_significant_number);
    152         }
    153       }
    154     }
    155     // The check here makes sure that we haven't mistakenly already used the
    156     // extension to match the last group of the subscriber number. Note the
    157     // extension cannot have formatting in-between digits.
    158     return normalized_candidate.substr(from_index)
    159         .find(phone_number.extension()) != string::npos;
    160 }
    161 
    162 bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
    163 #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
    164   if (!alternate_formats->ParseFromArray(alternate_format_get(),
    165                                          alternate_format_size())) {
    166     cerr << "Could not parse binary data." << endl;
    167     return false;
    168   }
    169   return true;
    170 #else
    171   return false;
    172 #endif
    173 }
    174 
    175 }  // namespace
    176 
// Holds every regular expression (and the pattern fragments they are built
// from) used by PhoneNumberMatcher. Derives from Singleton<> and is obtained
// through PhoneNumberMatcherRegExps::GetInstance().
//
// The string fragments are declared before the compiled RegExp members, and
// the constructor's initializer list builds later members out of earlier
// ones. C++ initializes members in declaration order, so the declaration
// order below must not be changed without revisiting the initializer list.
class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
 private:
  friend class Singleton<PhoneNumberMatcherRegExps>;

  // Characters accepted as opening / closing brackets, written so they can be
  // placed inside a regex character class.
  string opening_parens_;
  string closing_parens_;
  // Character class matching any single character that is not a bracket.
  string non_parens_;
  // Limit on the number of pairs of brackets in a phone number.
  string bracket_pair_limit_;
  // Helper strings for the matching_brackets_ pattern.
  // An opening bracket at the beginning may not be closed, but subsequent ones
  // should be. It's also possible that the leading bracket was dropped, so we
  // shouldn't be surprised if we see a closing bracket first.
  string leading_maybe_matched_bracket_;
  string bracket_pairs_;
  // Limit on the number of leading (plus) characters.
  string lead_limit_;
  // Limit on the number of consecutive punctuation characters.
  string punctuation_limit_;
  // The maximum number of digits allowed in a digit-separated block. As we
  // allow all digits in a single block, this should be set high enough to
  // accommodate the entire national number and the international country code.
  int digit_block_limit_;
  // Limit on the number of blocks separated by punctuation. Uses
  // digit_block_limit_ since some formats use spaces to separate each digit.
  string block_limit_;
  // A punctuation sequence allowing white space.
  string punctuation_;
  // A digits block without punctuation.
  string digit_sequence_;
  // Punctuation that may be at the start of a phone number - brackets and plus
  // signs.
  string lead_class_chars_;
  // Same as lead_class_chars_, but enclosed as a character class.
  string lead_class_;
  // Extra helper strings that form part of pattern_. These are stored
  // separately since StrCat has a limit of 12 args.
  string opening_punctuation_;
  string optional_extn_pattern_;

 public:
  // We use two different reg-ex factories here for performance reasons. RE2 is
  // much faster for smaller reg-ex patterns, but the main pattern cannot be
  // handled by RE2 in an efficient way.
  // regexp_factory_for_pattern_ is always ICU; regexp_factory_ is RE2 when
  // I18N_PHONENUMBERS_USE_RE2 is defined and ICU otherwise.
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_;

  // Matches strings that look like publication pages. Example:
  // Computing Complete Answers to Queries in the Presence of Limited Access
  // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
  //
  // The string "211-227 (2003)" is not a telephone number.
  scoped_ptr<const RegExp> pub_pages_;
  // Matches strings that look like dates using "/" as a separator. Examples:
  // 3/10/2011, 31/10/96 or 08/31/95.
  scoped_ptr<const RegExp> slash_separated_dates_;
  // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
  // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
  scoped_ptr<const RegExp> time_stamps_;
  scoped_ptr<const RegExp> time_stamps_suffix_;
  // Pattern to check that brackets match. Opening brackets should be closed
  // within a phone number. This also checks that there is something inside the
  // brackets. Having no brackets at all is also fine.
  scoped_ptr<const RegExp> matching_brackets_;
  // Matches white-space, which may indicate the end of a phone number and the
  // start of something else (such as a neighbouring zip-code). If white-space
  // is found, continues to match all characters that are not typically used to
  // start a phone number.
  scoped_ptr<const RegExp> group_separator_;
  // Compiled from PhoneNumberUtil::kCaptureUpToSecondNumberStart; used to cut
  // a candidate off where a second number appears to start.
  scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
  // Captures a run of one or more digits ("(\d+)").
  scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
  // Compiled reg-ex representing lead_class_;
  scoped_ptr<const RegExp> lead_class_pattern_;
  // Phone number pattern allowing optional punctuation.
  scoped_ptr<const RegExp> pattern_;

  // Builds all pattern fragments and compiles every regular expression.
  // Initializers appear in member declaration order; each one may only use
  // members that are declared (and therefore initialized) before it.
  PhoneNumberMatcherRegExps()
      // ASCII '(' and '[' followed by the UTF-8 bytes of the fullwidth forms
      // U+FF08 and U+FF3B.
      : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB"),
        // ASCII ')' and ']' followed by the UTF-8 bytes of the fullwidth forms
        // U+FF09 and U+FF3D.
        closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD"),
        non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
        bracket_pair_limit_(Limit(0, 3)),
        leading_maybe_matched_bracket_(StrCat(
            "(?:[", opening_parens_, "])?",
            "(?:", non_parens_, "+[", closing_parens_, "])?")),
        bracket_pairs_(StrCat(
            "(?:[", opening_parens_, "]", non_parens_, "+",
            "[", closing_parens_, "])", bracket_pair_limit_)),
        lead_limit_(Limit(0, 2)),
        punctuation_limit_(Limit(0, 4)),
        digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
                           PhoneNumberUtil::kMaxLengthCountryCode),
        block_limit_(Limit(0, digit_block_limit_)),
        punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
                            punctuation_limit_)),
        digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
        lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
        lead_class_(StrCat("[", lead_class_chars_, "]")),
        opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")),
        // Case-insensitive, optional extension suffix.
        optional_extn_pattern_(StrCat(
            "(?i)(?:",
            PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
            ")?")),
        regexp_factory_for_pattern_(new ICURegExpFactory()),
#ifdef I18N_PHONENUMBERS_USE_RE2
        regexp_factory_(new RE2RegExpFactory()),
#else
        regexp_factory_(new ICURegExpFactory()),
#endif  // I18N_PHONENUMBERS_USE_RE2
        pub_pages_(regexp_factory_->CreateRegExp(
            "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
        slash_separated_dates_(regexp_factory_->CreateRegExp(
            "(?:(?:[0-3]?\\d/[01]?\\d)|"
            "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
        time_stamps_(regexp_factory_->CreateRegExp(
            "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")),
        time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
        matching_brackets_(regexp_factory_->CreateRegExp(
            StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
                   bracket_pairs_, non_parens_, "*"))),
        group_separator_(regexp_factory_->CreateRegExp(
            StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))),
        capture_up_to_second_number_start_pattern_(
            regexp_factory_->CreateRegExp(
                PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
        capturing_ascii_digits_pattern_(
            regexp_factory_->CreateRegExp("(\\d+)")),
        lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
        // The main pattern is compiled with the ICU factory (see the comment
        // on the factory members above).
        pattern_(regexp_factory_for_pattern_->CreateRegExp(
            StrCat("(", opening_punctuation_, lead_limit_,
                   digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
                   block_limit_, optional_extn_pattern_, ")"))) {
  }

 private:
  DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
};
    313 
    314 class AlternateFormats : public Singleton<AlternateFormats> {
    315  public:
    316   PhoneMetadataCollection format_data_;
    317 
    318   map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
    319 
    320   AlternateFormats()
    321       : format_data_(),
    322         calling_code_to_alternate_formats_map_() {
    323     if (!LoadAlternateFormats(&format_data_)) {
    324       LOG(DFATAL) << "Could not parse compiled-in metadata.";
    325       return;
    326     }
    327     for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
    328              format_data_.metadata().begin();
    329          it != format_data_.metadata().end();
    330          ++it) {
    331       calling_code_to_alternate_formats_map_.insert(
    332           make_pair(it->country_code(), &*it));
    333     }
    334   }
    335 
    336   const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
    337       const {
    338     map<int, const PhoneMetadata*>::const_iterator it =
    339         calling_code_to_alternate_formats_map_.find(country_calling_code);
    340     if (it != calling_code_to_alternate_formats_map_.end()) {
    341       return it->second;
    342     }
    343     return NULL;
    344   }
    345 
    346  private:
    347   DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
    348 };
    349 
// Constructs a matcher over text with an explicit util, preferred region,
// leniency and try budget. Both singletons (regular expressions and alternate
// formats) are looked up here. The matcher starts in state NOT_READY with no
// cached match and a search index of 0.
PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
                                       const string& text,
                                       const string& region_code,
                                       PhoneNumberMatcher::Leniency leniency,
                                       int max_tries)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(AlternateFormats::GetInstance()),
      phone_util_(util),
      text_(text),
      preferred_region_(region_code),
      leniency_(leniency),
      max_tries_(max_tries),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
    366 
// Convenience constructor: uses PhoneNumberUtil::GetInstance(), leniency
// VALID and a try budget of numeric_limits<int>::max(). The matcher starts in
// state NOT_READY with no cached match and a search index of 0.
PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
                                       const string& region_code)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      // Left NULL: with leniency VALID, VerifyAccordingToLeniency never
      // reaches the grouping check that reads the alternate formats.
      alternate_formats_(NULL),  // Not used.
      phone_util_(*PhoneNumberUtil::GetInstance()),
      text_(text),
      preferred_region_(region_code),
      leniency_(VALID),
      max_tries_(numeric_limits<int>::max()),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
    380 
// Destructor. The body is intentionally empty; no explicit cleanup is
// performed here.
PhoneNumberMatcher::~PhoneNumberMatcher() {
}
    383 
    384 // static
    385 bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
    386   // Combining marks are a subset of non-spacing-mark.
    387   if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
    388     return false;
    389   }
    390   UBlockCode block = ublock_getCode(letter);
    391   return ((block == UBLOCK_BASIC_LATIN) ||
    392       (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
    393       (block == UBLOCK_LATIN_EXTENDED_A) ||
    394       (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
    395       (block == UBLOCK_LATIN_EXTENDED_B) ||
    396       (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
    397 }
    398 
    399 bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
    400                                         PhoneNumberMatch* match) {
    401   DCHECK(match);
    402   // Check the candidate doesn't contain any formatting which would indicate
    403   // that it really isn't a phone number.
    404   if (!reg_exps_->matching_brackets_->FullMatch(candidate)) {
    405     return false;
    406   }
    407 
    408   // If leniency is set to VALID or stricter, we also want to skip numbers that
    409   // are surrounded by Latin alphabetic characters, to skip cases like
    410   // abc8005001234 or 8005001234def.
    411   if (leniency_ >= VALID) {
    412     // If the candidate is not at the start of the text, and does not start with
    413     // phone-number punctuation, check the previous character.
    414     scoped_ptr<RegExpInput> candidate_input(
    415         reg_exps_->regexp_factory_->CreateInput(candidate));
    416     if (offset > 0 &&
    417         !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
    418       char32 previous_char;
    419       const char* previous_char_ptr =
    420           EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
    421                                                 text_.c_str() + offset);
    422       EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
    423       // We return false if it is a latin letter or an invalid punctuation
    424       // symbol.
    425       if (IsInvalidPunctuationSymbol(previous_char) ||
    426           IsLatinLetter(previous_char)) {
    427         return false;
    428       }
    429     }
    430     size_t lastCharIndex = offset + candidate.length();
    431     if (lastCharIndex < text_.length()) {
    432       char32 next_char;
    433       const char* next_char_ptr =
    434           EncodingUtils::AdvanceOneUTF8Character(
    435               text_.c_str() + lastCharIndex - 1);
    436       EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
    437       if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
    438         return false;
    439       }
    440     }
    441   }
    442 
    443   PhoneNumber number;
    444   if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
    445       PhoneNumberUtil::NO_PARSING_ERROR) {
    446     return false;
    447   }
    448   if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
    449     match->set_start(offset);
    450     match->set_raw_string(candidate);
    451     // We used ParseAndKeepRawInput to create this number, but for now we don't
    452     // return the extra values parsed. TODO: stop clearing all values here and
    453     // switch all users over to using raw_input() rather than the raw_string()
    454     // of PhoneNumberMatch.
    455     number.clear_country_code_source();
    456     number.clear_preferred_domestic_carrier_code();
    457     number.clear_raw_input();
    458     match->set_number(number);
    459     return true;
    460   }
    461   return false;
    462 }
    463 
    464 // Helper method to replace the verification method for each enum in the Java
    465 // version.
    466 bool PhoneNumberMatcher::VerifyAccordingToLeniency(
    467     Leniency leniency, const PhoneNumber& number,
    468     const string& candidate) const {
    469   switch (leniency) {
    470     case PhoneNumberMatcher::POSSIBLE:
    471       return phone_util_.IsPossibleNumber(number);
    472     case PhoneNumberMatcher::VALID:
    473       if (!phone_util_.IsValidNumber(number) ||
    474           !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
    475         return false;
    476       }
    477       return IsNationalPrefixPresentIfRequired(number);
    478     case PhoneNumberMatcher::STRICT_GROUPING: {
    479       if (!phone_util_.IsValidNumber(number) ||
    480           !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
    481           // Two or more slashes were present.
    482           (FindNth(candidate, '/', 2) != string::npos) ||
    483           !IsNationalPrefixPresentIfRequired(number)) {
    484         return false;
    485       }
    486       ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
    487                       const string&, const vector<string>&>* callback =
    488           NewPermanentCallback(&AllNumberGroupsRemainGrouped);
    489       bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
    490       delete(callback);
    491       return is_valid;
    492     }
    493     case PhoneNumberMatcher::EXACT_GROUPING: {
    494       if (!phone_util_.IsValidNumber(number) ||
    495           !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
    496           // Two or more slashes were present.
    497           (FindNth(candidate, '/', 2) != string::npos) ||
    498           !IsNationalPrefixPresentIfRequired(number)) {
    499         return false;
    500       }
    501       ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
    502                       const string&, const vector<string>&>* callback =
    503           NewPermanentCallback(
    504               this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
    505       bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
    506       delete(callback);
    507       return is_valid;
    508     }
    509     default:
    510       LOG(ERROR) << "No implementation defined for verification for leniency "
    511                  << static_cast<int>(leniency);
    512       return false;
    513   }
    514 }
    515 
    516 bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
    517                                            PhoneNumberMatch* match) {
    518   DCHECK(match);
    519   // Try removing either the first or last "group" in the number and see if this
    520   // gives a result. We consider white space to be a possible indication of
    521   // the start or end of the phone number.
    522   scoped_ptr<RegExpInput> candidate_input(
    523       reg_exps_->regexp_factory_->CreateInput(candidate));
    524   if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
    525                                                   NULL)) {
    526     // Try the first group by itself.
    527     int group_start_index =
    528         candidate.length() - candidate_input->ToString().length();
    529     string first_group_only = candidate.substr(0, group_start_index);
    530     phone_util_.TrimUnwantedEndChars(&first_group_only);
    531     bool success = ParseAndVerify(first_group_only, offset, match);
    532     if (success) {
    533       return true;
    534     }
    535     --max_tries_;
    536 
    537     // Try the rest of the candidate without the first group.
    538     string without_first_group(candidate_input->ToString());
    539     phone_util_.TrimUnwantedEndChars(&without_first_group);
    540     success =
    541         ParseAndVerify(without_first_group, offset + group_start_index, match);
    542     if (success) {
    543       return true;
    544     }
    545     --max_tries_;
    546 
    547     if (max_tries_ > 0) {
    548       while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
    549                                                          NULL)) {
    550         // Find the last group.
    551       }
    552       int last_group_start =
    553           candidate.length() - candidate_input->ToString().length();
    554       string without_last_group = candidate.substr(0, last_group_start);
    555       phone_util_.TrimUnwantedEndChars(&without_last_group);
    556       if (without_last_group == first_group_only) {
    557         // If there are only two groups, then the group "without the last group"
    558         // is the same as the first group. In these cases, we don't want to
    559         // re-check the number group, so we exit already.
    560         return false;
    561       }
    562       success = ParseAndVerify(without_last_group, offset, match);
    563       if (success) {
    564         return true;
    565       }
    566       --max_tries_;
    567     }
    568   }
    569   return false;
    570 }
    571 
    572 bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
    573                                       PhoneNumberMatch* match) {
    574   DCHECK(match);
    575   // Skip a match that is more likely a publication page reference or a date.
    576   if (reg_exps_->pub_pages_->PartialMatch(candidate) ||
    577       reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
    578     return false;
    579   }
    580   // Skip potential time-stamps.
    581   if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
    582     scoped_ptr<RegExpInput> following_text(
    583         reg_exps_->regexp_factory_->CreateInput(
    584             text_.substr(offset + candidate.size())));
    585     if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
    586       return false;
    587     }
    588   }
    589 
    590   // Try to come up with a valid match given the entire candidate.
    591   if (ParseAndVerify(candidate, offset, match)) {
    592     return true;
    593   }
    594 
    595   // If that failed, try to find an "inner match" - there might be a phone
    596   // number within this candidate.
    597   return ExtractInnerMatch(candidate, offset, match);
    598 }
    599 
    600 bool PhoneNumberMatcher::HasNext() {
    601   if (state_ == NOT_READY) {
    602     PhoneNumberMatch temp_match;
    603     if (!Find(search_index_, &temp_match)) {
    604       state_ = DONE;
    605     } else {
    606       last_match_.reset(new PhoneNumberMatch(temp_match.start(),
    607                                              temp_match.raw_string(),
    608                                              temp_match.number()));
    609       search_index_ = last_match_->end();
    610       state_ = READY;
    611     }
    612   }
    613   return state_ == READY;
    614 }
    615 
    616 bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
    617   DCHECK(match);
    618   // Check the state and find the next match as a side-effect if necessary.
    619   if (!HasNext()) {
    620     return false;
    621   }
    622   match->CopyFrom(*last_match_);
    623   state_ = NOT_READY;
    624   last_match_.reset(NULL);
    625   return true;
    626 }
    627 
    628 bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
    629   DCHECK(match);
    630 
    631   scoped_ptr<RegExpInput> text(
    632       reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
    633   string candidate;
    634   while ((max_tries_ > 0) &&
    635          reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
    636     int start = text_.length() - text->ToString().length() - candidate.length();
    637     // Check for extra numbers at the end.
    638     reg_exps_->capture_up_to_second_number_start_pattern_->
    639         PartialMatch(candidate, &candidate);
    640     if (ExtractMatch(candidate, start, match)) {
    641       return true;
    642     }
    643 
    644     index = start + candidate.length();
    645     --max_tries_;
    646   }
    647   return false;
    648 }
    649 
    650 bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
    651     const PhoneNumber& phone_number,
    652     const string& candidate,
    653     ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
    654                     const string&, const vector<string>&>* checker) const {
    655   DCHECK(checker);
    656   // TODO: Evaluate how this works for other locales (testing has been limited
    657   // to NANPA regions) and optimise if necessary.
    658   string normalized_candidate =
    659       NormalizeUTF8::NormalizeDecimalDigits(candidate);
    660   vector<string> formatted_number_groups;
    661   GetNationalNumberGroups(phone_number, NULL,  // Use default formatting pattern
    662                           &formatted_number_groups);
    663   if (checker->Run(phone_util_, phone_number, normalized_candidate,
    664                    formatted_number_groups)) {
    665     return true;
    666   }
    667   // If this didn't pass, see if there are any alternate formats, and try them
    668   // instead.
    669   const PhoneMetadata* alternate_formats =
    670     alternate_formats_->GetAlternateFormatsForCountry(
    671         phone_number.country_code());
    672   if (alternate_formats) {
    673     for (RepeatedPtrField<NumberFormat>::const_iterator it =
    674              alternate_formats->number_format().begin();
    675          it != alternate_formats->number_format().end(); ++it) {
    676       formatted_number_groups.clear();
    677       GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups);
    678       if (checker->Run(phone_util_, phone_number, normalized_candidate,
    679                        formatted_number_groups)) {
    680         return true;
    681       }
    682     }
    683   }
    684   return false;
    685 }
    686 
    687 // Helper method to get the national-number part of a number, formatted without
    688 // any national prefix, and return it as a set of digit blocks that would be
    689 // formatted together.
    690 void PhoneNumberMatcher::GetNationalNumberGroups(
    691     const PhoneNumber& number,
    692     const NumberFormat* formatting_pattern,
    693     vector<string>* digit_blocks) const {
    694   string rfc3966_format;
    695   if (!formatting_pattern) {
    696     // This will be in the format +CC-DG;ext=EXT where DG represents groups of
    697     // digits.
    698     phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
    699     // We remove the extension part from the formatted string before splitting
    700     // it into different groups.
    701     size_t end_index = rfc3966_format.find(';');
    702     if (end_index == string::npos) {
    703       end_index = rfc3966_format.length();
    704     }
    705     // The country-code will have a '-' following it.
    706     size_t start_index = rfc3966_format.find('-') + 1;
    707     SplitStringUsing(rfc3966_format.substr(start_index,
    708                                            end_index - start_index),
    709                      "-", digit_blocks);
    710   } else {
    711     // We format the NSN only, and split that according to the separator.
    712     string national_significant_number;
    713     phone_util_.GetNationalSignificantNumber(number,
    714                                              &national_significant_number);
    715     phone_util_.FormatNsnUsingPattern(national_significant_number,
    716                                       *formatting_pattern,
    717                                       PhoneNumberUtil::RFC3966,
    718                                       &rfc3966_format);
    719     SplitStringUsing(rfc3966_format, "-", digit_blocks);
    720   }
    721 }
    722 
    723 bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
    724     const PhoneNumber& number) const {
    725   // First, check how we deduced the country code. If it was written in
    726   // international format, then the national prefix is not required.
    727   if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
    728     return true;
    729   }
    730   string phone_number_region;
    731   phone_util_.GetRegionCodeForCountryCode(
    732       number.country_code(), &phone_number_region);
    733   const PhoneMetadata* metadata =
    734       phone_util_.GetMetadataForRegion(phone_number_region);
    735   if (!metadata) {
    736     return true;
    737   }
    738   // Check if a national prefix should be present when formatting this number.
    739   string national_number;
    740   phone_util_.GetNationalSignificantNumber(number, &national_number);
    741   const NumberFormat* format_rule =
    742       phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
    743                                                    national_number);
    744   // To do this, we check that a national prefix formatting rule was present and
    745   // that it wasn't just the first-group symbol ($1) with punctuation.
    746   if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
    747     if (format_rule->national_prefix_optional_when_formatting()) {
    748       // The national-prefix is optional in these cases, so we don't need to
    749       // check if it was present.
    750       return true;
    751     }
    752     if (phone_util_.FormattingRuleHasFirstGroupOnly(
    753         format_rule->national_prefix_formatting_rule())) {
    754       // National Prefix not needed for this number.
    755       return true;
    756     }
    757     // Normalize the remainder.
    758     string raw_input_copy(number.raw_input());
    759     // Check if we found a national prefix and/or carrier code at the start of
    760     // the raw input, and return the result.
    761     phone_util_.NormalizeDigitsOnly(&raw_input_copy);
    762     return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
    763         *metadata,
    764         &raw_input_copy,
    765         NULL);  // Don't need to keep the stripped carrier code.
    766   }
    767   return true;
    768 }
    769 
    770 bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
    771     const PhoneNumberUtil& util,
    772     const PhoneNumber& phone_number,
    773     const string& normalized_candidate,
    774     const vector<string>& formatted_number_groups) const {
    775     const scoped_ptr<RegExpInput> candidate_number(
    776         reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
    777   vector<string> candidate_groups;
    778   string digit_block;
    779   while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
    780              candidate_number.get(),
    781              &digit_block)) {
    782     candidate_groups.push_back(digit_block);
    783   }
    784 
    785   // Set this to the last group, skipping it if the number has an extension.
    786   int candidate_number_group_index =
    787       phone_number.has_extension() ? candidate_groups.size() - 2
    788                                    : candidate_groups.size() - 1;
    789   // First we check if the national significant number is formatted as a block.
    790   // We use find and not equals, since the national significant number may be
    791   // present with a prefix such as a national number prefix, or the country code
    792   // itself.
    793   string national_significant_number;
    794   util.GetNationalSignificantNumber(phone_number,
    795                                     &national_significant_number);
    796   if (candidate_groups.size() == 1 ||
    797       candidate_groups.at(candidate_number_group_index).find(
    798           national_significant_number) != string::npos) {
    799     return true;
    800   }
    801   // Starting from the end, go through in reverse, excluding the first group,
    802   // and check the candidate and number groups are the same.
    803   for (int formatted_number_group_index =
    804            (formatted_number_groups.size() - 1);
    805        formatted_number_group_index > 0 &&
    806        candidate_number_group_index >= 0;
    807        --formatted_number_group_index, --candidate_number_group_index) {
    808     if (candidate_groups.at(candidate_number_group_index) !=
    809         formatted_number_groups.at(formatted_number_group_index)) {
    810       return false;
    811     }
    812   }
    813   // Now check the first group. There may be a national prefix at the start, so
    814   // we only check that the candidate group ends with the formatted number
    815   // group.
    816   return (candidate_number_group_index >= 0 &&
    817           HasSuffixString(candidate_groups.at(candidate_number_group_index),
    818                           formatted_number_groups.at(0)));
    819 }
    820 
    821 }  // namespace phonenumbers
    822 }  // namespace i18n
    823