1 // Copyright (C) 2011 The Libphonenumber Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Author: Lara Rennie 16 // Author: Tao Huang 17 // 18 // Implementation of a stateful class that finds and extracts telephone numbers 19 // from text. 20 21 #include "phonenumbers/phonenumbermatcher.h" 22 23 #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP 24 #error phonenumbermatcher depends on ICU \ 25 (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set) 26 #endif // I18N_PHONENUMBERS_USE_ICU_REGEXP 27 28 #include <ctype.h> 29 #include <iostream> 30 #include <limits> 31 #include <map> 32 #include <stddef.h> 33 #include <string> 34 #include <utility> 35 #include <vector> 36 37 #include <unicode/uchar.h> 38 39 #include "phonenumbers/alternate_format.h" 40 #include "phonenumbers/base/logging.h" 41 #include "phonenumbers/base/memory/scoped_ptr.h" 42 #include "phonenumbers/base/memory/singleton.h" 43 #include "phonenumbers/callback.h" 44 #include "phonenumbers/default_logger.h" 45 #include "phonenumbers/encoding_utils.h" 46 #include "phonenumbers/normalize_utf8.h" 47 #include "phonenumbers/phonemetadata.pb.h" 48 #include "phonenumbers/phonenumber.pb.h" 49 #include "phonenumbers/phonenumbermatch.h" 50 #include "phonenumbers/phonenumberutil.h" 51 #include "phonenumbers/regexp_adapter.h" 52 #include "phonenumbers/regexp_adapter_icu.h" 53 #include "phonenumbers/stringutil.h" 54 55 #ifdef I18N_PHONENUMBERS_USE_RE2 56 #include "phonenumbers/regexp_adapter_re2.h" 57 #endif // I18N_PHONENUMBERS_USE_RE2_AND_ICU 58 59 using std::cerr; 60 using std::endl; 61 using std::make_pair; 62 using std::map; 63 using std::numeric_limits; 64 using std::string; 65 using std::vector; 66 67 namespace i18n { 68 namespace phonenumbers { 69 70 namespace { 71 // Returns a regular expression quantifier with an upper and lower limit. 72 string Limit(int lower, int upper) { 73 DCHECK_GE(lower, 0); 74 DCHECK_GT(upper, 0); 75 DCHECK_LT(lower, upper); 76 return StrCat("{", lower, ",", upper, "}"); 77 } 78 79 bool IsInvalidPunctuationSymbol(char32 character) { 80 return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL; 81 } 82 83 bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate, 84 const PhoneNumberUtil& util) { 85 // The characters 'x' and 'X' can be (1) a carrier code, in which case they 86 // always precede the national significant number or (2) an extension sign, 87 // in which case they always precede the extension number. We assume a 88 // carrier code is more than 1 digit, so the first case has to have more than 89 // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1 90 // 'x' or 'X'. 91 size_t found; 92 found = candidate.find_first_of("xX"); 93 // We ignore the character if 'x' or 'X' appears as the last character of 94 // the string. 95 while (found != string::npos && found < candidate.length() - 1) { 96 // We only look for 'x' or 'X' in ASCII form. 97 char next_char = candidate[found + 1]; 98 if (next_char == 'x' || next_char == 'X') { 99 // This is the carrier code case, in which the 'X's always precede the 100 // national significant number. 101 ++found; 102 if (util.IsNumberMatchWithOneString( 103 number, candidate.substr(found, candidate.length() - found)) 104 != PhoneNumberUtil::NSN_MATCH) { 105 return false; 106 } 107 } else { 108 string normalized_extension(candidate.substr(found, 109 candidate.length() - found)); 110 util.NormalizeDigitsOnly(&normalized_extension); 111 if (normalized_extension != number.extension()) { 112 return false; 113 } 114 } 115 found = candidate.find_first_of("xX", found + 1); 116 } 117 return true; 118 } 119 120 bool AllNumberGroupsRemainGrouped( 121 const PhoneNumberUtil& util, 122 const PhoneNumber& phone_number, 123 const string& normalized_candidate, 124 const vector<string>& formatted_number_groups) { 125 size_t from_index = 0; 126 // Check each group of consecutive digits are not broken into separate 127 // groupings in the normalized_candidate string. 128 for (size_t i = 0; i < formatted_number_groups.size(); ++i) { 129 // Fails if the substring of normalized_candidate starting from from_index 130 // doesn't contain the consecutive digits in formatted_number_groups.at(i). 131 from_index = normalized_candidate.find(formatted_number_groups.at(i), 132 from_index); 133 if (from_index == string::npos) { 134 return false; 135 } 136 // Moves from_index forward. 137 from_index += formatted_number_groups.at(i).length(); 138 if (i == 0 && from_index < normalized_candidate.length()) { 139 // We are at the position right after the NDC. Note although 140 // normalized_candidate might contain non-ASCII formatting characters, 141 // they won't be treated as ASCII digits when converted to a char. 142 if (isdigit(normalized_candidate.at(from_index))) { 143 // This means there is no formatting symbol after the NDC. In this case, 144 // we only accept the number if there is no formatting symbol at all in 145 // the number, except for extensions. 146 string national_significant_number; 147 util.GetNationalSignificantNumber( 148 phone_number, &national_significant_number); 149 return HasPrefixString(normalized_candidate.substr( 150 from_index - formatted_number_groups.at(i).length()), 151 national_significant_number); 152 } 153 } 154 } 155 // The check here makes sure that we haven't mistakenly already used the 156 // extension to match the last group of the subscriber number. Note the 157 // extension cannot have formatting in-between digits. 158 return normalized_candidate.substr(from_index) 159 .find(phone_number.extension()) != string::npos; 160 } 161 162 bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) { 163 #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS) 164 if (!alternate_formats->ParseFromArray(alternate_format_get(), 165 alternate_format_size())) { 166 cerr << "Could not parse binary data." << endl; 167 return false; 168 } 169 return true; 170 #else 171 return false; 172 #endif 173 } 174 175 } // namespace 176 177 class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> { 178 private: 179 friend class Singleton<PhoneNumberMatcherRegExps>; 180 181 string opening_parens_; 182 string closing_parens_; 183 string non_parens_; 184 // Limit on the number of pairs of brackets in a phone number. 185 string bracket_pair_limit_; 186 // Helper strings for the matching_brackets_ pattern. 187 // An opening bracket at the beginning may not be closed, but subsequent ones 188 // should be. It's also possible that the leading bracket was dropped, so we 189 // shouldn't be surprised if we see a closing bracket first. 190 string leading_maybe_matched_bracket_; 191 string bracket_pairs_; 192 // Limit on the number of leading (plus) characters. 193 string lead_limit_; 194 // Limit on the number of consecutive punctuation characters. 195 string punctuation_limit_; 196 // The maximum number of digits allowed in a digit-separated block. As we 197 // allow all digits in a single block, this should be set high enough to 198 // accommodate the entire national number and the international country code. 199 int digit_block_limit_; 200 // Limit on the number of blocks separated by punctuation. Uses 201 // kDigitBlockLimit since some formats use spaces to separate each digit. 202 string block_limit_; 203 // A punctuation sequence allowing white space. 204 string punctuation_; 205 // A digits block without punctuation. 206 string digit_sequence_; 207 // Punctuation that may be at the start of a phone number - brackets and plus 208 // signs. 209 string lead_class_chars_; 210 // Same as lead_class_chars_, but enclosed as a character class. 211 string lead_class_; 212 // Extra helper strings that form part of pattern_. These are stored 213 // separately since StrCat has a limit of 12 args. 214 string opening_punctuation_; 215 string optional_extn_pattern_; 216 217 public: 218 // We use two different reg-ex factories here for performance reasons. RE2 is 219 // much faster for smaller reg-ex patterns, but the main pattern cannot be 220 // handled by RE2 in an efficient way. 221 scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_; 222 scoped_ptr<const AbstractRegExpFactory> regexp_factory_; 223 224 // Matches strings that look like publication pages. Example: 225 // Computing Complete Answers to Queries in the Presence of Limited Access 226 // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003). 227 // 228 // The string "211-227 (2003)" is not a telephone number. 229 scoped_ptr<const RegExp> pub_pages_; 230 // Matches strings that look like dates using "/" as a separator. Examples: 231 // 3/10/2011, 31/10/96 or 08/31/95. 232 scoped_ptr<const RegExp> slash_separated_dates_; 233 // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does 234 // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_. 235 scoped_ptr<const RegExp> time_stamps_; 236 scoped_ptr<const RegExp> time_stamps_suffix_; 237 // Pattern to check that brackets match. Opening brackets should be closed 238 // within a phone number. This also checks that there is something inside the 239 // brackets. Having no brackets at all is also fine. 240 scoped_ptr<const RegExp> matching_brackets_; 241 // Matches white-space, which may indicate the end of a phone number and the 242 // start of something else (such as a neighbouring zip-code). If white-space 243 // is found, continues to match all characters that are not typically used to 244 // start a phone number. 245 scoped_ptr<const RegExp> group_separator_; 246 scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_; 247 scoped_ptr<const RegExp> capturing_ascii_digits_pattern_; 248 // Compiled reg-ex representing lead_class_; 249 scoped_ptr<const RegExp> lead_class_pattern_; 250 // Phone number pattern allowing optional punctuation. 251 scoped_ptr<const RegExp> pattern_; 252 253 PhoneNumberMatcherRegExps() 254 : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[" */), 255 closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\]" */), 256 non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")), 257 bracket_pair_limit_(Limit(0, 3)), 258 leading_maybe_matched_bracket_(StrCat( 259 "(?:[", opening_parens_, "])?", 260 "(?:", non_parens_, "+[", closing_parens_, "])?")), 261 bracket_pairs_(StrCat( 262 "(?:[", opening_parens_, "]", non_parens_, "+", 263 "[", closing_parens_, "])", bracket_pair_limit_)), 264 lead_limit_(Limit(0, 2)), 265 punctuation_limit_(Limit(0, 4)), 266 digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn + 267 PhoneNumberUtil::kMaxLengthCountryCode), 268 block_limit_(Limit(0, digit_block_limit_)), 269 punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]", 270 punctuation_limit_)), 271 digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))), 272 lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)), 273 lead_class_(StrCat("[", lead_class_chars_, "]")), 274 opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")), 275 optional_extn_pattern_(StrCat( 276 "(?i)(?:", 277 PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(), 278 ")?")), 279 regexp_factory_for_pattern_(new ICURegExpFactory()), 280 #ifdef I18N_PHONENUMBERS_USE_RE2 281 regexp_factory_(new RE2RegExpFactory()), 282 #else 283 regexp_factory_(new ICURegExpFactory()), 284 #endif // I18N_PHONENUMBERS_USE_RE2 285 pub_pages_(regexp_factory_->CreateRegExp( 286 "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")), 287 slash_separated_dates_(regexp_factory_->CreateRegExp( 288 "(?:(?:[0-3]?\\d/[01]?\\d)|" 289 "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")), 290 time_stamps_(regexp_factory_->CreateRegExp( 291 "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")), 292 time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")), 293 matching_brackets_(regexp_factory_->CreateRegExp( 294 StrCat(leading_maybe_matched_bracket_, non_parens_, "+", 295 bracket_pairs_, non_parens_, "*"))), 296 group_separator_(regexp_factory_->CreateRegExp( 297 StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))), 298 capture_up_to_second_number_start_pattern_( 299 regexp_factory_->CreateRegExp( 300 PhoneNumberUtil::kCaptureUpToSecondNumberStart)), 301 capturing_ascii_digits_pattern_( 302 regexp_factory_->CreateRegExp("(\\d+)")), 303 lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)), 304 pattern_(regexp_factory_for_pattern_->CreateRegExp( 305 StrCat("(", opening_punctuation_, lead_limit_, 306 digit_sequence_, "(?:", punctuation_, digit_sequence_, ")", 307 block_limit_, optional_extn_pattern_, ")"))) { 308 } 309 310 private: 311 DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps); 312 }; 313 314 class AlternateFormats : public Singleton<AlternateFormats> { 315 public: 316 PhoneMetadataCollection format_data_; 317 318 map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_; 319 320 AlternateFormats() 321 : format_data_(), 322 calling_code_to_alternate_formats_map_() { 323 if (!LoadAlternateFormats(&format_data_)) { 324 LOG(DFATAL) << "Could not parse compiled-in metadata."; 325 return; 326 } 327 for (RepeatedPtrField<PhoneMetadata>::const_iterator it = 328 format_data_.metadata().begin(); 329 it != format_data_.metadata().end(); 330 ++it) { 331 calling_code_to_alternate_formats_map_.insert( 332 make_pair(it->country_code(), &*it)); 333 } 334 } 335 336 const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code) 337 const { 338 map<int, const PhoneMetadata*>::const_iterator it = 339 calling_code_to_alternate_formats_map_.find(country_calling_code); 340 if (it != calling_code_to_alternate_formats_map_.end()) { 341 return it->second; 342 } 343 return NULL; 344 } 345 346 private: 347 DISALLOW_COPY_AND_ASSIGN(AlternateFormats); 348 }; 349 350 PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util, 351 const string& text, 352 const string& region_code, 353 PhoneNumberMatcher::Leniency leniency, 354 int max_tries) 355 : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()), 356 alternate_formats_(AlternateFormats::GetInstance()), 357 phone_util_(util), 358 text_(text), 359 preferred_region_(region_code), 360 leniency_(leniency), 361 max_tries_(max_tries), 362 state_(NOT_READY), 363 last_match_(NULL), 364 search_index_(0) { 365 } 366 367 PhoneNumberMatcher::PhoneNumberMatcher(const string& text, 368 const string& region_code) 369 : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()), 370 alternate_formats_(NULL), // Not used. 371 phone_util_(*PhoneNumberUtil::GetInstance()), 372 text_(text), 373 preferred_region_(region_code), 374 leniency_(VALID), 375 max_tries_(numeric_limits<int>::max()), 376 state_(NOT_READY), 377 last_match_(NULL), 378 search_index_(0) { 379 } 380 381 PhoneNumberMatcher::~PhoneNumberMatcher() { 382 } 383 384 // static 385 bool PhoneNumberMatcher::IsLatinLetter(char32 letter) { 386 // Combining marks are a subset of non-spacing-mark. 387 if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) { 388 return false; 389 } 390 UBlockCode block = ublock_getCode(letter); 391 return ((block == UBLOCK_BASIC_LATIN) || 392 (block == UBLOCK_LATIN_1_SUPPLEMENT) || 393 (block == UBLOCK_LATIN_EXTENDED_A) || 394 (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) || 395 (block == UBLOCK_LATIN_EXTENDED_B) || 396 (block == UBLOCK_COMBINING_DIACRITICAL_MARKS)); 397 } 398 399 bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset, 400 PhoneNumberMatch* match) { 401 DCHECK(match); 402 // Check the candidate doesn't contain any formatting which would indicate 403 // that it really isn't a phone number. 404 if (!reg_exps_->matching_brackets_->FullMatch(candidate)) { 405 return false; 406 } 407 408 // If leniency is set to VALID or stricter, we also want to skip numbers that 409 // are surrounded by Latin alphabetic characters, to skip cases like 410 // abc8005001234 or 8005001234def. 411 if (leniency_ >= VALID) { 412 // If the candidate is not at the start of the text, and does not start with 413 // phone-number punctuation, check the previous character. 414 scoped_ptr<RegExpInput> candidate_input( 415 reg_exps_->regexp_factory_->CreateInput(candidate)); 416 if (offset > 0 && 417 !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) { 418 char32 previous_char; 419 const char* previous_char_ptr = 420 EncodingUtils::BackUpOneUTF8Character(text_.c_str(), 421 text_.c_str() + offset); 422 EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char); 423 // We return false if it is a latin letter or an invalid punctuation 424 // symbol. 425 if (IsInvalidPunctuationSymbol(previous_char) || 426 IsLatinLetter(previous_char)) { 427 return false; 428 } 429 } 430 size_t lastCharIndex = offset + candidate.length(); 431 if (lastCharIndex < text_.length()) { 432 char32 next_char; 433 const char* next_char_ptr = 434 EncodingUtils::AdvanceOneUTF8Character( 435 text_.c_str() + lastCharIndex - 1); 436 EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char); 437 if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) { 438 return false; 439 } 440 } 441 } 442 443 PhoneNumber number; 444 if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) != 445 PhoneNumberUtil::NO_PARSING_ERROR) { 446 return false; 447 } 448 if (VerifyAccordingToLeniency(leniency_, number, candidate)) { 449 match->set_start(offset); 450 match->set_raw_string(candidate); 451 // We used ParseAndKeepRawInput to create this number, but for now we don't 452 // return the extra values parsed. TODO: stop clearing all values here and 453 // switch all users over to using raw_input() rather than the raw_string() 454 // of PhoneNumberMatch. 455 number.clear_country_code_source(); 456 number.clear_preferred_domestic_carrier_code(); 457 number.clear_raw_input(); 458 match->set_number(number); 459 return true; 460 } 461 return false; 462 } 463 464 // Helper method to replace the verification method for each enum in the Java 465 // version. 466 bool PhoneNumberMatcher::VerifyAccordingToLeniency( 467 Leniency leniency, const PhoneNumber& number, 468 const string& candidate) const { 469 switch (leniency) { 470 case PhoneNumberMatcher::POSSIBLE: 471 return phone_util_.IsPossibleNumber(number); 472 case PhoneNumberMatcher::VALID: 473 if (!phone_util_.IsValidNumber(number) || 474 !ContainsOnlyValidXChars(number, candidate, phone_util_)) { 475 return false; 476 } 477 return IsNationalPrefixPresentIfRequired(number); 478 case PhoneNumberMatcher::STRICT_GROUPING: { 479 if (!phone_util_.IsValidNumber(number) || 480 !ContainsOnlyValidXChars(number, candidate, phone_util_) || 481 // Two or more slashes were present. 482 (FindNth(candidate, '/', 2) != string::npos) || 483 !IsNationalPrefixPresentIfRequired(number)) { 484 return false; 485 } 486 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, 487 const string&, const vector<string>&>* callback = 488 NewPermanentCallback(&AllNumberGroupsRemainGrouped); 489 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); 490 delete(callback); 491 return is_valid; 492 } 493 case PhoneNumberMatcher::EXACT_GROUPING: { 494 if (!phone_util_.IsValidNumber(number) || 495 !ContainsOnlyValidXChars(number, candidate, phone_util_) || 496 // Two or more slashes were present. 497 (FindNth(candidate, '/', 2) != string::npos) || 498 !IsNationalPrefixPresentIfRequired(number)) { 499 return false; 500 } 501 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, 502 const string&, const vector<string>&>* callback = 503 NewPermanentCallback( 504 this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent); 505 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); 506 delete(callback); 507 return is_valid; 508 } 509 default: 510 LOG(ERROR) << "No implementation defined for verification for leniency " 511 << static_cast<int>(leniency); 512 return false; 513 } 514 } 515 516 bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset, 517 PhoneNumberMatch* match) { 518 DCHECK(match); 519 // Try removing either the first or last "group" in the number and see if this 520 // gives a result. We consider white space to be a possible indication of 521 // the start or end of the phone number. 522 scoped_ptr<RegExpInput> candidate_input( 523 reg_exps_->regexp_factory_->CreateInput(candidate)); 524 if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), 525 NULL)) { 526 // Try the first group by itself. 527 int group_start_index = 528 candidate.length() - candidate_input->ToString().length(); 529 string first_group_only = candidate.substr(0, group_start_index); 530 phone_util_.TrimUnwantedEndChars(&first_group_only); 531 bool success = ParseAndVerify(first_group_only, offset, match); 532 if (success) { 533 return true; 534 } 535 --max_tries_; 536 537 // Try the rest of the candidate without the first group. 538 string without_first_group(candidate_input->ToString()); 539 phone_util_.TrimUnwantedEndChars(&without_first_group); 540 success = 541 ParseAndVerify(without_first_group, offset + group_start_index, match); 542 if (success) { 543 return true; 544 } 545 --max_tries_; 546 547 if (max_tries_ > 0) { 548 while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), 549 NULL)) { 550 // Find the last group. 551 } 552 int last_group_start = 553 candidate.length() - candidate_input->ToString().length(); 554 string without_last_group = candidate.substr(0, last_group_start); 555 phone_util_.TrimUnwantedEndChars(&without_last_group); 556 if (without_last_group == first_group_only) { 557 // If there are only two groups, then the group "without the last group" 558 // is the same as the first group. In these cases, we don't want to 559 // re-check the number group, so we exit already. 560 return false; 561 } 562 success = ParseAndVerify(without_last_group, offset, match); 563 if (success) { 564 return true; 565 } 566 --max_tries_; 567 } 568 } 569 return false; 570 } 571 572 bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset, 573 PhoneNumberMatch* match) { 574 DCHECK(match); 575 // Skip a match that is more likely a publication page reference or a date. 576 if (reg_exps_->pub_pages_->PartialMatch(candidate) || 577 reg_exps_->slash_separated_dates_->PartialMatch(candidate)) { 578 return false; 579 } 580 // Skip potential time-stamps. 581 if (reg_exps_->time_stamps_->PartialMatch(candidate)) { 582 scoped_ptr<RegExpInput> following_text( 583 reg_exps_->regexp_factory_->CreateInput( 584 text_.substr(offset + candidate.size()))); 585 if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) { 586 return false; 587 } 588 } 589 590 // Try to come up with a valid match given the entire candidate. 591 if (ParseAndVerify(candidate, offset, match)) { 592 return true; 593 } 594 595 // If that failed, try to find an "inner match" - there might be a phone 596 // number within this candidate. 597 return ExtractInnerMatch(candidate, offset, match); 598 } 599 600 bool PhoneNumberMatcher::HasNext() { 601 if (state_ == NOT_READY) { 602 PhoneNumberMatch temp_match; 603 if (!Find(search_index_, &temp_match)) { 604 state_ = DONE; 605 } else { 606 last_match_.reset(new PhoneNumberMatch(temp_match.start(), 607 temp_match.raw_string(), 608 temp_match.number())); 609 search_index_ = last_match_->end(); 610 state_ = READY; 611 } 612 } 613 return state_ == READY; 614 } 615 616 bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) { 617 DCHECK(match); 618 // Check the state and find the next match as a side-effect if necessary. 619 if (!HasNext()) { 620 return false; 621 } 622 match->CopyFrom(*last_match_); 623 state_ = NOT_READY; 624 last_match_.reset(NULL); 625 return true; 626 } 627 628 bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) { 629 DCHECK(match); 630 631 scoped_ptr<RegExpInput> text( 632 reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index))); 633 string candidate; 634 while ((max_tries_ > 0) && 635 reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) { 636 int start = text_.length() - text->ToString().length() - candidate.length(); 637 // Check for extra numbers at the end. 638 reg_exps_->capture_up_to_second_number_start_pattern_-> 639 PartialMatch(candidate, &candidate); 640 if (ExtractMatch(candidate, start, match)) { 641 return true; 642 } 643 644 index = start + candidate.length(); 645 --max_tries_; 646 } 647 return false; 648 } 649 650 bool PhoneNumberMatcher::CheckNumberGroupingIsValid( 651 const PhoneNumber& phone_number, 652 const string& candidate, 653 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, 654 const string&, const vector<string>&>* checker) const { 655 DCHECK(checker); 656 // TODO: Evaluate how this works for other locales (testing has been limited 657 // to NANPA regions) and optimise if necessary. 658 string normalized_candidate = 659 NormalizeUTF8::NormalizeDecimalDigits(candidate); 660 vector<string> formatted_number_groups; 661 GetNationalNumberGroups(phone_number, NULL, // Use default formatting pattern 662 &formatted_number_groups); 663 if (checker->Run(phone_util_, phone_number, normalized_candidate, 664 formatted_number_groups)) { 665 return true; 666 } 667 // If this didn't pass, see if there are any alternate formats, and try them 668 // instead. 669 const PhoneMetadata* alternate_formats = 670 alternate_formats_->GetAlternateFormatsForCountry( 671 phone_number.country_code()); 672 if (alternate_formats) { 673 for (RepeatedPtrField<NumberFormat>::const_iterator it = 674 alternate_formats->number_format().begin(); 675 it != alternate_formats->number_format().end(); ++it) { 676 formatted_number_groups.clear(); 677 GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups); 678 if (checker->Run(phone_util_, phone_number, normalized_candidate, 679 formatted_number_groups)) { 680 return true; 681 } 682 } 683 } 684 return false; 685 } 686 687 // Helper method to get the national-number part of a number, formatted without 688 // any national prefix, and return it as a set of digit blocks that would be 689 // formatted together. 690 void PhoneNumberMatcher::GetNationalNumberGroups( 691 const PhoneNumber& number, 692 const NumberFormat* formatting_pattern, 693 vector<string>* digit_blocks) const { 694 string rfc3966_format; 695 if (!formatting_pattern) { 696 // This will be in the format +CC-DG;ext=EXT where DG represents groups of 697 // digits. 698 phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format); 699 // We remove the extension part from the formatted string before splitting 700 // it into different groups. 701 size_t end_index = rfc3966_format.find(';'); 702 if (end_index == string::npos) { 703 end_index = rfc3966_format.length(); 704 } 705 // The country-code will have a '-' following it. 706 size_t start_index = rfc3966_format.find('-') + 1; 707 SplitStringUsing(rfc3966_format.substr(start_index, 708 end_index - start_index), 709 "-", digit_blocks); 710 } else { 711 // We format the NSN only, and split that according to the separator. 712 string national_significant_number; 713 phone_util_.GetNationalSignificantNumber(number, 714 &national_significant_number); 715 phone_util_.FormatNsnUsingPattern(national_significant_number, 716 *formatting_pattern, 717 PhoneNumberUtil::RFC3966, 718 &rfc3966_format); 719 SplitStringUsing(rfc3966_format, "-", digit_blocks); 720 } 721 } 722 723 bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired( 724 const PhoneNumber& number) const { 725 // First, check how we deduced the country code. If it was written in 726 // international format, then the national prefix is not required. 727 if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) { 728 return true; 729 } 730 string phone_number_region; 731 phone_util_.GetRegionCodeForCountryCode( 732 number.country_code(), &phone_number_region); 733 const PhoneMetadata* metadata = 734 phone_util_.GetMetadataForRegion(phone_number_region); 735 if (!metadata) { 736 return true; 737 } 738 // Check if a national prefix should be present when formatting this number. 739 string national_number; 740 phone_util_.GetNationalSignificantNumber(number, &national_number); 741 const NumberFormat* format_rule = 742 phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(), 743 national_number); 744 // To do this, we check that a national prefix formatting rule was present and 745 // that it wasn't just the first-group symbol ($1) with punctuation. 746 if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) { 747 if (format_rule->national_prefix_optional_when_formatting()) { 748 // The national-prefix is optional in these cases, so we don't need to 749 // check if it was present. 750 return true; 751 } 752 if (phone_util_.FormattingRuleHasFirstGroupOnly( 753 format_rule->national_prefix_formatting_rule())) { 754 // National Prefix not needed for this number. 755 return true; 756 } 757 // Normalize the remainder. 758 string raw_input_copy(number.raw_input()); 759 // Check if we found a national prefix and/or carrier code at the start of 760 // the raw input, and return the result. 761 phone_util_.NormalizeDigitsOnly(&raw_input_copy); 762 return phone_util_.MaybeStripNationalPrefixAndCarrierCode( 763 *metadata, 764 &raw_input_copy, 765 NULL); // Don't need to keep the stripped carrier code. 766 } 767 return true; 768 } 769 770 bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent( 771 const PhoneNumberUtil& util, 772 const PhoneNumber& phone_number, 773 const string& normalized_candidate, 774 const vector<string>& formatted_number_groups) const { 775 const scoped_ptr<RegExpInput> candidate_number( 776 reg_exps_->regexp_factory_->CreateInput(normalized_candidate)); 777 vector<string> candidate_groups; 778 string digit_block; 779 while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume( 780 candidate_number.get(), 781 &digit_block)) { 782 candidate_groups.push_back(digit_block); 783 } 784 785 // Set this to the last group, skipping it if the number has an extension. 786 int candidate_number_group_index = 787 phone_number.has_extension() ? candidate_groups.size() - 2 788 : candidate_groups.size() - 1; 789 // First we check if the national significant number is formatted as a block. 790 // We use find and not equals, since the national significant number may be 791 // present with a prefix such as a national number prefix, or the country code 792 // itself. 793 string national_significant_number; 794 util.GetNationalSignificantNumber(phone_number, 795 &national_significant_number); 796 if (candidate_groups.size() == 1 || 797 candidate_groups.at(candidate_number_group_index).find( 798 national_significant_number) != string::npos) { 799 return true; 800 } 801 // Starting from the end, go through in reverse, excluding the first group, 802 // and check the candidate and number groups are the same. 803 for (int formatted_number_group_index = 804 (formatted_number_groups.size() - 1); 805 formatted_number_group_index > 0 && 806 candidate_number_group_index >= 0; 807 --formatted_number_group_index, --candidate_number_group_index) { 808 if (candidate_groups.at(candidate_number_group_index) != 809 formatted_number_groups.at(formatted_number_group_index)) { 810 return false; 811 } 812 } 813 // Now check the first group. There may be a national prefix at the start, so 814 // we only check that the candidate group ends with the formatted number 815 // group. 816 return (candidate_number_group_index >= 0 && 817 HasSuffixString(candidate_groups.at(candidate_number_group_index), 818 formatted_number_groups.at(0))); 819 } 820 821 } // namespace phonenumbers 822 } // namespace i18n 823