1 /* 2 * Copyright (C) 2012 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 33 // Magic pretend-to-be-a-chromium-build flags 34 #undef WEBKIT_IMPLEMENTATION 35 #undef LOG 36 37 #include "content/address_detector.h" 38 39 #include <bitset> 40 41 #include "base/utf_string_conversions.h" 42 #include "net/base/escape.h" 43 #include "Settings.h" 44 #include "WebString.h" 45 46 #include <wtf/HashSet.h> 47 #include <wtf/Noncopyable.h> 48 #include <wtf/text/StringHash.h> 49 #include <wtf/text/WTFString.h> 50 51 namespace { 52 53 // Prefix used for geographical address intent URIs. 54 static const char kAddressSchemaPrefix[] = "geo:0,0?q="; 55 56 // Maximum text length to be searched for address detection. 57 static const size_t kMaxAddressLength = 500; 58 59 // Minimum number of words in an address after the house number 60 // before a state is expected to be found. 61 // A value too high can miss short addresses. 62 const size_t kMinAddressWords = 3; 63 64 // Maximum number of words allowed in an address between the house number 65 // and the state, both not included. 66 const size_t kMaxAddressWords = 12; 67 68 // Maximum number of lines allowed in an address between the house number 69 // and the state, both not included. 70 const size_t kMaxAddressLines = 5; 71 72 // Maximum length allowed for any address word between the house number 73 // and the state, both not included. 74 const size_t kMaxAddressNameWordLength = 25; 75 76 // Maximum number of words after the house number in which the location name 77 // should be found. 78 const size_t kMaxLocationNameDistance = 4; 79 80 // Number of digits for a valid zip code. 81 const size_t kZipDigits = 5; 82 83 // Number of digits for a valid zip code in the Zip Plus 4 format. 84 const size_t kZipPlus4Digits = 9; 85 86 // Maximum number of digits of a house number, including possible hyphens. 87 const size_t kMaxHouseDigits = 5; 88 89 // Additional characters used as new line delimiters. 90 const char16 kNewlineDelimiters[] = { 91 ',', 92 '*', 93 0x2022, // Unicode bullet 94 0, 95 }; 96 97 char16 SafePreviousChar(const string16::const_iterator& it, 98 const string16::const_iterator& begin) { 99 if (it == begin) 100 return ' '; 101 return *(it - 1); 102 } 103 104 char16 SafeNextChar(const string16::const_iterator& it, 105 const string16::const_iterator& end) { 106 if (it == end) 107 return ' '; 108 return *(it + 1); 109 } 110 111 bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin, 112 string16::const_iterator word_end, const char* ascii_to_match) { 113 for (string16::const_iterator it = word_begin; it != word_end; 114 ++it, ++ascii_to_match) { 115 if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match) 116 return false; 117 } 118 return *ascii_to_match == 0 || *ascii_to_match == ' '; 119 } 120 121 bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin, 122 string16::const_iterator word_end, const char* ascii_to_match, 123 bool allow_plural) { 124 for (string16::const_iterator it = word_begin; it != word_end; 125 ++it, ++ascii_to_match) { 126 if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end) 127 return true; 128 129 if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match) 130 return false; 131 } 132 return *ascii_to_match == 0; 133 } 134 135 } // anonymous namespace 136 137 138 AddressDetector::AddressDetector() { 139 } 140 141 AddressDetector::~AddressDetector() { 142 } 143 144 std::string AddressDetector::GetContentText(const WebKit::WebRange& range) { 145 // Get the address and replace unicode bullets with commas. 146 string16 address_16 = CollapseWhitespace(range.toPlainText(), false); 147 std::replace(address_16.begin(), address_16.end(), 148 static_cast<char16>(0x2022), static_cast<char16>(',')); 149 return UTF16ToUTF8(address_16); 150 } 151 152 GURL AddressDetector::GetIntentURL(const std::string& content_text) { 153 return GURL(kAddressSchemaPrefix + 154 EscapeQueryParamValue(content_text, true)); 155 } 156 157 size_t AddressDetector::GetMaximumContentLength() { 158 return kMaxAddressLength; 159 } 160 161 bool AddressDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test) { 162 WebCore::Settings* settings = GetSettings(hit_test); 163 return settings && settings->formatDetectionAddress(); 164 } 165 166 bool AddressDetector::FindContent(const string16::const_iterator& begin, 167 const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) { 168 HouseNumberParser house_number_parser; 169 170 // Keep going through the input string until a potential house number is 171 // detected. Start tokenizing the following words to find a valid 172 // street name within a word range. Then, find a state name followed 173 // by a valid zip code for that state. Also keep a look for any other 174 // possible house numbers to continue from in case of no match and for 175 // state names not followed by a zip code (e.g. New York, NY 10000). 176 const string16 newline_delimiters = kNewlineDelimiters; 177 const string16 delimiters = kWhitespaceUTF16 + newline_delimiters; 178 for (string16::const_iterator it = begin; it != end; ) { 179 Word house_number; 180 if (!house_number_parser.Parse(it, end, &house_number)) 181 return false; 182 183 String16Tokenizer tokenizer(house_number.end, end, delimiters); 184 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); 185 186 std::vector<Word> words; 187 words.push_back(house_number); 188 189 bool found_location_name = false; 190 bool continue_on_house_number = true; 191 size_t next_house_number_word = 0; 192 size_t num_lines = 1; 193 194 // Don't include the house number in the word count. 195 size_t next_word = 1; 196 for (; next_word <= kMaxAddressWords + 1; ++next_word) { 197 198 // Extract a new word from the tokenizer. 199 if (next_word == words.size()) { 200 do { 201 if (!tokenizer.GetNext()) 202 return false; 203 204 // Check the number of address lines. 205 if (tokenizer.token_is_delim() && newline_delimiters.find( 206 *tokenizer.token_begin()) != string16::npos) { 207 ++num_lines; 208 } 209 } while (tokenizer.token_is_delim()); 210 211 if (num_lines > kMaxAddressLines) 212 break; 213 214 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); 215 } 216 217 // Check the word length. If too long, don't try to continue from 218 // the next house number as no address can hold this word. 219 const Word& current_word = words[next_word]; 220 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); 221 size_t current_word_length = std::distance( 222 current_word.begin, current_word.end); 223 if (current_word_length > kMaxAddressNameWordLength) { 224 continue_on_house_number = false; 225 break; 226 } 227 228 // Check if the new word is a valid house number. 229 // This is used to properly resume parsing in case the maximum number 230 // of words is exceeded. 231 if (next_house_number_word == 0 && 232 house_number_parser.Parse(current_word.begin, current_word.end, NULL)) { 233 next_house_number_word = next_word; 234 continue; 235 } 236 237 // Look for location names in the words after the house number. 238 // A range limitation is introduced to avoid matching 239 // anything that starts with a number before a legitimate address. 240 if (next_word <= kMaxLocationNameDistance && 241 IsValidLocationName(current_word)) { 242 found_location_name = true; 243 continue; 244 } 245 246 // Don't count the house number. 247 if (next_word > kMinAddressWords) { 248 // Looking for the state is likely to add new words to the list while 249 // checking for multi-word state names. 250 size_t state_first_word = next_word; 251 size_t state_last_word, state_index; 252 if (FindStateStartingInWord(&words, state_first_word, &state_last_word, 253 &tokenizer, &state_index)) { 254 255 // A location name should have been found at this point. 256 if (!found_location_name) 257 break; 258 259 // Explicitly exclude "et al", as "al" is a valid state code. 260 if (current_word_length == 2 && words.size() > 2) { 261 const Word& previous_word = words[state_first_word - 1]; 262 if (previous_word.end - previous_word.begin == 2 && 263 LowerCaseEqualsASCII(previous_word.begin, previous_word.end, 264 "et") && 265 LowerCaseEqualsASCII(current_word.begin, current_word.end, 266 "al")) 267 break; 268 } 269 270 // Extract one more word from the tokenizer if not already available. 271 size_t zip_word = state_last_word + 1; 272 if (zip_word == words.size()) { 273 do { 274 if (!tokenizer.GetNext()) { 275 // Zip is optional 276 *start_pos = words[0].begin - begin; 277 *end_pos = words[state_last_word].end - begin; 278 return true; 279 } 280 } while (tokenizer.token_is_delim()); 281 words.push_back(Word(tokenizer.token_begin(), 282 tokenizer.token_end())); 283 } 284 285 // Check the parsing validity and state range of the zip code. 286 next_word = state_last_word; 287 if (!IsZipValid(words[zip_word], state_index)) 288 continue; 289 290 *start_pos = words[0].begin - begin; 291 *end_pos = words[zip_word].end - begin; 292 return true; 293 } 294 } 295 } 296 297 // Avoid skipping too many words because of a non-address number 298 // at the beginning of the contents to parse. 299 if (continue_on_house_number && next_house_number_word > 0) { 300 it = words[next_house_number_word].begin; 301 } else { 302 DCHECK(!words.empty()); 303 next_word = std::min(next_word, words.size() - 1); 304 it = words[next_word].end; 305 } 306 } 307 308 return false; 309 } 310 311 bool AddressDetector::HouseNumberParser::IsPreDelimiter( 312 char16 character) { 313 return character == ':' || IsPostDelimiter(character); 314 } 315 316 bool AddressDetector::HouseNumberParser::IsPostDelimiter( 317 char16 character) { 318 return IsWhitespace(character) || strchr(",\"'", character); 319 } 320 321 void AddressDetector::HouseNumberParser::RestartOnNextDelimiter() { 322 ResetState(); 323 for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {} 324 } 325 326 void AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) { 327 size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)), 328 num_chars); 329 it_ += offset; 330 result_chars_ += offset; 331 } 332 333 void AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) { 334 it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars); 335 } 336 337 void AddressDetector::HouseNumberParser::ResetState() { 338 num_digits_ = 0; 339 result_chars_ = 0; 340 } 341 342 bool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const { 343 // There should always be a number after a hyphen. 344 if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-') 345 return false; 346 347 if (word) { 348 word->begin = it_ - result_chars_; 349 word->end = it_; 350 } 351 return true; 352 } 353 354 bool AddressDetector::HouseNumberParser::Parse( 355 const string16::const_iterator& begin, 356 const string16::const_iterator& end, Word* word) { 357 it_ = begin_ = begin; 358 end_ = end; 359 ResetState(); 360 361 // Iterations only used as a fail-safe against any buggy infinite loops. 362 size_t iterations = 0; 363 size_t max_iterations = end - begin + 1; 364 for (; it_ != end_ && iterations < max_iterations; ++iterations) { 365 366 // Word finished case. 367 if (IsPostDelimiter(*it_)) { 368 if (CheckFinished(word)) 369 return true; 370 else if (result_chars_) 371 ResetState(); 372 373 SkipChars(1); 374 continue; 375 } 376 377 // More digits. There should be no more after a letter was found. 378 if (IsAsciiDigit(*it_)) { 379 if (num_digits_ >= kMaxHouseDigits) { 380 RestartOnNextDelimiter(); 381 } else { 382 AcceptChars(1); 383 ++num_digits_; 384 } 385 continue; 386 } 387 388 if (IsAsciiAlpha(*it_)) { 389 // Handle special case 'one'. 390 if (result_chars_ == 0) { 391 if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one")) 392 AcceptChars(3); 393 else 394 RestartOnNextDelimiter(); 395 continue; 396 } 397 398 // There should be more than 1 character because of result_chars. 399 DCHECK_GT(result_chars_, 0U); 400 DCHECK_NE(it_, begin_); 401 char16 previous = SafePreviousChar(it_, begin_); 402 if (IsAsciiDigit(previous)) { 403 // Check cases like '12A'. 404 char16 next = SafeNextChar(it_, end_); 405 if (IsPostDelimiter(next)) { 406 AcceptChars(1); 407 continue; 408 } 409 410 // Handle cases like 12a, 1st, 2nd, 3rd, 7th. 411 if (IsAsciiAlpha(next)) { 412 char16 last_digit = previous; 413 char16 first_letter = base::ToLowerASCII(*it_); 414 char16 second_letter = base::ToLowerASCII(next); 415 bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' && 416 num_digits_ == 2; 417 418 switch (last_digit - '0') { 419 case 1: 420 if ((first_letter == 's' && second_letter == 't') || 421 (first_letter == 't' && second_letter == 'h' && is_teen)) { 422 AcceptChars(2); 423 continue; 424 } 425 break; 426 427 case 2: 428 if ((first_letter == 'n' && second_letter == 'd') || 429 (first_letter == 't' && second_letter == 'h' && is_teen)) { 430 AcceptChars(2); 431 continue; 432 } 433 break; 434 435 case 3: 436 if ((first_letter == 'r' && second_letter == 'd') || 437 (first_letter == 't' && second_letter == 'h' && is_teen)) { 438 AcceptChars(2); 439 continue; 440 } 441 break; 442 443 case 0: 444 // Explicitly exclude '0th'. 445 if (num_digits_ == 1) 446 break; 447 448 case 4: 449 case 5: 450 case 6: 451 case 7: 452 case 8: 453 case 9: 454 if (first_letter == 't' && second_letter == 'h') { 455 AcceptChars(2); 456 continue; 457 } 458 break; 459 460 default: 461 NOTREACHED(); 462 } 463 } 464 } 465 466 RestartOnNextDelimiter(); 467 continue; 468 } 469 470 if (*it_ == '-' && num_digits_ > 0) { 471 AcceptChars(1); 472 ++num_digits_; 473 continue; 474 } 475 476 RestartOnNextDelimiter(); 477 SkipChars(1); 478 } 479 480 if (iterations >= max_iterations) 481 return false; 482 483 return CheckFinished(word); 484 } 485 486 bool AddressDetector::FindStateStartingInWord(WordList* words, 487 size_t state_first_word, size_t* state_last_word, 488 String16Tokenizer* tokenizer, size_t* state_index) { 489 490 // Bitmasks containing the allowed suffixes for 2-letter state codes. 491 static const int state_two_letter_suffix[23] = { 492 0x02060c00, // A followed by: [KLRSZ]. 493 0x00000000, // B. 494 0x00084001, // C followed by: [AOT]. 495 0x00000014, // D followed by: [CE]. 496 0x00000000, // E. 497 0x00001800, // F followed by: [LM]. 498 0x00100001, // G followed by: [AU]. 499 0x00000100, // H followed by: [I]. 500 0x00002809, // I followed by: [ADLN]. 501 0x00000000, // J. 502 0x01040000, // K followed by: [SY]. 503 0x00000001, // L followed by: [A]. 504 0x000ce199, // M followed by: [ADEHINOPST]. 505 0x0120129c, // N followed by: [CDEHJMVY]. 506 0x00020480, // O followed by: [HKR]. 507 0x00420001, // P followed by: [ARW]. 508 0x00000000, // Q. 509 0x00000100, // R followed by: [I]. 510 0x0000000c, // S followed by: [CD]. 511 0x00802000, // T followed by: [NX]. 512 0x00080000, // U followed by: [T]. 513 0x00080101, // V followed by: [AIT]. 514 0x01200101 // W followed by: [AIVY]. 515 }; 516 517 // Accumulative number of states for the 2-letter code indexed by the first. 518 static const int state_two_letter_accumulative[24] = { 519 0, 5, 5, 8, 10, 10, 12, 14, 520 15, 19, 19, 21, 22, 32, 40, 43, 521 46, 46, 47, 49, 51, 52, 55, 59 522 }; 523 524 // State names sorted alphabetically with their lengths. 525 // There can be more than one possible name for a same state if desired. 526 static const struct StateNameInfo { 527 const char* string; 528 char first_word_length; 529 char length; 530 char state_index; // Relative to two-character code alphabetical order. 531 } state_names[59] = { 532 { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 }, 533 { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 }, 534 { "arkansas", 8, 8, 2 }, 535 { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 }, 536 { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 }, 537 { "district of columbia", 8, 20, 8 }, 538 { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 }, 539 { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 }, 540 { "hawaii", 6, 6, 14 }, 541 { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 }, 542 { "iowa", 4, 4, 15 }, 543 { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 }, 544 { "louisiana", 9, 9, 21 }, 545 { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 }, 546 { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 }, 547 { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 }, 548 { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 }, 549 { "montana", 7, 7, 31 }, 550 { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 }, 551 { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 }, 552 { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 }, 553 { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 }, 554 { "northern mariana islands", 8, 24, 29 }, 555 { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 }, 556 { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 }, 557 { "puerto rico", 6, 11, 44 }, 558 { "rhode island", 5, 5, 46 }, 559 { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 }, 560 { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 }, 561 { "utah", 4, 4, 51 }, 562 { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 }, 563 { "virginia", 8, 8, 52 }, 564 { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 }, 565 { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 } 566 }; 567 568 // Accumulative number of states for sorted names indexed by the first letter. 569 // Required a different one since there are codes that don't share their 570 // first letter with the name of their state (MP = Northern Mariana Islands). 571 static const int state_names_accumulative[24] = { 572 0, 5, 5, 8, 10, 10, 12, 14, 573 15, 19, 19, 21, 22, 31, 40, 43, 574 46, 46, 47, 49, 51, 52, 55, 59 575 }; 576 577 DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1], 578 static_cast<int>(ARRAYSIZE_UNSAFE(state_names))); 579 580 const Word& first_word = words->at(state_first_word); 581 int length = first_word.end - first_word.begin; 582 if (length < 2 || !IsAsciiAlpha(*first_word.begin)) 583 return false; 584 585 // No state names start with x, y, z. 586 char16 first_letter = base::ToLowerASCII(*first_word.begin); 587 if (first_letter > 'w') 588 return false; 589 590 DCHECK(first_letter >= 'a'); 591 int first_index = first_letter - 'a'; 592 593 // Look for two-letter state names. 594 if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) { 595 char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1)); 596 DCHECK(second_letter >= 'a'); 597 598 int second_index = second_letter - 'a'; 599 if (!(state_two_letter_suffix[first_index] & (1 << second_index))) 600 return false; 601 602 std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] & 603 ((1 << second_index) - 1); 604 *state_last_word = state_first_word; 605 *state_index = state_two_letter_accumulative[first_index] + 606 previous_suffixes.count(); 607 return true; 608 } 609 610 // Look for full state names by their first letter. Discard by length. 611 for (int state = state_names_accumulative[first_index]; 612 state < state_names_accumulative[first_index + 1]; ++state) { 613 if (state_names[state].first_word_length != length) 614 continue; 615 616 bool state_match = false; 617 size_t state_word = state_first_word; 618 for (int pos = 0; true; ) { 619 if (!WordLowerCaseEqualsASCII(words->at(state_word).begin, 620 words->at(state_word).end, &state_names[state].string[pos])) 621 break; 622 623 pos += words->at(state_word).end - words->at(state_word).begin + 1; 624 if (pos >= state_names[state].length) { 625 state_match = true; 626 break; 627 } 628 629 // Ran out of words, extract more from the tokenizer. 630 if (++state_word == words->size()) { 631 do { 632 if (!tokenizer->GetNext()) 633 break; 634 } while (tokenizer->token_is_delim()); 635 words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end())); 636 } 637 } 638 639 if (state_match) { 640 *state_last_word = state_word; 641 *state_index = state_names[state].state_index; 642 return true; 643 } 644 } 645 646 return false; 647 } 648 649 bool AddressDetector::IsZipValid(const Word& word, size_t state_index) { 650 size_t length = word.end - word.begin; 651 if (length != kZipDigits && length != kZipPlus4Digits + 1) 652 return false; 653 654 for (string16::const_iterator it = word.begin; it != word.end; ++it) { 655 size_t pos = it - word.begin; 656 if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits)) 657 continue; 658 return false; 659 } 660 return IsZipValidForState(word, state_index); 661 } 662 663 bool AddressDetector::IsZipValidForState(const Word& word, size_t state_index) 664 { 665 enum USState { 666 AP = -4, // AP (military base in the Pacific) 667 AA = -3, // AA (military base inside the US) 668 AE = -2, // AE (military base outside the US) 669 XX = -1, // (not in use) 670 AK = 0, // AK Alaska 671 AL = 1, // AL Alabama 672 AR = 2, // AR Arkansas 673 AS = 3, // AS American Samoa 674 AZ = 4, // AZ Arizona 675 CA = 5, // CA California 676 CO = 6, // CO Colorado 677 CT = 7, // CT Connecticut 678 DC = 8, // DC District of Columbia 679 DE = 9, // DE Delaware 680 FL = 10, // FL Florida 681 FM = 11, // FM Federated States of Micronesia 682 GA = 12, // GA Georgia 683 GU = 13, // GU Guam 684 HI = 14, // HI Hawaii 685 IA = 15, // IA Iowa 686 ID = 16, // ID Idaho 687 IL = 17, // IL Illinois 688 IN = 18, // IN Indiana 689 KS = 19, // KS Kansas 690 KY = 20, // KY Kentucky 691 LA = 21, // LA Louisiana 692 MA = 22, // MA Massachusetts 693 MD = 23, // MD Maryland 694 ME = 24, // ME Maine 695 MH = 25, // MH Marshall Islands 696 MI = 26, // MI Michigan 697 MN = 27, // MN Minnesota 698 MO = 28, // MO Missouri 699 MP = 29, // MP Northern Mariana Islands 700 MS = 30, // MS Mississippi 701 MT = 31, // MT Montana 702 NC = 32, // NC North Carolina 703 ND = 33, // ND North Dakota 704 NE = 34, // NE Nebraska 705 NH = 35, // NH New Hampshire 706 NJ = 36, // NJ New Jersey 707 NM = 37, // NM New Mexico 708 NV = 38, // NV Nevada 709 NY = 39, // NY New York 710 OH = 40, // OH Ohio 711 OK = 41, // OK Oklahoma 712 OR = 42, // OR Oregon 713 PA = 43, // PA Pennsylvania 714 PR = 44, // PR Puerto Rico 715 PW = 45, // PW Palau 716 RI = 46, // RI Rhode Island 717 SC = 47, // SC South Carolina 718 SD = 48, // SD South Dakota 719 TN = 49, // TN Tennessee 720 TX = 50, // TX Texas 721 UT = 51, // UT Utah 722 VA = 52, // VA Virginia 723 VI = 53, // VI Virgin Islands 724 VT = 54, // VT Vermont 725 WA = 55, // WA Washington 726 WI = 56, // WI Wisconsin 727 WV = 57, // WV West Virginia 728 WY = 58, // WY Wyoming 729 }; 730 731 static const USState stateForZipPrefix[] = { 732 // 0 1 2 3 4 5 6 7 8 9 733 XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009 734 MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019 735 MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029 736 NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039 737 ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049 738 VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059 739 CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069 740 NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079 741 NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089 742 AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099 743 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109 744 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119 745 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129 746 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139 747 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149 748 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159 749 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169 750 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179 751 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189 752 PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199 753 DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209 754 MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219 755 VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229 756 VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239 757 VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249 758 WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259 759 WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269 760 NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279 761 NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289 762 SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299 763 GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309 764 GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319 765 FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329 766 FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339 767 AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349 768 AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359 769 AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369 770 TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379 771 TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389 772 MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399 773 KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409 774 KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419 775 KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429 776 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439 777 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449 778 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459 779 IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469 780 IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479 781 MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489 782 MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499 783 IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509 784 IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519 785 IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529 786 WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539 787 WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549 788 MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559 789 MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569 790 SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579 791 ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589 792 MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599 793 IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609 794 IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619 795 IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629 796 MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639 797 MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649 798 MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659 799 KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669 800 KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679 801 NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689 802 NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699 803 LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709 804 LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719 805 AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729 806 OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739 807 OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749 808 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759 809 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769 810 TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779 811 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789 812 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799 813 CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809 814 CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819 815 WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829 816 WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839 817 UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849 818 AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859 819 AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869 820 NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879 821 NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889 822 NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899 823 CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909 824 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919 825 CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929 826 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939 827 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949 828 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959 829 CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969 830 OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979 831 WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989 832 WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999 833 }; 834 835 if (!word.begin || !word.end || (word.end - word.begin) < 3) 836 return false; 837 const char16* zipPtr = word.begin; 838 if (zipPtr[0] < '0' || zipPtr[0] > '9' || 839 zipPtr[1] < '0' || zipPtr[1] > '9' || 840 zipPtr[2] < '0' || zipPtr[2] > '9') 841 return false; 842 843 int zip = zipPtr[0] - '0'; 844 zip *= 10; 845 zip += zipPtr[1] - '0'; 846 zip *= 10; 847 zip += zipPtr[2] - '0'; 848 return stateForZipPrefix[zip] == (int) state_index; 849 } 850 851 static const char* s_rawStreetSuffixes[] = { 852 "allee", "alley", "ally", "aly", 853 "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu", 854 "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend", 855 "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd", 856 "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch", 857 "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks", 858 "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu", 859 "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent", 860 "center", "centers", "centr", "centre", "cir", "circ", "circl", 861 "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff", 862 "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common", 863 "cor", "corner", "corners", "cors", "course", "court", "courts", "cove", 864 "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres", 865 "crescent", "cresent", "crest", "crk", "crossing", "crossroad", 866 "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt", 867 "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn", 868 "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive", 869 "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests", 870 "exp", "expr", "express", "expressway", "expw", "expy", "ext", 871 "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls", 872 "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls", 873 "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge", 874 "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy", 875 "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy", 876 "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy", 877 "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden", 878 "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove", 879 "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors", 880 "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts", 881 "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw", 882 "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts", 883 "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands", 884 "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn", 885 "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key", 886 "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la", 887 "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks", 888 "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks", 889 "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop", 890 "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows", 891 "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls", 892 "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount", 893 "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn", 894 "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd", 895 "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways", 896 "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine", 897 "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain", 898 "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne", 899 "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie", 900 "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial", 901 "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd", 902 "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river", 903 "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds", 904 "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals", 905 "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway", 906 "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng", 907 "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ", 908 "square", "squares", "st", "sta", "station", "statn", "stn", "str", 909 "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream", 910 "street", "streets", "streme", "strm", "strt", "strvn", "strvnue", 911 "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace", 912 "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks", 913 "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks", 914 "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls", 915 "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass", 916 "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct", 917 "via", "viadct", "viaduct", "view", "views", "vill", "villag", 918 "village", "villages", "ville", "villg", "villiage", "vis", "vist", 919 "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta", 920 "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells", 921 "wl", "wls", "wy", "xing", "xrd", 922 0, 923 }; 924 925 bool AddressDetector::IsValidLocationName(const Word& word) { 926 using namespace WTF; 927 static HashSet<String> streetNames; 928 if (!streetNames.size()) { 929 const char** suffixes = s_rawStreetSuffixes; 930 while (const char* suffix = *suffixes) { 931 int index = suffix[0] - 'a'; 932 streetNames.add(suffix); 933 suffixes++; 934 } 935 } 936 char16 first_letter = base::ToLowerASCII(*word.begin); 937 if (first_letter > 'z' || first_letter < 'a') 938 return false; 939 int index = first_letter - 'a'; 940 int length = std::distance(word.begin, word.end); 941 if (*word.end == '.') 942 length--; 943 String value(word.begin, length); 944 return streetNames.contains(value.lower()); 945 } 946