Home | History | Annotate | Download | only in content
      1 /*
      2  * Copyright (C) 2012 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *    * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *    * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *    * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 
     33 // Magic pretend-to-be-a-chromium-build flags
     34 #undef WEBKIT_IMPLEMENTATION
     35 #undef LOG
     36 
     37 #include "content/address_detector.h"
     38 
     39 #include <bitset>
     40 
     41 #include "base/utf_string_conversions.h"
     42 #include "net/base/escape.h"
     43 #include "Settings.h"
     44 #include "WebString.h"
     45 
     46 #include <wtf/HashSet.h>
     47 #include <wtf/Noncopyable.h>
     48 #include <wtf/text/StringHash.h>
     49 #include <wtf/text/WTFString.h>
     50 
     51 namespace {
     52 
     53 // Prefix used for geographical address intent URIs.
     54 static const char kAddressSchemaPrefix[] = "geo:0,0?q=";
     55 
     56 // Maximum text length to be searched for address detection.
     57 static const size_t kMaxAddressLength = 500;
     58 
     59 // Minimum number of words in an address after the house number
     60 // before a state is expected to be found.
     61 // A value too high can miss short addresses.
     62 const size_t kMinAddressWords = 3;
     63 
     64 // Maximum number of words allowed in an address between the house number
     65 // and the state, both not included.
     66 const size_t kMaxAddressWords = 12;
     67 
     68 // Maximum number of lines allowed in an address between the house number
     69 // and the state, both not included.
     70 const size_t kMaxAddressLines = 5;
     71 
     72 // Maximum length allowed for any address word between the house number
     73 // and the state, both not included.
     74 const size_t kMaxAddressNameWordLength = 25;
     75 
     76 // Maximum number of words after the house number in which the location name
     77 // should be found.
     78 const size_t kMaxLocationNameDistance = 4;
     79 
     80 // Number of digits for a valid zip code.
     81 const size_t kZipDigits = 5;
     82 
     83 // Number of digits for a valid zip code in the Zip Plus 4 format.
     84 const size_t kZipPlus4Digits = 9;
     85 
     86 // Maximum number of digits of a house number, including possible hyphens.
     87 const size_t kMaxHouseDigits = 5;
     88 
     89 // Additional characters used as new line delimiters.
     90 const char16 kNewlineDelimiters[] = {
     91   ',',
     92   '*',
     93   0x2022,  // Unicode bullet
     94   0,
     95 };
     96 
     97 char16 SafePreviousChar(const string16::const_iterator& it,
     98     const string16::const_iterator& begin) {
     99   if (it == begin)
    100     return ' ';
    101   return *(it - 1);
    102 }
    103 
    104 char16 SafeNextChar(const string16::const_iterator& it,
    105     const string16::const_iterator& end) {
    106   if (it == end)
    107     return ' ';
    108   return *(it + 1);
    109 }
    110 
    111 bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin,
    112     string16::const_iterator word_end, const char* ascii_to_match) {
    113   for (string16::const_iterator it = word_begin; it != word_end;
    114       ++it, ++ascii_to_match) {
    115     if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
    116       return false;
    117   }
    118   return *ascii_to_match == 0 || *ascii_to_match == ' ';
    119 }
    120 
    121 bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,
    122     string16::const_iterator word_end, const char* ascii_to_match,
    123     bool allow_plural) {
    124   for (string16::const_iterator it = word_begin; it != word_end;
    125       ++it, ++ascii_to_match) {
    126     if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end)
    127       return true;
    128 
    129     if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
    130       return false;
    131   }
    132   return *ascii_to_match == 0;
    133 }
    134 
    135 }  // anonymous namespace
    136 
    137 
    138 AddressDetector::AddressDetector() {
    139 }
    140 
    141 AddressDetector::~AddressDetector() {
    142 }
    143 
    144 std::string AddressDetector::GetContentText(const WebKit::WebRange& range) {
    145   // Get the address and replace unicode bullets with commas.
    146   string16 address_16 = CollapseWhitespace(range.toPlainText(), false);
    147   std::replace(address_16.begin(), address_16.end(),
    148       static_cast<char16>(0x2022), static_cast<char16>(','));
    149   return UTF16ToUTF8(address_16);
    150 }
    151 
    152 GURL AddressDetector::GetIntentURL(const std::string& content_text) {
    153   return GURL(kAddressSchemaPrefix +
    154       EscapeQueryParamValue(content_text, true));
    155 }
    156 
    157 size_t AddressDetector::GetMaximumContentLength() {
    158   return kMaxAddressLength;
    159 }
    160 
    161 bool AddressDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test) {
    162   WebCore::Settings* settings = GetSettings(hit_test);
    163   return settings && settings->formatDetectionAddress();
    164 }
    165 
    166 bool AddressDetector::FindContent(const string16::const_iterator& begin,
    167     const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) {
    168   HouseNumberParser house_number_parser;
    169 
    170   // Keep going through the input string until a potential house number is
    171   // detected. Start tokenizing the following words to find a valid
    172   // street name within a word range. Then, find a state name followed
    173   // by a valid zip code for that state. Also keep a look for any other
    174   // possible house numbers to continue from in case of no match and for
    175   // state names not followed by a zip code (e.g. New York, NY 10000).
    176   const string16 newline_delimiters = kNewlineDelimiters;
    177   const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
    178   for (string16::const_iterator it = begin; it != end; ) {
    179     Word house_number;
    180     if (!house_number_parser.Parse(it, end, &house_number))
    181       return false;
    182 
    183     String16Tokenizer tokenizer(house_number.end, end, delimiters);
    184     tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
    185 
    186     std::vector<Word> words;
    187     words.push_back(house_number);
    188 
    189     bool found_location_name = false;
    190     bool continue_on_house_number = true;
    191     size_t next_house_number_word = 0;
    192     size_t num_lines = 1;
    193 
    194     // Don't include the house number in the word count.
    195     size_t next_word = 1;
    196     for (; next_word <= kMaxAddressWords + 1; ++next_word) {
    197 
    198       // Extract a new word from the tokenizer.
    199       if (next_word == words.size()) {
    200         do {
    201           if (!tokenizer.GetNext())
    202             return false;
    203 
    204           // Check the number of address lines.
    205           if (tokenizer.token_is_delim() && newline_delimiters.find(
    206               *tokenizer.token_begin()) != string16::npos) {
    207             ++num_lines;
    208           }
    209         } while (tokenizer.token_is_delim());
    210 
    211         if (num_lines > kMaxAddressLines)
    212           break;
    213 
    214         words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
    215       }
    216 
    217       // Check the word length. If too long, don't try to continue from
    218       // the next house number as no address can hold this word.
    219       const Word& current_word = words[next_word];
    220       DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
    221       size_t current_word_length = std::distance(
    222           current_word.begin, current_word.end);
    223       if (current_word_length > kMaxAddressNameWordLength) {
    224         continue_on_house_number = false;
    225         break;
    226       }
    227 
    228       // Check if the new word is a valid house number.
    229       // This is used to properly resume parsing in case the maximum number
    230       // of words is exceeded.
    231       if (next_house_number_word == 0 &&
    232           house_number_parser.Parse(current_word.begin, current_word.end, NULL)) {
    233         next_house_number_word = next_word;
    234         continue;
    235       }
    236 
    237       // Look for location names in the words after the house number.
    238       // A range limitation is introduced to avoid matching
    239       // anything that starts with a number before a legitimate address.
    240       if (next_word <= kMaxLocationNameDistance &&
    241           IsValidLocationName(current_word)) {
    242         found_location_name = true;
    243         continue;
    244       }
    245 
    246       // Don't count the house number.
    247       if (next_word > kMinAddressWords) {
    248         // Looking for the state is likely to add new words to the list while
    249         // checking for multi-word state names.
    250         size_t state_first_word = next_word;
    251         size_t state_last_word, state_index;
    252         if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
    253             &tokenizer, &state_index)) {
    254 
    255           // A location name should have been found at this point.
    256           if (!found_location_name)
    257             break;
    258 
    259           // Explicitly exclude "et al", as "al" is a valid state code.
    260           if (current_word_length == 2 && words.size() > 2) {
    261             const Word& previous_word = words[state_first_word - 1];
    262             if (previous_word.end - previous_word.begin == 2 &&
    263                 LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
    264                     "et") &&
    265                 LowerCaseEqualsASCII(current_word.begin, current_word.end,
    266                     "al"))
    267               break;
    268           }
    269 
    270           // Extract one more word from the tokenizer if not already available.
    271           size_t zip_word = state_last_word + 1;
    272           if (zip_word == words.size()) {
    273             do {
    274               if (!tokenizer.GetNext()) {
    275                 // Zip is optional
    276                 *start_pos = words[0].begin - begin;
    277                 *end_pos = words[state_last_word].end - begin;
    278                 return true;
    279               }
    280             } while (tokenizer.token_is_delim());
    281             words.push_back(Word(tokenizer.token_begin(),
    282                 tokenizer.token_end()));
    283           }
    284 
    285           // Check the parsing validity and state range of the zip code.
    286           next_word = state_last_word;
    287           if (!IsZipValid(words[zip_word], state_index))
    288             continue;
    289 
    290           *start_pos = words[0].begin - begin;
    291           *end_pos = words[zip_word].end - begin;
    292           return true;
    293         }
    294       }
    295     }
    296 
    297     // Avoid skipping too many words because of a non-address number
    298     // at the beginning of the contents to parse.
    299     if (continue_on_house_number && next_house_number_word > 0) {
    300       it = words[next_house_number_word].begin;
    301     } else {
    302       DCHECK(!words.empty());
    303       next_word = std::min(next_word, words.size() - 1);
    304       it = words[next_word].end;
    305     }
    306   }
    307 
    308   return false;
    309 }
    310 
    311 bool AddressDetector::HouseNumberParser::IsPreDelimiter(
    312     char16 character) {
    313   return character == ':' || IsPostDelimiter(character);
    314 }
    315 
    316 bool AddressDetector::HouseNumberParser::IsPostDelimiter(
    317     char16 character) {
    318   return IsWhitespace(character) || strchr(",\"'", character);
    319 }
    320 
    321 void AddressDetector::HouseNumberParser::RestartOnNextDelimiter() {
    322   ResetState();
    323   for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {}
    324 }
    325 
    326 void AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) {
    327   size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)),
    328       num_chars);
    329   it_ += offset;
    330   result_chars_ += offset;
    331 }
    332 
    333 void AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) {
    334   it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars);
    335 }
    336 
    337 void AddressDetector::HouseNumberParser::ResetState() {
    338   num_digits_ = 0;
    339   result_chars_ = 0;
    340 }
    341 
    342 bool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const {
    343   // There should always be a number after a hyphen.
    344   if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-')
    345     return false;
    346 
    347   if (word) {
    348     word->begin = it_ - result_chars_;
    349     word->end = it_;
    350   }
    351   return true;
    352 }
    353 
    354 bool AddressDetector::HouseNumberParser::Parse(
    355     const string16::const_iterator& begin,
    356     const string16::const_iterator& end, Word* word) {
    357   it_ = begin_ = begin;
    358   end_ = end;
    359   ResetState();
    360 
    361   // Iterations only used as a fail-safe against any buggy infinite loops.
    362   size_t iterations = 0;
    363   size_t max_iterations = end - begin + 1;
    364   for (; it_ != end_ && iterations < max_iterations; ++iterations) {
    365 
    366     // Word finished case.
    367     if (IsPostDelimiter(*it_)) {
    368       if (CheckFinished(word))
    369         return true;
    370       else if (result_chars_)
    371         ResetState();
    372 
    373       SkipChars(1);
    374       continue;
    375     }
    376 
    377     // More digits. There should be no more after a letter was found.
    378     if (IsAsciiDigit(*it_)) {
    379       if (num_digits_ >= kMaxHouseDigits) {
    380         RestartOnNextDelimiter();
    381       } else {
    382         AcceptChars(1);
    383         ++num_digits_;
    384       }
    385       continue;
    386     }
    387 
    388     if (IsAsciiAlpha(*it_)) {
    389       // Handle special case 'one'.
    390       if (result_chars_ == 0) {
    391         if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one"))
    392           AcceptChars(3);
    393         else
    394           RestartOnNextDelimiter();
    395         continue;
    396       }
    397 
    398       // There should be more than 1 character because of result_chars.
    399       DCHECK_GT(result_chars_, 0U);
    400       DCHECK_NE(it_, begin_);
    401       char16 previous = SafePreviousChar(it_, begin_);
    402       if (IsAsciiDigit(previous)) {
    403         // Check cases like '12A'.
    404         char16 next = SafeNextChar(it_, end_);
    405         if (IsPostDelimiter(next)) {
    406           AcceptChars(1);
    407           continue;
    408         }
    409 
    410         // Handle cases like 12a, 1st, 2nd, 3rd, 7th.
    411         if (IsAsciiAlpha(next)) {
    412           char16 last_digit = previous;
    413           char16 first_letter = base::ToLowerASCII(*it_);
    414           char16 second_letter = base::ToLowerASCII(next);
    415           bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' &&
    416               num_digits_ == 2;
    417 
    418           switch (last_digit - '0') {
    419           case 1:
    420             if ((first_letter == 's' && second_letter == 't') ||
    421                 (first_letter == 't' && second_letter == 'h' && is_teen)) {
    422               AcceptChars(2);
    423               continue;
    424             }
    425             break;
    426 
    427           case 2:
    428             if ((first_letter == 'n' && second_letter == 'd') ||
    429                 (first_letter == 't' && second_letter == 'h' && is_teen)) {
    430               AcceptChars(2);
    431               continue;
    432             }
    433             break;
    434 
    435           case 3:
    436             if ((first_letter == 'r' && second_letter == 'd') ||
    437                 (first_letter == 't' && second_letter == 'h' && is_teen)) {
    438               AcceptChars(2);
    439               continue;
    440             }
    441             break;
    442 
    443           case 0:
    444             // Explicitly exclude '0th'.
    445             if (num_digits_ == 1)
    446               break;
    447 
    448           case 4:
    449           case 5:
    450           case 6:
    451           case 7:
    452           case 8:
    453           case 9:
    454             if (first_letter == 't' && second_letter == 'h') {
    455               AcceptChars(2);
    456               continue;
    457             }
    458             break;
    459 
    460           default:
    461             NOTREACHED();
    462           }
    463         }
    464       }
    465 
    466       RestartOnNextDelimiter();
    467       continue;
    468     }
    469 
    470     if (*it_ == '-' && num_digits_ > 0) {
    471       AcceptChars(1);
    472       ++num_digits_;
    473       continue;
    474     }
    475 
    476     RestartOnNextDelimiter();
    477     SkipChars(1);
    478   }
    479 
    480   if (iterations >= max_iterations)
    481     return false;
    482 
    483   return CheckFinished(word);
    484 }
    485 
    486 bool AddressDetector::FindStateStartingInWord(WordList* words,
    487     size_t state_first_word, size_t* state_last_word,
    488     String16Tokenizer* tokenizer, size_t* state_index) {
    489 
    490   // Bitmasks containing the allowed suffixes for 2-letter state codes.
    491   static const int state_two_letter_suffix[23] = {
    492     0x02060c00,  // A followed by: [KLRSZ].
    493     0x00000000,  // B.
    494     0x00084001,  // C followed by: [AOT].
    495     0x00000014,  // D followed by: [CE].
    496     0x00000000,  // E.
    497     0x00001800,  // F followed by: [LM].
    498     0x00100001,  // G followed by: [AU].
    499     0x00000100,  // H followed by: [I].
    500     0x00002809,  // I followed by: [ADLN].
    501     0x00000000,  // J.
    502     0x01040000,  // K followed by: [SY].
    503     0x00000001,  // L followed by: [A].
    504     0x000ce199,  // M followed by: [ADEHINOPST].
    505     0x0120129c,  // N followed by: [CDEHJMVY].
    506     0x00020480,  // O followed by: [HKR].
    507     0x00420001,  // P followed by: [ARW].
    508     0x00000000,  // Q.
    509     0x00000100,  // R followed by: [I].
    510     0x0000000c,  // S followed by: [CD].
    511     0x00802000,  // T followed by: [NX].
    512     0x00080000,  // U followed by: [T].
    513     0x00080101,  // V followed by: [AIT].
    514     0x01200101   // W followed by: [AIVY].
    515   };
    516 
    517   // Accumulative number of states for the 2-letter code indexed by the first.
    518   static const int state_two_letter_accumulative[24] = {
    519      0,  5,  5,  8, 10, 10, 12, 14,
    520     15, 19, 19, 21, 22, 32, 40, 43,
    521     46, 46, 47, 49, 51, 52, 55, 59
    522   };
    523 
    524   // State names sorted alphabetically with their lengths.
    525   // There can be more than one possible name for a same state if desired.
    526   static const struct StateNameInfo {
    527     const char* string;
    528     char first_word_length;
    529     char length;
    530     char state_index; // Relative to two-character code alphabetical order.
    531   } state_names[59] = {
    532     { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 },
    533     { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 },
    534     { "arkansas", 8, 8, 2 },
    535     { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 },
    536     { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 },
    537     { "district of columbia", 8, 20, 8 },
    538     { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 },
    539     { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 },
    540     { "hawaii", 6, 6, 14 },
    541     { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 },
    542     { "iowa", 4, 4, 15 },
    543     { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 },
    544     { "louisiana", 9, 9, 21 },
    545     { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 },
    546     { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 },
    547     { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 },
    548     { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 },
    549     { "montana", 7, 7, 31 },
    550     { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 },
    551     { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 },
    552     { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 },
    553     { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 },
    554     { "northern mariana islands", 8, 24, 29 },
    555     { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 },
    556     { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 },
    557     { "puerto rico", 6, 11, 44 },
    558     { "rhode island", 5, 5, 46 },
    559     { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 },
    560     { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 },
    561     { "utah", 4, 4, 51 },
    562     { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 },
    563     { "virginia", 8, 8, 52 },
    564     { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 },
    565     { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 }
    566   };
    567 
    568   // Accumulative number of states for sorted names indexed by the first letter.
    569   // Required a different one since there are codes that don't share their
    570   // first letter with the name of their state (MP = Northern Mariana Islands).
    571   static const int state_names_accumulative[24] = {
    572      0,  5,  5,  8, 10, 10, 12, 14,
    573     15, 19, 19, 21, 22, 31, 40, 43,
    574     46, 46, 47, 49, 51, 52, 55, 59
    575   };
    576 
    577   DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1],
    578       static_cast<int>(ARRAYSIZE_UNSAFE(state_names)));
    579 
    580   const Word& first_word = words->at(state_first_word);
    581   int length = first_word.end - first_word.begin;
    582   if (length < 2 || !IsAsciiAlpha(*first_word.begin))
    583     return false;
    584 
    585   // No state names start with x, y, z.
    586   char16 first_letter = base::ToLowerASCII(*first_word.begin);
    587   if (first_letter > 'w')
    588     return false;
    589 
    590   DCHECK(first_letter >= 'a');
    591   int first_index = first_letter - 'a';
    592 
    593   // Look for two-letter state names.
    594   if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) {
    595     char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1));
    596     DCHECK(second_letter >= 'a');
    597 
    598     int second_index = second_letter - 'a';
    599     if (!(state_two_letter_suffix[first_index] & (1 << second_index)))
    600       return false;
    601 
    602     std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] &
    603         ((1 << second_index) - 1);
    604     *state_last_word = state_first_word;
    605     *state_index = state_two_letter_accumulative[first_index] +
    606         previous_suffixes.count();
    607     return true;
    608   }
    609 
    610   // Look for full state names by their first letter. Discard by length.
    611   for (int state = state_names_accumulative[first_index];
    612       state < state_names_accumulative[first_index + 1]; ++state) {
    613     if (state_names[state].first_word_length != length)
    614       continue;
    615 
    616     bool state_match = false;
    617     size_t state_word = state_first_word;
    618     for (int pos = 0; true; ) {
    619       if (!WordLowerCaseEqualsASCII(words->at(state_word).begin,
    620           words->at(state_word).end, &state_names[state].string[pos]))
    621         break;
    622 
    623       pos += words->at(state_word).end - words->at(state_word).begin + 1;
    624       if (pos >= state_names[state].length) {
    625         state_match = true;
    626         break;
    627       }
    628 
    629       // Ran out of words, extract more from the tokenizer.
    630       if (++state_word == words->size()) {
    631         do {
    632           if (!tokenizer->GetNext())
    633             break;
    634         } while (tokenizer->token_is_delim());
    635         words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end()));
    636       }
    637     }
    638 
    639     if (state_match) {
    640       *state_last_word = state_word;
    641       *state_index = state_names[state].state_index;
    642       return true;
    643     }
    644   }
    645 
    646   return false;
    647 }
    648 
    649 bool AddressDetector::IsZipValid(const Word& word, size_t state_index) {
    650   size_t length = word.end - word.begin;
    651   if (length != kZipDigits && length != kZipPlus4Digits + 1)
    652     return false;
    653 
    654   for (string16::const_iterator it = word.begin; it != word.end; ++it) {
    655     size_t pos = it - word.begin;
    656     if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits))
    657       continue;
    658     return false;
    659   }
    660   return IsZipValidForState(word, state_index);
    661 }
    662 
    663 bool AddressDetector::IsZipValidForState(const Word& word, size_t state_index)
    664 {
    665     enum USState {
    666         AP = -4, // AP (military base in the Pacific)
    667         AA = -3, // AA (military base inside the US)
    668         AE = -2, // AE (military base outside the US)
    669         XX = -1, // (not in use)
    670         AK =  0, // AK Alaska
    671         AL =  1, // AL Alabama
    672         AR =  2, // AR Arkansas
    673         AS =  3, // AS American Samoa
    674         AZ =  4, // AZ Arizona
    675         CA =  5, // CA California
    676         CO =  6, // CO Colorado
    677         CT =  7, // CT Connecticut
    678         DC =  8, // DC District of Columbia
    679         DE =  9, // DE Delaware
    680         FL = 10, // FL Florida
    681         FM = 11, // FM Federated States of Micronesia
    682         GA = 12, // GA Georgia
    683         GU = 13, // GU Guam
    684         HI = 14, // HI Hawaii
    685         IA = 15, // IA Iowa
    686         ID = 16, // ID Idaho
    687         IL = 17, // IL Illinois
    688         IN = 18, // IN Indiana
    689         KS = 19, // KS Kansas
    690         KY = 20, // KY Kentucky
    691         LA = 21, // LA Louisiana
    692         MA = 22, // MA Massachusetts
    693         MD = 23, // MD Maryland
    694         ME = 24, // ME Maine
    695         MH = 25, // MH Marshall Islands
    696         MI = 26, // MI Michigan
    697         MN = 27, // MN Minnesota
    698         MO = 28, // MO Missouri
    699         MP = 29, // MP Northern Mariana Islands
    700         MS = 30, // MS Mississippi
    701         MT = 31, // MT Montana
    702         NC = 32, // NC North Carolina
    703         ND = 33, // ND North Dakota
    704         NE = 34, // NE Nebraska
    705         NH = 35, // NH New Hampshire
    706         NJ = 36, // NJ New Jersey
    707         NM = 37, // NM New Mexico
    708         NV = 38, // NV Nevada
    709         NY = 39, // NY New York
    710         OH = 40, // OH Ohio
    711         OK = 41, // OK Oklahoma
    712         OR = 42, // OR Oregon
    713         PA = 43, // PA Pennsylvania
    714         PR = 44, // PR Puerto Rico
    715         PW = 45, // PW Palau
    716         RI = 46, // RI Rhode Island
    717         SC = 47, // SC South Carolina
    718         SD = 48, // SD South Dakota
    719         TN = 49, // TN Tennessee
    720         TX = 50, // TX Texas
    721         UT = 51, // UT Utah
    722         VA = 52, // VA Virginia
    723         VI = 53, // VI Virgin Islands
    724         VT = 54, // VT Vermont
    725         WA = 55, // WA Washington
    726         WI = 56, // WI Wisconsin
    727         WV = 57, // WV West Virginia
    728         WY = 58, // WY Wyoming
    729     };
    730 
    731     static const USState stateForZipPrefix[] = {
    732     //   0   1   2   3   4   5   6   7   8   9
    733         XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009
    734         MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019
    735         MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029
    736         NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039
    737         ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049
    738         VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059
    739         CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069
    740         NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079
    741         NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089
    742         AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099
    743         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109
    744         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119
    745         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129
    746         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139
    747         NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149
    748         PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159
    749         PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169
    750         PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179
    751         PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189
    752         PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199
    753         DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209
    754         MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219
    755         VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229
    756         VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239
    757         VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249
    758         WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259
    759         WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269
    760         NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279
    761         NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289
    762         SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299
    763         GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309
    764         GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319
    765         FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329
    766         FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339
    767         AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349
    768         AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359
    769         AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369
    770         TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379
    771         TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389
    772         MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399
    773         KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409
    774         KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419
    775         KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429
    776         OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439
    777         OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449
    778         OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459
    779         IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469
    780         IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479
    781         MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489
    782         MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499
    783         IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509
    784         IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519
    785         IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529
    786         WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539
    787         WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549
    788         MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559
    789         MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569
    790         SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579
    791         ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589
    792         MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599
    793         IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609
    794         IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619
    795         IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629
    796         MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639
    797         MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649
    798         MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659
    799         KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669
    800         KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679
    801         NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689
    802         NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699
    803         LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709
    804         LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719
    805         AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729
    806         OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739
    807         OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749
    808         TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759
    809         TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769
    810         TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779
    811         TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789
    812         TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799
    813         CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809
    814         CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819
    815         WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829
    816         WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839
    817         UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849
    818         AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859
    819         AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869
    820         NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879
    821         NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889
    822         NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899
    823         CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909
    824         CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919
    825         CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929
    826         CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939
    827         CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949
    828         CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959
    829         CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969
    830         OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979
    831         WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989
    832         WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999
    833     };
    834 
    835     if (!word.begin || !word.end || (word.end - word.begin) < 3)
    836         return false;
    837     const char16* zipPtr = word.begin;
    838     if (zipPtr[0] < '0' || zipPtr[0] > '9' ||
    839         zipPtr[1] < '0' || zipPtr[1] > '9' ||
    840         zipPtr[2] < '0' || zipPtr[2] > '9')
    841         return false;
    842 
    843     int zip = zipPtr[0] - '0';
    844     zip *= 10;
    845     zip += zipPtr[1] - '0';
    846     zip *= 10;
    847     zip += zipPtr[2] - '0';
    848     return stateForZipPrefix[zip] == (int) state_index;
    849 }
    850 
    851 static const char* s_rawStreetSuffixes[] = {
    852     "allee", "alley", "ally", "aly",
    853     "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu",
    854     "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend",
    855     "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd",
    856     "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch",
    857     "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks",
    858     "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu",
    859     "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent",
    860     "center", "centers", "centr", "centre", "cir", "circ", "circl",
    861     "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff",
    862     "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common",
    863     "cor", "corner", "corners", "cors", "course", "court", "courts", "cove",
    864     "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres",
    865     "crescent", "cresent", "crest", "crk", "crossing", "crossroad",
    866     "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt",
    867     "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn",
    868     "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive",
    869     "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests",
    870     "exp", "expr", "express", "expressway", "expw", "expy", "ext",
    871     "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls",
    872     "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls",
    873     "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge",
    874     "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy",
    875     "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy",
    876     "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy",
    877     "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden",
    878     "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove",
    879     "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors",
    880     "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts",
    881     "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw",
    882     "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts",
    883     "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands",
    884     "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn",
    885     "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key",
    886     "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la",
    887     "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks",
    888     "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks",
    889     "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop",
    890     "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows",
    891     "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls",
    892     "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount",
    893     "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn",
    894     "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd",
    895     "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways",
    896     "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine",
    897     "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain",
    898     "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne",
    899     "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie",
    900     "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial",
    901     "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd",
    902     "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river",
    903     "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds",
    904     "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals",
    905     "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway",
    906     "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng",
    907     "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ",
    908     "square", "squares", "st", "sta", "station", "statn", "stn", "str",
    909     "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream",
    910     "street", "streets", "streme", "strm", "strt", "strvn", "strvnue",
    911     "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace",
    912     "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks",
    913     "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks",
    914     "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls",
    915     "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass",
    916     "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct",
    917     "via", "viadct", "viaduct", "view", "views", "vill", "villag",
    918     "village", "villages", "ville", "villg", "villiage", "vis", "vist",
    919     "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta",
    920     "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells",
    921     "wl", "wls", "wy", "xing", "xrd",
    922     0,
    923 };
    924 
    925 bool AddressDetector::IsValidLocationName(const Word& word) {
    926     using namespace WTF;
    927     static HashSet<String> streetNames;
    928     if (!streetNames.size()) {
    929         const char** suffixes = s_rawStreetSuffixes;
    930         while (const char* suffix = *suffixes) {
    931             int index = suffix[0] - 'a';
    932             streetNames.add(suffix);
    933             suffixes++;
    934         }
    935     }
    936     char16 first_letter = base::ToLowerASCII(*word.begin);
    937     if (first_letter > 'z' || first_letter < 'a')
    938         return false;
    939     int index = first_letter - 'a';
    940     int length = std::distance(word.begin, word.end);
    941     if (*word.end == '.')
    942         length--;
    943     String value(word.begin, length);
    944     return streetNames.contains(value.lower());
    945 }
    946