Home | History | Annotate | Download | only in android
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "content/common/android/address_parser.h"
      6 
      7 #include "base/logging.h"
      8 #include "base/strings/string_util.h"
      9 #include "content/common/android/address_parser_internal.h"
     10 
     11 namespace {
     12 
     13 // Minimum number of words in an address after the house number
     14 // before a state is expected to be found.
     15 // A value too high can miss short addresses.
     16 const size_t kMinAddressWords = 3;
     17 
     18 // Maximum number of words allowed in an address between the house number
     19 // and the state, both not included.
     20 const size_t kMaxAddressWords = 12;
     21 
     22 // Maximum number of lines allowed in an address between the house number
     23 // and the state, both not included.
     24 const size_t kMaxAddressLines = 5;
     25 
     26 // Maximum length allowed for any address word between the house number
     27 // and the state, both not included.
     28 const size_t kMaxAddressNameWordLength = 25;
     29 
     30 // Maximum number of words after the house number in which the location name
     31 // should be found.
     32 const size_t kMaxLocationNameDistance = 4;
     33 
     34 // Additional characters used as new line delimiters.
     35 const char16 kNewlineDelimiters[] = {
     36   '\n',
     37   ',',
     38   '*',
     39   0x2022,  // Unicode bullet
     40   0,
     41 };
     42 
     43 }  // anonymous namespace
     44 
     45 namespace content {
     46 
     47 namespace address_parser {
     48 
     49 using namespace internal;
     50 
     51 bool FindAddress(const string16& text, string16* address) {
     52   size_t start, end;
     53   if (FindAddress(text.begin(), text.end(), &start, &end)) {
     54     address->assign(text.substr(start, end));
     55     return true;
     56   }
     57   return false;
     58 }
     59 
     60 bool FindAddress(const string16::const_iterator& begin,
     61                  const string16::const_iterator& end,
     62                  size_t* start_pos,
     63                  size_t* end_pos) {
     64   HouseNumberParser house_number_parser;
     65 
     66   // Keep going through the input string until a potential house number is
     67   // detected. Start tokenizing the following words to find a valid
     68   // street name within a word range. Then, find a state name followed
     69   // by a valid zip code for that state. Also keep a look for any other
     70   // possible house numbers to continue from in case of no match and for
     71   // state names not followed by a zip code (e.g. New York, NY 10000).
     72   const string16 newline_delimiters = kNewlineDelimiters;
     73   const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
     74   for (string16::const_iterator it = begin; it != end; ) {
     75     Word house_number;
     76     if (!house_number_parser.Parse(it, end, &house_number))
     77       return false;
     78 
     79     String16Tokenizer tokenizer(house_number.end, end, delimiters);
     80     tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
     81 
     82     WordList words;
     83     words.push_back(house_number);
     84 
     85     bool found_location_name = false;
     86     bool continue_on_house_number = true;
     87     bool consecutive_house_numbers = true;
     88     size_t next_house_number_word = 0;
     89     size_t num_lines = 1;
     90 
     91     // Don't include the house number in the word count.
     92     size_t next_word = 1;
     93     for (; next_word <= kMaxAddressWords + 1; ++next_word) {
     94 
     95       // Extract a new word from the tokenizer.
     96       if (next_word == words.size()) {
     97         do {
     98           if (!tokenizer.GetNext())
     99             return false;
    100 
    101           // Check the number of address lines.
    102           if (tokenizer.token_is_delim() && newline_delimiters.find(
    103               *tokenizer.token_begin()) != string16::npos) {
    104             ++num_lines;
    105           }
    106         } while (tokenizer.token_is_delim());
    107 
    108         if (num_lines > kMaxAddressLines)
    109           break;
    110 
    111         words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
    112       }
    113 
    114       // Check the word length. If too long, don't try to continue from
    115       // the next house number as no address can hold this word.
    116       const Word& current_word = words[next_word];
    117       DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
    118       size_t current_word_length = std::distance(
    119           current_word.begin, current_word.end);
    120       if (current_word_length > kMaxAddressNameWordLength) {
    121         continue_on_house_number = false;
    122         break;
    123       }
    124 
    125       // Check if the new word is a valid house number.
    126       if (house_number_parser.Parse(current_word.begin, current_word.end,
    127           NULL)) {
    128         // Increase the number of consecutive house numbers since the beginning.
    129         if (consecutive_house_numbers) {
    130           // Check if there is a new line between consecutive house numbers.
    131           // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
    132           if (num_lines > 1) {
    133             next_house_number_word = next_word;
    134             break;
    135           }
    136         }
    137 
    138         // Keep the next candidate to resume parsing from in case of failure.
    139         if (next_house_number_word == 0) {
    140           next_house_number_word = next_word;
    141           continue;
    142         }
    143       } else {
    144         consecutive_house_numbers = false;
    145       }
    146 
    147       // Look for location names in the words after the house number.
    148       // A range limitation is introduced to avoid matching
    149       // anything that starts with a number before a legitimate address.
    150       if (next_word <= kMaxLocationNameDistance &&
    151           IsValidLocationName(current_word)) {
    152         found_location_name = true;
    153         continue;
    154       }
    155 
    156       // Don't count the house number.
    157       if (next_word > kMinAddressWords) {
    158         // Looking for the state is likely to add new words to the list while
    159         // checking for multi-word state names.
    160         size_t state_first_word = next_word;
    161         size_t state_last_word, state_index;
    162         if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
    163                                     &tokenizer, &state_index)) {
    164 
    165           // A location name should have been found at this point.
    166           if (!found_location_name)
    167             break;
    168 
    169           // Explicitly exclude "et al", as "al" is a valid state code.
    170           if (current_word_length == 2 && words.size() > 2) {
    171             const Word& previous_word = words[state_first_word - 1];
    172             if (previous_word.end - previous_word.begin == 2 &&
    173                 LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
    174                                      "et") &&
    175                 LowerCaseEqualsASCII(current_word.begin, current_word.end,
    176                                      "al"))
    177               break;
    178           }
    179 
    180           // Extract one more word from the tokenizer if not already available.
    181           size_t zip_word = state_last_word + 1;
    182           if (zip_word == words.size()) {
    183             do {
    184               if (!tokenizer.GetNext())
    185                 return false;
    186             } while (tokenizer.token_is_delim());
    187             words.push_back(Word(tokenizer.token_begin(),
    188                             tokenizer.token_end()));
    189           }
    190 
    191           // Check the parsing validity and state range of the zip code.
    192           next_word = state_last_word;
    193           if (!IsZipValid(words[zip_word], state_index))
    194             continue;
    195 
    196           *start_pos = words[0].begin - begin;
    197           *end_pos = words[zip_word].end - begin;
    198           return true;
    199         }
    200       }
    201     }
    202 
    203     // Avoid skipping too many words because of a non-address number
    204     // at the beginning of the contents to parse.
    205     if (continue_on_house_number && next_house_number_word > 0) {
    206       it = words[next_house_number_word].begin;
    207     } else {
    208       DCHECK(!words.empty());
    209       next_word = std::min(next_word, words.size() - 1);
    210       it = words[next_word].end;
    211     }
    212   }
    213 
    214   return false;
    215 }
    216 
    217 }  // namespace address_parser
    218 
    219 }  // namespace content
    220