Home | History | Annotate | Download | only in android
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "content/common/android/address_parser.h"
      6 
      7 #include "base/logging.h"
      8 #include "base/strings/string_util.h"
      9 #include "content/common/android/address_parser_internal.h"
     10 
     11 namespace {
     12 
        // Tuning constants for FindAddress(). Word indexes mentioned below count
        // the house number as word 0.

     13 // Minimum number of words in an address after the house number
     14 // before a state is expected to be found.
     15 // A value too high can miss short addresses.
        // FindAddress() only attempts a state lookup on word indexes strictly
        // greater than this value.
     16 const size_t kMinAddressWords = 3;
     17 
     18 // Maximum number of words allowed in an address between the house number
     19 // and the state, both not included.
        // The word scan in FindAddress() visits word indexes 1 through
        // kMaxAddressWords + 1, the last one being the latest index at which a
        // state lookup is attempted.
     20 const size_t kMaxAddressWords = 12;
     21 
     22 // Maximum number of lines allowed in an address between the house number
     23 // and the state, both not included.
        // The line counter starts at 1 and is incremented for every delimiter
        // token that starts with one of kNewlineDelimiters; the current candidate
        // is abandoned as soon as the counter exceeds this value.
     24 const size_t kMaxAddressLines = 5;
     25 
     26 // Maximum length allowed for any address word between the house number
     27 // and the state, both not included.
        // The length is the iterator distance between the word's begin and end.
        // A longer word abandons the candidate and also prevents the search from
        // resuming at a later house number candidate.
     28 const size_t kMaxAddressNameWordLength = 25;
     29 
     30 // Maximum number of words after the house number in which the location name
     31 // should be found.
        // Compared against the word index (word index <= this value).
     32 const size_t kMaxLocationNameDistance = 4;
     33 
     34 // Additional characters used as new line delimiters.
        // They are appended to base::kWhitespaceUTF16 to build the tokenizer's
        // delimiter set, and are the only delimiters that advance the line
        // counter. The array is zero-terminated because it is converted to a
        // base::string16 by FindAddress().
     35 const char16 kNewlineDelimiters[] = {
     36   '\n',
     37   ',',
     38   '*',
     39   0x2022,  // Unicode bullet
     40   0,
     41 };
     42 
     43 }  // anonymous namespace
     44 
     45 namespace content {
     46 
     47 namespace address_parser {
     48 
     49 using namespace internal;
     50 
     51 bool FindAddress(const base::string16& text, base::string16* address) {
          // Convenience overload: runs the iterator-based FindAddress() over the
          // whole of |text| and, on success, copies the matched substring into
          // |address| and returns true. Returns false and leaves |address|
          // untouched when no address is found.
          //
          // |start| and |end| are deliberately left uninitialized: they are only
          // read when the call below returns true, which is the path on which the
          // iterator-based overload writes both of them.
     52   size_t start, end;
     53   if (FindAddress(text.begin(), text.end(), &start, &end)) {
            // Defensive clamp: never pass a wrapped-around length to substr().
     54     size_t len = end >= start ? end - start : 0;
     55     address->assign(text.substr(start, len));
     56     return true;
     57   }
     58   return false;
     59 }
     60 
     61 bool FindAddress(const base::string16::const_iterator& begin,
     62                  const base::string16::const_iterator& end,
     63                  size_t* start_pos,
     64                  size_t* end_pos) {
          // Searches [begin, end) for an address. On success returns true and
          // stores, as offsets relative to |begin|, the position of the first
          // character of the house number in |*start_pos| and the position one
          // past the last character of the zip code word in |*end_pos|.
          // On failure returns false and writes to neither output.
     65   HouseNumberParser house_number_parser;
     66 
     67   // Keep going through the input string until a potential house number is
     68   // detected. Start tokenizing the following words to find a valid
     69   // street name within a word range. Then, find a state name followed
     70   // by a valid zip code for that state. Also keep a lookout for any other
     71   // possible house numbers to continue from in case of no match and for
     72   // state names not followed by a zip code (e.g. New York, NY 10000).
     73   const base::string16 newline_delimiters = kNewlineDelimiters;
     74   const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
          // Each iteration evaluates one house number candidate; |it| is moved
          // forward at the bottom of the loop to the point to resume from.
     75   for (base::string16::const_iterator it = begin; it != end; ) {
     76     Word house_number;
            // No house number in the rest of the text: nothing left to match.
     77     if (!house_number_parser.Parse(it, end, &house_number))
     78       return false;
     79 
            // Tokenize what follows the house number. Delimiters are returned as
            // tokens too, so that line-breaking delimiters can be counted below.
     80     String16Tokenizer tokenizer(house_number.end, end, delimiters);
     81     tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
     82 
            // words[0] is the house number; the following entries are the words
            // extracted so far, in text order.
     83     WordList words;
     84     words.push_back(house_number);
     85 
            // found_location_name: a location name was seen close enough to the
            //     house number.
            // continue_on_house_number: on failure, resume from the recorded
            //     house number candidate rather than after the current word.
            // consecutive_house_numbers: every word examined so far has parsed as
            //     a house number.
            // next_house_number_word: index in |words| of the house number
            //     candidate to resume from, or 0 when none has been recorded.
            // num_lines: number of lines spanned so far, starting at 1.
     86     bool found_location_name = false;
     87     bool continue_on_house_number = true;
     88     bool consecutive_house_numbers = true;
     89     size_t next_house_number_word = 0;
     90     size_t num_lines = 1;
     91 
     92     // Don't include the house number in the word count.
     93     size_t next_word = 1;
     94     for (; next_word <= kMaxAddressWords + 1; ++next_word) {
     95 
     96       // Extract a new word from the tokenizer.
              // The word may already be in |words| when an earlier state or zip
              // lookahead pulled it in; in that case nothing is extracted here.
     97       if (next_word == words.size()) {
     98         do {
                  // No more tokens: the whole search ends without a match.
     99           if (!tokenizer.GetNext())
    100             return false;
    101 
    102           // Check the number of address lines.
    103           if (tokenizer.token_is_delim() && newline_delimiters.find(
    104               *tokenizer.token_begin()) != base::string16::npos) {
    105             ++num_lines;
    106           }
    107         } while (tokenizer.token_is_delim());
    108 
                // Too many lines for one address. The pending word is not pushed,
                // so |next_word| equals words.size() after this break.
    109         if (num_lines > kMaxAddressLines)
    110           break;
    111 
    112         words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
    113       }
    114 
    115       // Check the word length. If too long, don't try to continue from
    116       // the next house number as no address can hold this word.
    117       const Word& current_word = words[next_word];
    118       DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
    119       size_t current_word_length = std::distance(
    120           current_word.begin, current_word.end);
    121       if (current_word_length > kMaxAddressNameWordLength) {
    122         continue_on_house_number = false;
    123         break;
    124       }
    125 
    126       // Check if the new word is a valid house number.
    127       if (house_number_parser.Parse(current_word.begin, current_word.end,
    128           NULL)) {
    129         // Only while every word so far has been a house number:
    130         if (consecutive_house_numbers) {
    131           // Check if there is a new line between consecutive house numbers.
    132           // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
                  // Abandon the current candidate and restart from this word.
    133           if (num_lines > 1) {
    134             next_house_number_word = next_word;
    135             break;
    136           }
    137         }
    138 
    139         // Keep the next candidate to resume parsing from in case of failure.
                // Only the first candidate is recorded, and it is then skipped by
                // the location name and state checks below. Any later numeric
                // word falls through and is treated like an ordinary word.
    140         if (next_house_number_word == 0) {
    141           next_house_number_word = next_word;
    142           continue;
    143         }
    144       } else {
    145         consecutive_house_numbers = false;
    146       }
    147 
    148       // Look for location names in the words after the house number.
    149       // A range limitation is introduced to avoid matching
    150       // anything that starts with a number before a legitimate address.
    151       if (next_word <= kMaxLocationNameDistance &&
    152           IsValidLocationName(current_word)) {
    153         found_location_name = true;
    154         continue;
    155       }
    156 
    157       // Don't count the house number.
              // A state is only looked for past the first kMinAddressWords words.
    158       if (next_word > kMinAddressWords) {
    159         // Looking for the state is likely to add new words to the list while
    160         // checking for multi-word state names.
    161         size_t state_first_word = next_word;
    162         size_t state_last_word, state_index;
    163         if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
    164                                     &tokenizer, &state_index)) {
    165 
    166           // A location name should have been found at this point.
                  // If not, abandon the current house number candidate.
    167           if (!found_location_name)
    168             break;
    169 
    170           // Explicitly exclude "et al", as "al" is a valid state code.
    171           if (current_word_length == 2 && words.size() > 2) {
    172             const Word& previous_word = words[state_first_word - 1];
    173             if (previous_word.end - previous_word.begin == 2 &&
    174                 LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
    175                                      "et") &&
    176                 LowerCaseEqualsASCII(current_word.begin, current_word.end,
    177                                      "al"))
    178               break;
    179           }
    180 
    181           // Extract one more word from the tokenizer if not already available.
                  // Delimiter tokens are skipped without updating |num_lines|. If
                  // the text ends right after the state, the whole search ends
                  // without a match.
    182           size_t zip_word = state_last_word + 1;
    183           if (zip_word == words.size()) {
    184             do {
    185               if (!tokenizer.GetNext())
    186                 return false;
    187             } while (tokenizer.token_is_delim());
    188             words.push_back(Word(tokenizer.token_begin(),
    189                             tokenizer.token_end()));
    190           }
    191 
    192           // Check the parsing validity and state range of the zip code.
                  // The cursor is first moved to the last word of the state, so
                  // that after a failed zip check the loop increment resumes the
                  // scan at the word right after the state (the rejected zip).
    193           next_word = state_last_word;
    194           if (!IsZipValid(words[zip_word], state_index))
    195             continue;
    196 
                  // Match: report the span from the house number through the zip.
    197           *start_pos = words[0].begin - begin;
    198           *end_pos = words[zip_word].end - begin;
    199           return true;
    200         }
    201       }
    202     }
    203 
    204     // Avoid skipping too many words because of a non-address number
    205     // at the beginning of the contents to parse.
            // Resume either at the recorded house number candidate, or right
            // after the last word that was examined.
    206     if (continue_on_house_number && next_house_number_word > 0) {
    207       it = words[next_house_number_word].begin;
    208     } else {
    209       DCHECK(!words.empty());
              // |next_word| can equal words.size() here (the loop ran to
              // completion, or broke on the line limit before pushing a word),
              // so clamp it to the last extracted word.
              // NOTE(review): std::min and std::distance are used in this
              // function, but <algorithm> and <iterator> are not among the
              // includes visible in this file - presumably pulled in
              // transitively; confirm.
    210       next_word = std::min(next_word, words.size() - 1);
    211       it = words[next_word].end;
    212     }
    213   }
    214 
    215   return false;
    216 }
    217 
    218 }  // namespace address_parser
    219 
    220 }  // namespace content
    221