1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "content/common/android/address_parser.h" 6 7 #include "base/logging.h" 8 #include "base/strings/string_util.h" 9 #include "content/common/android/address_parser_internal.h" 10 11 namespace { 12 13 // Minimum number of words in an address after the house number 14 // before a state is expected to be found. 15 // A value too high can miss short addresses. 16 const size_t kMinAddressWords = 3; 17 18 // Maximum number of words allowed in an address between the house number 19 // and the state, both not included. 20 const size_t kMaxAddressWords = 12; 21 22 // Maximum number of lines allowed in an address between the house number 23 // and the state, both not included. 24 const size_t kMaxAddressLines = 5; 25 26 // Maximum length allowed for any address word between the house number 27 // and the state, both not included. 28 const size_t kMaxAddressNameWordLength = 25; 29 30 // Maximum number of words after the house number in which the location name 31 // should be found. 32 const size_t kMaxLocationNameDistance = 4; 33 34 // Additional characters used as new line delimiters. 35 const char16 kNewlineDelimiters[] = { 36 '\n', 37 ',', 38 '*', 39 0x2022, // Unicode bullet 40 0, 41 }; 42 43 } // anonymous namespace 44 45 namespace content { 46 47 namespace address_parser { 48 49 using namespace internal; 50 51 bool FindAddress(const base::string16& text, base::string16* address) { 52 size_t start, end; 53 if (FindAddress(text.begin(), text.end(), &start, &end)) { 54 size_t len = end >= start ? end - start : 0; 55 address->assign(text.substr(start, len)); 56 return true; 57 } 58 return false; 59 } 60 61 bool FindAddress(const base::string16::const_iterator& begin, 62 const base::string16::const_iterator& end, 63 size_t* start_pos, 64 size_t* end_pos) { 65 HouseNumberParser house_number_parser; 66 67 // Keep going through the input string until a potential house number is 68 // detected. Start tokenizing the following words to find a valid 69 // street name within a word range. Then, find a state name followed 70 // by a valid zip code for that state. Also keep a look for any other 71 // possible house numbers to continue from in case of no match and for 72 // state names not followed by a zip code (e.g. New York, NY 10000). 73 const base::string16 newline_delimiters = kNewlineDelimiters; 74 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters; 75 for (base::string16::const_iterator it = begin; it != end; ) { 76 Word house_number; 77 if (!house_number_parser.Parse(it, end, &house_number)) 78 return false; 79 80 String16Tokenizer tokenizer(house_number.end, end, delimiters); 81 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); 82 83 WordList words; 84 words.push_back(house_number); 85 86 bool found_location_name = false; 87 bool continue_on_house_number = true; 88 bool consecutive_house_numbers = true; 89 size_t next_house_number_word = 0; 90 size_t num_lines = 1; 91 92 // Don't include the house number in the word count. 93 size_t next_word = 1; 94 for (; next_word <= kMaxAddressWords + 1; ++next_word) { 95 96 // Extract a new word from the tokenizer. 97 if (next_word == words.size()) { 98 do { 99 if (!tokenizer.GetNext()) 100 return false; 101 102 // Check the number of address lines. 103 if (tokenizer.token_is_delim() && newline_delimiters.find( 104 *tokenizer.token_begin()) != base::string16::npos) { 105 ++num_lines; 106 } 107 } while (tokenizer.token_is_delim()); 108 109 if (num_lines > kMaxAddressLines) 110 break; 111 112 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); 113 } 114 115 // Check the word length. If too long, don't try to continue from 116 // the next house number as no address can hold this word. 117 const Word& current_word = words[next_word]; 118 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); 119 size_t current_word_length = std::distance( 120 current_word.begin, current_word.end); 121 if (current_word_length > kMaxAddressNameWordLength) { 122 continue_on_house_number = false; 123 break; 124 } 125 126 // Check if the new word is a valid house number. 127 if (house_number_parser.Parse(current_word.begin, current_word.end, 128 NULL)) { 129 // Increase the number of consecutive house numbers since the beginning. 130 if (consecutive_house_numbers) { 131 // Check if there is a new line between consecutive house numbers. 132 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.." 133 if (num_lines > 1) { 134 next_house_number_word = next_word; 135 break; 136 } 137 } 138 139 // Keep the next candidate to resume parsing from in case of failure. 140 if (next_house_number_word == 0) { 141 next_house_number_word = next_word; 142 continue; 143 } 144 } else { 145 consecutive_house_numbers = false; 146 } 147 148 // Look for location names in the words after the house number. 149 // A range limitation is introduced to avoid matching 150 // anything that starts with a number before a legitimate address. 151 if (next_word <= kMaxLocationNameDistance && 152 IsValidLocationName(current_word)) { 153 found_location_name = true; 154 continue; 155 } 156 157 // Don't count the house number. 158 if (next_word > kMinAddressWords) { 159 // Looking for the state is likely to add new words to the list while 160 // checking for multi-word state names. 161 size_t state_first_word = next_word; 162 size_t state_last_word, state_index; 163 if (FindStateStartingInWord(&words, state_first_word, &state_last_word, 164 &tokenizer, &state_index)) { 165 166 // A location name should have been found at this point. 167 if (!found_location_name) 168 break; 169 170 // Explicitly exclude "et al", as "al" is a valid state code. 171 if (current_word_length == 2 && words.size() > 2) { 172 const Word& previous_word = words[state_first_word - 1]; 173 if (previous_word.end - previous_word.begin == 2 && 174 LowerCaseEqualsASCII(previous_word.begin, previous_word.end, 175 "et") && 176 LowerCaseEqualsASCII(current_word.begin, current_word.end, 177 "al")) 178 break; 179 } 180 181 // Extract one more word from the tokenizer if not already available. 182 size_t zip_word = state_last_word + 1; 183 if (zip_word == words.size()) { 184 do { 185 if (!tokenizer.GetNext()) 186 return false; 187 } while (tokenizer.token_is_delim()); 188 words.push_back(Word(tokenizer.token_begin(), 189 tokenizer.token_end())); 190 } 191 192 // Check the parsing validity and state range of the zip code. 193 next_word = state_last_word; 194 if (!IsZipValid(words[zip_word], state_index)) 195 continue; 196 197 *start_pos = words[0].begin - begin; 198 *end_pos = words[zip_word].end - begin; 199 return true; 200 } 201 } 202 } 203 204 // Avoid skipping too many words because of a non-address number 205 // at the beginning of the contents to parse. 206 if (continue_on_house_number && next_house_number_word > 0) { 207 it = words[next_house_number_word].begin; 208 } else { 209 DCHECK(!words.empty()); 210 next_word = std::min(next_word, words.size() - 1); 211 it = words[next_word].end; 212 } 213 } 214 215 return false; 216 } 217 218 } // namespace address_parser 219 220 } // namespace content 221