Home | History | Annotate | Download | only in browser
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/autofill/core/browser/address_field.h"
      6 
      7 #include <stddef.h>
      8 
      9 #include "base/logging.h"
     10 #include "base/memory/scoped_ptr.h"
     11 #include "base/strings/string16.h"
     12 #include "base/strings/string_util.h"
     13 #include "base/strings/utf_string_conversions.h"
     14 #include "components/autofill/core/browser/autofill_field.h"
     15 #include "components/autofill/core/browser/autofill_regex_constants.h"
     16 #include "components/autofill/core/browser/autofill_scanner.h"
     17 #include "components/autofill/core/browser/field_types.h"
     18 #include "ui/base/l10n/l10n_util.h"
     19 
     20 using base::UTF8ToUTF16;
     21 
     22 namespace autofill {
     23 
     24 FormField* AddressField::Parse(AutofillScanner* scanner) {
     25   if (scanner->IsEnd())
     26     return NULL;
     27 
     28   scoped_ptr<AddressField> address_field(new AddressField);
     29   const AutofillField* const initial_field = scanner->Cursor();
     30   size_t saved_cursor = scanner->SaveCursor();
     31 
     32   base::string16 attention_ignored = UTF8ToUTF16(autofill::kAttentionIgnoredRe);
     33   base::string16 region_ignored = UTF8ToUTF16(autofill::kRegionIgnoredRe);
     34 
     35   // Allow address fields to appear in any order.
     36   size_t begin_trailing_non_labeled_fields = 0;
     37   bool has_trailing_non_labeled_fields = false;
     38   while (!scanner->IsEnd()) {
     39     const size_t cursor = scanner->SaveCursor();
     40     if (address_field->ParseAddressLines(scanner) ||
     41         address_field->ParseCity(scanner) ||
     42         address_field->ParseState(scanner) ||
     43         address_field->ParseZipCode(scanner) ||
     44         address_field->ParseCountry(scanner) ||
     45         address_field->ParseCompany(scanner)) {
     46       has_trailing_non_labeled_fields = false;
     47       continue;
     48     } else if (ParseField(scanner, attention_ignored, NULL) ||
     49                ParseField(scanner, region_ignored, NULL)) {
     50       // We ignore the following:
     51       // * Attention.
     52       // * Province/Region/Other.
     53       continue;
     54     } else if (scanner->Cursor() != initial_field &&
     55                ParseEmptyLabel(scanner, NULL)) {
     56       // Ignore non-labeled fields within an address; the page
     57       // MapQuest Driving Directions North America.html contains such a field.
     58       // We only ignore such fields after we've parsed at least one other field;
     59       // otherwise we'd effectively parse address fields before other field
     60       // types after any non-labeled fields, and we want email address fields to
     61       // have precedence since some pages contain fields labeled
     62       // "Email address".
     63       if (!has_trailing_non_labeled_fields) {
     64         has_trailing_non_labeled_fields = true;
     65         begin_trailing_non_labeled_fields = cursor;
     66       }
     67 
     68       continue;
     69     } else {
     70       // No field found.
     71       break;
     72     }
     73   }
     74 
     75   // If we have identified any address fields in this field then it should be
     76   // added to the list of fields.
     77   if (address_field->company_ ||
     78       address_field->address1_ ||
     79       address_field->address2_ ||
     80       address_field->street_address_ ||
     81       address_field->city_ ||
     82       address_field->state_ ||
     83       address_field->zip_ ||
     84       address_field->zip4_ ||
     85       address_field->country_) {
     86     // Don't slurp non-labeled fields at the end into the address.
     87     if (has_trailing_non_labeled_fields)
     88       scanner->RewindTo(begin_trailing_non_labeled_fields);
     89 
     90     return address_field.release();
     91   }
     92 
     93   scanner->RewindTo(saved_cursor);
     94   return NULL;
     95 }
     96 
     97 AddressField::AddressField()
     98     : company_(NULL),
     99       address1_(NULL),
    100       address2_(NULL),
    101       street_address_(NULL),
    102       city_(NULL),
    103       state_(NULL),
    104       zip_(NULL),
    105       zip4_(NULL),
    106       country_(NULL) {
    107 }
    108 
    109 bool AddressField::ClassifyField(ServerFieldTypeMap* map) const {
    110   // The page can request the address lines as a single textarea input or as
    111   // multiple text fields (or not at all), but it shouldn't be possible to
    112   // request both.
    113   DCHECK(!(address1_ && street_address_));
    114   DCHECK(!(address2_ && street_address_));
    115 
    116   return AddClassification(company_, COMPANY_NAME, map) &&
    117          AddClassification(address1_, ADDRESS_HOME_LINE1, map) &&
    118          AddClassification(address2_, ADDRESS_HOME_LINE2, map) &&
    119          AddClassification(street_address_, ADDRESS_HOME_STREET_ADDRESS, map) &&
    120          AddClassification(city_, ADDRESS_HOME_CITY, map) &&
    121          AddClassification(state_, ADDRESS_HOME_STATE, map) &&
    122          AddClassification(zip_, ADDRESS_HOME_ZIP, map) &&
    123          AddClassification(country_, ADDRESS_HOME_COUNTRY, map);
    124 }
    125 
    126 bool AddressField::ParseCompany(AutofillScanner* scanner) {
    127   if (company_ && !company_->IsEmpty())
    128     return false;
    129 
    130   return ParseField(scanner, UTF8ToUTF16(autofill::kCompanyRe), &company_);
    131 }
    132 
    133 bool AddressField::ParseAddressLines(AutofillScanner* scanner) {
    134   // We only match the string "address" in page text, not in element names,
    135   // because sometimes every element in a group of address fields will have
    136   // a name containing the string "address"; for example, on the page
    137   // Kohl's - Register Billing Address.html the text element labeled "city"
    138   // has the name "BILL_TO_ADDRESS<>city".  We do match address labels
    139   // such as "address1", which appear as element names on various pages (eg
    140   // AmericanGirl-Registration.html, BloomingdalesBilling.html,
    141   // EBay Registration Enter Information.html).
    142   if (address1_ || street_address_)
    143     return false;
    144 
    145   base::string16 pattern = UTF8ToUTF16(autofill::kAddressLine1Re);
    146   base::string16 label_pattern = UTF8ToUTF16(autofill::kAddressLine1LabelRe);
    147   if (!ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT, &address1_) &&
    148       !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
    149                            &address1_) &&
    150       !ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_TEXT_AREA,
    151                            &street_address_) &&
    152       !ParseFieldSpecifics(scanner, label_pattern,
    153                            MATCH_LABEL | MATCH_TEXT_AREA,
    154                            &street_address_)) {
    155     return false;
    156   }
    157 
    158   // Optionally parse more address lines, which may have empty labels.
    159   pattern = UTF8ToUTF16(autofill::kAddressLine2Re);
    160   label_pattern = UTF8ToUTF16(autofill::kAddressLine2LabelRe);
    161   if (!street_address_ &&
    162       !ParseEmptyLabel(scanner, &address2_) &&
    163       !ParseField(scanner, pattern, &address2_)) {
    164     ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
    165                         &address2_);
    166   }
    167 
    168   // Try for surplus lines, which we will promptly discard.
    169   // Some pages have 3 address lines (eg SharperImageModifyAccount.html)
    170   // Some pages even have 4 address lines (e.g. uk/ShoesDirect2.html)!
    171   if (address2_) {
    172     pattern = UTF8ToUTF16(autofill::kAddressLinesExtraRe);
    173     while (ParseField(scanner, pattern, NULL)) {
    174       // Consumed a surplus line, try for another.
    175     }
    176   }
    177 
    178   return true;
    179 }
    180 
    181 bool AddressField::ParseCountry(AutofillScanner* scanner) {
    182   // Parse a country.  The occasional page (e.g.
    183   // Travelocity_New Member Information1.html) calls this a "location".
    184   if (country_ && !country_->IsEmpty())
    185     return false;
    186 
    187   return ParseFieldSpecifics(scanner,
    188                              UTF8ToUTF16(autofill::kCountryRe),
    189                              MATCH_DEFAULT | MATCH_SELECT,
    190                              &country_);
    191 }
    192 
    193 bool AddressField::ParseZipCode(AutofillScanner* scanner) {
    194   // Parse a zip code.  On some UK pages (e.g. The China Shop2.html) this
    195   // is called a "post code".
    196   if (zip_)
    197     return false;
    198 
    199   base::string16 pattern = UTF8ToUTF16(autofill::kZipCodeRe);
    200   if (!ParseField(scanner, pattern, &zip_))
    201     return false;
    202 
    203   // Look for a zip+4, whose field name will also often contain
    204   // the substring "zip".
    205   ParseField(scanner, UTF8ToUTF16(autofill::kZip4Re), &zip4_);
    206   return true;
    207 }
    208 
    209 bool AddressField::ParseCity(AutofillScanner* scanner) {
    210   // Parse a city name.  Some UK pages (e.g. The China Shop2.html) use
    211   // the term "town".
    212   if (city_)
    213     return false;
    214 
    215   // Select fields are allowed here.  This occurs on top-100 site rediff.com.
    216   return ParseFieldSpecifics(scanner,
    217                              UTF8ToUTF16(autofill::kCityRe),
    218                              MATCH_DEFAULT | MATCH_SELECT,
    219                              &city_);
    220 }
    221 
    222 bool AddressField::ParseState(AutofillScanner* scanner) {
    223   if (state_)
    224     return false;
    225 
    226   return ParseFieldSpecifics(scanner,
    227                              UTF8ToUTF16(autofill::kStateRe),
    228                              MATCH_DEFAULT | MATCH_SELECT,
    229                              &state_);
    230 }
    231 
    232 }  // namespace autofill
    233