Home | History | Annotate | Download | only in search_engines
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/search_engines/template_url_parser.h"
      6 
      7 #include <algorithm>
      8 #include <map>
      9 #include <vector>
     10 
     11 #include "base/logging.h"
     12 #include "base/memory/scoped_ptr.h"
     13 #include "base/strings/string_number_conversions.h"
     14 #include "base/strings/string_util.h"
     15 #include "base/strings/utf_string_conversions.h"
     16 #include "components/search_engines/template_url.h"
     17 #include "libxml/parser.h"
     18 #include "libxml/xmlwriter.h"
     19 #include "ui/gfx/favicon_size.h"
     20 #include "url/gurl.h"
     21 #include "url/url_constants.h"
     22 
     23 namespace {
     24 
     25 // NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
     26 // to that of char, the following names are all in terms of char. This avoids
     27 // having to convert to wide, then do comparisons.
     28 
     29 // Defines for element names of the OSD document:
     30 const char kURLElement[] = "Url";
     31 const char kParamElement[] = "Param";
     32 const char kShortNameElement[] = "ShortName";
     33 const char kImageElement[] = "Image";
     34 const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
     35 const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
     36 const char kInputEncodingElement[] = "InputEncoding";
     37 const char kAliasElement[] = "Alias";
     38 
     39 // Various XML attributes used.
     40 const char kURLTypeAttribute[] = "type";
     41 const char kURLTemplateAttribute[] = "template";
     42 const char kImageTypeAttribute[] = "type";
     43 const char kImageWidthAttribute[] = "width";
     44 const char kImageHeightAttribute[] = "height";
     45 const char kParamNameAttribute[] = "name";
     46 const char kParamValueAttribute[] = "value";
     47 const char kParamMethodAttribute[] = "method";
     48 
     49 // Mime type for search results.
     50 const char kHTMLType[] = "text/html";
     51 
     52 // Mime type for as you type suggestions.
     53 const char kSuggestionType[] = "application/x-suggestions+json";
     54 
     55 std::string XMLCharToString(const xmlChar* value) {
     56   return std::string(reinterpret_cast<const char*>(value));
     57 }
     58 
     59 // Returns true if input_encoding contains a valid input encoding string. This
     60 // doesn't verify that we have a valid encoding for the string, just that the
     61 // string contains characters that constitute a valid input encoding.
     62 bool IsValidEncodingString(const std::string& input_encoding) {
     63   if (input_encoding.empty())
     64     return false;
     65 
     66   if (!IsAsciiAlpha(input_encoding[0]))
     67     return false;
     68 
     69   for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
     70     char c = input_encoding[i];
     71     if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
     72         c != '-') {
     73       return false;
     74     }
     75   }
     76   return true;
     77 }
     78 
     79 void AppendParamToQuery(const std::string& key,
     80                         const std::string& value,
     81                         std::string* query) {
     82   if (!query->empty())
     83     query->append("&");
     84   if (!key.empty()) {
     85     query->append(key);
     86     query->append("=");
     87   }
     88   query->append(value);
     89 }
     90 
     91 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
     92 bool IsHTTPRef(const std::string& url) {
     93   if (url.empty())
     94     return true;
     95   GURL gurl(url);
     96   return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) ||
     97                              gurl.SchemeIs(url::kHttpsScheme));
     98 }
     99 
    100 }  // namespace
    101 
    102 
    103 // TemplateURLParsingContext --------------------------------------------------
    104 
    105 // To minimize memory overhead while parsing, a SAX style parser is used.
    106 // TemplateURLParsingContext is used to maintain the state we're in the document
    107 // while parsing.
    108 class TemplateURLParsingContext {
    109  public:
    110   // Enum of the known element types.
    111   enum ElementType {
    112     UNKNOWN,
    113     OPEN_SEARCH_DESCRIPTION,
    114     URL,
    115     PARAM,
    116     SHORT_NAME,
    117     IMAGE,
    118     INPUT_ENCODING,
    119     ALIAS,
    120   };
    121 
    122   enum Method {
    123     GET,
    124     POST
    125   };
    126 
    127   // Key/value of a Param node.
    128   typedef std::pair<std::string, std::string> Param;
    129 
    130   explicit TemplateURLParsingContext(
    131       TemplateURLParser::ParameterFilter* parameter_filter);
    132 
    133   static void StartElementImpl(void* ctx,
    134                                const xmlChar* name,
    135                                const xmlChar** atts);
    136   static void EndElementImpl(void* ctx, const xmlChar* name);
    137   static void CharactersImpl(void* ctx, const xmlChar* ch, int len);
    138 
    139   // Returns a heap-allocated TemplateURL representing the result of parsing.
    140   // This will be NULL if parsing failed or if the results were invalid for some
    141   // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
    142   // a resulting TemplateURLRef was invalid, etc.).
    143   TemplateURL* GetTemplateURL(const SearchTermsData& search_terms_data,
    144                               bool show_in_default_list);
    145 
    146  private:
    147   // Key is UTF8 encoded.
    148   typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;
    149 
    150   static void InitMapping();
    151 
    152   void ParseURL(const xmlChar** atts);
    153   void ParseImage(const xmlChar** atts);
    154   void ParseParam(const xmlChar** atts);
    155   void ProcessURLParams();
    156 
    157   // Returns the current ElementType.
    158   ElementType GetKnownType();
    159 
    160   static ElementNameToElementTypeMap* kElementNameToElementTypeMap;
    161 
    162   // Data that gets updated as we parse, and is converted to a TemplateURL by
    163   // GetTemplateURL().
    164   TemplateURLData data_;
    165 
    166   std::vector<ElementType> elements_;
    167   bool image_is_valid_for_favicon_;
    168 
    169   // Character content for the current element.
    170   base::string16 string_;
    171 
    172   TemplateURLParser::ParameterFilter* parameter_filter_;
    173 
    174   // The list of parameters parsed in the Param nodes of a Url node.
    175   std::vector<Param> extra_params_;
    176 
    177   // The HTTP methods used.
    178   Method method_;
    179   Method suggestion_method_;
    180 
    181   // If true, we are currently parsing a suggest URL, otherwise it is an HTML
    182   // search.  Note that we don't need a stack as URL nodes cannot be nested.
    183   bool is_suggest_url_;
    184 
    185   // If true, the user has set a keyword and we should use it. Otherwise,
    186   // we generate a keyword based on the URL.
    187   bool has_custom_keyword_;
    188 
    189   // Whether we should derive the image from the URL (when images are data
    190   // URLs).
    191   bool derive_image_from_url_;
    192 
    193   DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
    194 };
    195 
    196 // static
    197 TemplateURLParsingContext::ElementNameToElementTypeMap*
    198     TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
    199 
    200 TemplateURLParsingContext::TemplateURLParsingContext(
    201     TemplateURLParser::ParameterFilter* parameter_filter)
    202     : image_is_valid_for_favicon_(false),
    203       parameter_filter_(parameter_filter),
    204       method_(GET),
    205       suggestion_method_(GET),
    206       is_suggest_url_(false),
    207       has_custom_keyword_(false),
    208       derive_image_from_url_(false) {
    209   if (kElementNameToElementTypeMap == NULL)
    210     InitMapping();
    211 }
    212 
    213 // static
    214 void TemplateURLParsingContext::StartElementImpl(void* ctx,
    215                                                  const xmlChar* name,
    216                                                  const xmlChar** atts) {
    217   // Remove the namespace from |name|, ex: os:Url -> Url.
    218   std::string node_name(XMLCharToString(name));
    219   size_t index = node_name.find_first_of(":");
    220   if (index != std::string::npos)
    221     node_name.erase(0, index + 1);
    222 
    223   TemplateURLParsingContext* context =
    224       reinterpret_cast<TemplateURLParsingContext*>(ctx);
    225   context->elements_.push_back(
    226     context->kElementNameToElementTypeMap->count(node_name) ?
    227         (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
    228   switch (context->GetKnownType()) {
    229     case TemplateURLParsingContext::URL:
    230       context->extra_params_.clear();
    231       context->ParseURL(atts);
    232       break;
    233     case TemplateURLParsingContext::IMAGE:
    234       context->ParseImage(atts);
    235       break;
    236     case TemplateURLParsingContext::PARAM:
    237       context->ParseParam(atts);
    238       break;
    239     default:
    240       break;
    241   }
    242   context->string_.clear();
    243 }
    244 
    245 // static
    246 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
    247   TemplateURLParsingContext* context =
    248       reinterpret_cast<TemplateURLParsingContext*>(ctx);
    249   switch (context->GetKnownType()) {
    250     case TemplateURLParsingContext::URL:
    251       context->ProcessURLParams();
    252       break;
    253     case TemplateURLParsingContext::SHORT_NAME:
    254       context->data_.short_name = context->string_;
    255       break;
    256     case TemplateURLParsingContext::IMAGE: {
    257       GURL image_url(base::UTF16ToUTF8(context->string_));
    258       if (image_url.SchemeIs(url::kDataScheme)) {
    259         // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
    260         // decode the data URL in the renderer. For now, we'll just point to the
    261         // favicon from the URL.
    262         context->derive_image_from_url_ = true;
    263       } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
    264                  (image_url.SchemeIs(url::kHttpScheme) ||
    265                   image_url.SchemeIs(url::kHttpsScheme))) {
    266         context->data_.favicon_url = image_url;
    267       }
    268       context->image_is_valid_for_favicon_ = false;
    269       break;
    270     }
    271     case TemplateURLParsingContext::INPUT_ENCODING: {
    272       std::string input_encoding = base::UTF16ToASCII(context->string_);
    273       if (IsValidEncodingString(input_encoding))
    274         context->data_.input_encodings.push_back(input_encoding);
    275       break;
    276     }
    277     case TemplateURLParsingContext::ALIAS: {
    278       context->data_.SetKeyword(context->string_);
    279       context->has_custom_keyword_ = true;
    280       break;
    281     }
    282     default:
    283       break;
    284   }
    285   context->string_.clear();
    286   context->elements_.pop_back();
    287 }
    288 
    289 // static
    290 void TemplateURLParsingContext::CharactersImpl(void* ctx,
    291                                                const xmlChar* ch,
    292                                                int len) {
    293   reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
    294       base::UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len));
    295 }
    296 
    297 TemplateURL* TemplateURLParsingContext::GetTemplateURL(
    298     const SearchTermsData& search_terms_data,
    299     bool show_in_default_list) {
    300   // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
    301   if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() ||
    302       !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url))
    303     return NULL;
    304   if (suggestion_method_ == TemplateURLParsingContext::POST)
    305     data_.suggestions_url.clear();
    306 
    307   // If the image was a data URL, use the favicon from the search URL instead.
    308   // (see the TODO in EndElementImpl()).
    309   GURL search_url(data_.url());
    310   if (derive_image_from_url_ && data_.favicon_url.is_empty())
    311     data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
    312 
    313   // Generate a keyword for this search engine if a custom one was not present
    314   // in the imported data.
    315   if (!has_custom_keyword_)
    316     data_.SetKeyword(TemplateURL::GenerateKeyword(search_url));
    317 
    318   data_.show_in_default_list = show_in_default_list;
    319 
    320   // Bail if the search URL is empty or if either TemplateURLRef is invalid.
    321   scoped_ptr<TemplateURL> template_url(new TemplateURL(data_));
    322   if (template_url->url().empty() ||
    323       !template_url->url_ref().IsValid(search_terms_data) ||
    324       (!template_url->suggestions_url().empty() &&
    325        !template_url->suggestions_url_ref().IsValid(search_terms_data))) {
    326     return NULL;
    327   }
    328 
    329   return template_url.release();
    330 }
    331 
    332 // static
    333 void TemplateURLParsingContext::InitMapping() {
    334   kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
    335   (*kElementNameToElementTypeMap)[kURLElement] = URL;
    336   (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
    337   (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
    338   (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
    339   (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
    340       OPEN_SEARCH_DESCRIPTION;
    341   (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
    342       OPEN_SEARCH_DESCRIPTION;
    343   (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING;
    344   (*kElementNameToElementTypeMap)[kAliasElement] = ALIAS;
    345 }
    346 
    347 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
    348   if (!atts)
    349     return;
    350 
    351   std::string template_url;
    352   bool is_post = false;
    353   bool is_html_url = false;
    354   bool is_suggest_url = false;
    355   for (; *atts; atts += 2) {
    356     std::string name(XMLCharToString(*atts));
    357     const xmlChar* value = atts[1];
    358     if (name == kURLTypeAttribute) {
    359       std::string type = XMLCharToString(value);
    360       is_html_url = (type == kHTMLType);
    361       is_suggest_url = (type == kSuggestionType);
    362     } else if (name == kURLTemplateAttribute) {
    363       template_url = XMLCharToString(value);
    364     } else if (name == kParamMethodAttribute) {
    365       is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
    366     }
    367   }
    368 
    369   if (is_html_url && !template_url.empty()) {
    370     data_.SetURL(template_url);
    371     is_suggest_url_ = false;
    372     if (is_post)
    373       method_ = POST;
    374   } else if (is_suggest_url) {
    375     data_.suggestions_url = template_url;
    376     is_suggest_url_ = true;
    377     if (is_post)
    378       suggestion_method_ = POST;
    379   }
    380 }
    381 
    382 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
    383   if (!atts)
    384     return;
    385 
    386   int width = 0;
    387   int height = 0;
    388   std::string type;
    389   for (; *atts; atts += 2) {
    390     std::string name(XMLCharToString(*atts));
    391     const xmlChar* value = atts[1];
    392     if (name == kImageTypeAttribute) {
    393       type = XMLCharToString(value);
    394     } else if (name == kImageWidthAttribute) {
    395       base::StringToInt(XMLCharToString(value), &width);
    396     } else if (name == kImageHeightAttribute) {
    397       base::StringToInt(XMLCharToString(value), &height);
    398     }
    399   }
    400 
    401   image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
    402       (height == gfx::kFaviconSize) &&
    403       ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
    404 }
    405 
    406 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
    407   if (!atts)
    408     return;
    409 
    410   std::string key, value;
    411   for (; *atts; atts += 2) {
    412     std::string name(XMLCharToString(*atts));
    413     const xmlChar* val = atts[1];
    414     if (name == kParamNameAttribute) {
    415       key = XMLCharToString(val);
    416     } else if (name == kParamValueAttribute) {
    417       value = XMLCharToString(val);
    418     }
    419   }
    420 
    421   if (!key.empty() &&
    422       (!parameter_filter_ || parameter_filter_->KeepParameter(key, value)))
    423     extra_params_.push_back(Param(key, value));
    424 }
    425 
    426 void TemplateURLParsingContext::ProcessURLParams() {
    427   if (!parameter_filter_ && extra_params_.empty())
    428     return;
    429 
    430   GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
    431   if (url.is_empty())
    432     return;
    433 
    434   // If there is a parameter filter, parse the existing URL and remove any
    435   // unwanted parameter.
    436   std::string new_query;
    437   bool modified = false;
    438   if (parameter_filter_) {
    439     url::Component query = url.parsed_for_possibly_invalid_spec().query;
    440     url::Component key, value;
    441     const char* url_spec = url.spec().c_str();
    442     while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
    443       std::string key_str(url_spec, key.begin, key.len);
    444       std::string value_str(url_spec, value.begin, value.len);
    445       if (parameter_filter_->KeepParameter(key_str, value_str)) {
    446         AppendParamToQuery(key_str, value_str, &new_query);
    447       } else {
    448         modified = true;
    449       }
    450     }
    451   }
    452   if (!modified)
    453     new_query = url.query();
    454 
    455   // Add the extra parameters if any.
    456   if (!extra_params_.empty()) {
    457     modified = true;
    458     for (std::vector<Param>::const_iterator iter(extra_params_.begin());
    459          iter != extra_params_.end(); ++iter)
    460       AppendParamToQuery(iter->first, iter->second, &new_query);
    461   }
    462 
    463   if (modified) {
    464     GURL::Replacements repl;
    465     repl.SetQueryStr(new_query);
    466     url = url.ReplaceComponents(repl);
    467     if (is_suggest_url_)
    468       data_.suggestions_url = url.spec();
    469     else if (url.is_valid())
    470       data_.SetURL(url.spec());
    471   }
    472 }
    473 
    474 TemplateURLParsingContext::ElementType
    475     TemplateURLParsingContext::GetKnownType() {
    476   if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
    477     return elements_[1];
    478   // We only expect PARAM nodes under the URL node.
    479   return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
    480       elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
    481 }
    482 
    483 
    484 // TemplateURLParser ----------------------------------------------------------
    485 
    486 // static
    487 TemplateURL* TemplateURLParser::Parse(
    488     const SearchTermsData& search_terms_data,
    489     bool show_in_default_list,
    490     const char* data,
    491     size_t length,
    492     TemplateURLParser::ParameterFilter* param_filter) {
    493   // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
    494   // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
    495   // If this becomes problematic we'll need to provide our own entity
    496   // type for &amp;, or strip out &#38; by hand after parsing.
    497   int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
    498   TemplateURLParsingContext context(param_filter);
    499   xmlSAXHandler sax_handler;
    500   memset(&sax_handler, 0, sizeof(sax_handler));
    501   sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
    502   sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
    503   sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
    504   int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
    505                                     static_cast<int>(length));
    506   xmlSubstituteEntitiesDefault(last_sub_entities_value);
    507 
    508   return error ?
    509       NULL : context.GetTemplateURL(search_terms_data, show_in_default_list);
    510 }
    511