Home | History | Annotate | Download | only in search_engines
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/search_engines/template_url_parser.h"
      6 
      7 #include <algorithm>
      8 #include <map>
      9 #include <vector>
     10 
     11 #include "base/logging.h"
     12 #include "base/memory/scoped_ptr.h"
     13 #include "base/strings/string_number_conversions.h"
     14 #include "base/strings/string_util.h"
     15 #include "base/strings/utf_string_conversions.h"
     16 #include "chrome/browser/search_engines/search_terms_data.h"
     17 #include "chrome/browser/search_engines/template_url.h"
     18 #include "chrome/browser/search_engines/template_url_service.h"
     19 #include "chrome/common/url_constants.h"
     20 #include "libxml/parser.h"
     21 #include "libxml/xmlwriter.h"
     22 #include "ui/gfx/favicon_size.h"
     23 #include "url/gurl.h"
     24 
     25 namespace {
     26 
// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
// to that of char, the following names are all in terms of char. This avoids
// having to convert to wide, then do comparisons.

// Defines for element names of the OSD document. These are compared against
// element names after any namespace prefix has been stripped.
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kInputEncodingElement[] = "InputEncoding";

// Various XML attributes used. The kURL*/kParamMethod names are read from Url
// elements, the kImage* names from Image elements and kParamName/kParamValue
// from Param elements.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
const char kParamMethodAttribute[] = "method";

// Mime type for search results.
const char kHTMLType[] = "text/html";

// Mime type for as you type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";

// Namespace identifier.
// NOTE(review): not referenced by any code visible in this file.
const char kOSDNS[] = "xmlns";

// The namespace for documents we understand.
// NOTE(review): not referenced by any code visible in this file.
const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/";
     61 
     62 std::string XMLCharToString(const xmlChar* value) {
     63   return std::string(reinterpret_cast<const char*>(value));
     64 }
     65 
     66 // Returns true if input_encoding contains a valid input encoding string. This
     67 // doesn't verify that we have a valid encoding for the string, just that the
     68 // string contains characters that constitute a valid input encoding.
     69 bool IsValidEncodingString(const std::string& input_encoding) {
     70   if (input_encoding.empty())
     71     return false;
     72 
     73   if (!IsAsciiAlpha(input_encoding[0]))
     74     return false;
     75 
     76   for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
     77     char c = input_encoding[i];
     78     if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
     79         c != '-') {
     80       return false;
     81     }
     82   }
     83   return true;
     84 }
     85 
     86 void AppendParamToQuery(const std::string& key,
     87                         const std::string& value,
     88                         std::string* query) {
     89   if (!query->empty())
     90     query->append("&");
     91   if (!key.empty()) {
     92     query->append(key);
     93     query->append("=");
     94   }
     95   query->append(value);
     96 }
     97 
     98 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
     99 bool IsHTTPRef(const std::string& url) {
    100   if (url.empty())
    101     return true;
    102   GURL gurl(url);
    103   return gurl.is_valid() && (gurl.SchemeIs(chrome::kHttpScheme) ||
    104                              gurl.SchemeIs(chrome::kHttpsScheme));
    105 }
    106 
    107 }  // namespace
    108 
    109 
    110 // TemplateURLParsingContext --------------------------------------------------
    111 
// To minimize memory overhead while parsing, a SAX style parser is used.
// TemplateURLParsingContext is used to maintain the state we're in the document
// while parsing.
//
// The three static *Impl functions are the SAX callbacks; each receives the
// context as an opaque |ctx| pointer. Once parsing has finished,
// GetTemplateURL() turns the accumulated state into a TemplateURL.
class TemplateURLParsingContext {
 public:
  // Enum of the known element types.
  enum ElementType {
    // Any element whose name is not in the element-name table.
    UNKNOWN,
    // The document root; both "OpenSearchDescription" and "SearchPlugin" map
    // to this value.
    OPEN_SEARCH_DESCRIPTION,
    URL,
    PARAM,
    SHORT_NAME,
    IMAGE,
    INPUT_ENCODING,
  };

  // HTTP method declared on a Url element.
  enum Method {
    GET,
    POST
  };

  // Key/value of a Param node.
  typedef std::pair<std::string, std::string> Param;

  // |parameter_filter| may be NULL, in which case no parameter is filtered
  // out. The pointer is stored as-is; this class never deletes it.
  explicit TemplateURLParsingContext(
      TemplateURLParser::ParameterFilter* parameter_filter);

  // SAX callback for the start of an element. |ctx| is the
  // TemplateURLParsingContext, |name| the (possibly prefixed) element name and
  // |atts| a NULL-terminated array of alternating attribute names and values
  // (or NULL).
  static void StartElementImpl(void* ctx,
                               const xmlChar* name,
                               const xmlChar** atts);
  // SAX callback for the end of an element. Consumes the character content
  // gathered for the element that is being closed.
  static void EndElementImpl(void* ctx, const xmlChar* name);
  // SAX callback for character data. Appends |len| bytes starting at |ch| to
  // the content of the current element.
  static void CharactersImpl(void* ctx, const xmlChar* ch, int len);

  // Returns a heap-allocated TemplateURL representing the result of parsing.
  // This will be NULL if parsing failed or if the results were invalid for some
  // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
  // a resulting TemplateURLRef was invalid, etc.).
  TemplateURL* GetTemplateURL(Profile* profile, bool show_in_default_list);

 private:
  // Key is UTF8 encoded.
  typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;

  // Allocates and fills kElementNameToElementTypeMap.
  static void InitMapping();

  // Attribute handlers for the Url, Image and Param elements. |atts| has the
  // same layout as in StartElementImpl() and may be NULL.
  void ParseURL(const xmlChar** atts);
  void ParseImage(const xmlChar** atts);
  void ParseParam(const xmlChar** atts);
  // Called when a Url element ends: applies the parameter filter and the
  // collected Param entries to the query of the URL currently being parsed.
  void ProcessURLParams();

  // Returns the current ElementType.
  ElementType GetKnownType();

  // Maps element names (without namespace prefix) to ElementType. Created on
  // first construction of a context and shared by all instances.
  static ElementNameToElementTypeMap* kElementNameToElementTypeMap;

  // Data that gets updated as we parse, and is converted to a TemplateURL by
  // GetTemplateURL().
  TemplateURLData data_;

  // Stack of the elements that are currently open, outermost first. Pushed in
  // StartElementImpl() and popped in EndElementImpl().
  std::vector<ElementType> elements_;
  // Set by ParseImage() when the Image element's attributes describe a
  // favicon-sized icon; reset when the Image element ends.
  bool image_is_valid_for_favicon_;

  // Character content for the current element.
  string16 string_;

  // Optional filter deciding which URL parameters are kept; may be NULL.
  TemplateURLParser::ParameterFilter* parameter_filter_;

  // The list of parameters parsed in the Param nodes of a Url node.
  std::vector<Param> extra_params_;

  // The HTTP methods used.
  Method method_;
  Method suggestion_method_;

  // If true, we are currently parsing a suggest URL, otherwise it is an HTML
  // search.  Note that we don't need a stack as URL nodes cannot be nested.
  bool is_suggest_url_;

  // Whether we should derive the image from the URL (when images are data
  // URLs).
  bool derive_image_from_url_;

  DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
};
    196 
// static
// Shared element-name lookup table. It starts out NULL and is allocated by
// InitMapping() the first time a TemplateURLParsingContext is constructed.
// Nothing in this file deletes it.
TemplateURLParsingContext::ElementNameToElementTypeMap*
    TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
    200 
    201 TemplateURLParsingContext::TemplateURLParsingContext(
    202     TemplateURLParser::ParameterFilter* parameter_filter)
    203     : image_is_valid_for_favicon_(false),
    204       parameter_filter_(parameter_filter),
    205       method_(GET),
    206       suggestion_method_(GET),
    207       is_suggest_url_(false),
    208       derive_image_from_url_(false) {
    209   if (kElementNameToElementTypeMap == NULL)
    210     InitMapping();
    211 }
    212 
    213 // static
    214 void TemplateURLParsingContext::StartElementImpl(void* ctx,
    215                                                  const xmlChar* name,
    216                                                  const xmlChar** atts) {
    217   // Remove the namespace from |name|, ex: os:Url -> Url.
    218   std::string node_name(XMLCharToString(name));
    219   size_t index = node_name.find_first_of(":");
    220   if (index != std::string::npos)
    221     node_name.erase(0, index + 1);
    222 
    223   TemplateURLParsingContext* context =
    224       reinterpret_cast<TemplateURLParsingContext*>(ctx);
    225   context->elements_.push_back(
    226     context->kElementNameToElementTypeMap->count(node_name) ?
    227         (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
    228   switch (context->GetKnownType()) {
    229     case TemplateURLParsingContext::URL:
    230       context->extra_params_.clear();
    231       context->ParseURL(atts);
    232       break;
    233     case TemplateURLParsingContext::IMAGE:
    234       context->ParseImage(atts);
    235       break;
    236     case TemplateURLParsingContext::PARAM:
    237       context->ParseParam(atts);
    238       break;
    239     default:
    240       break;
    241   }
    242   context->string_.clear();
    243 }
    244 
    245 // static
    246 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
    247   TemplateURLParsingContext* context =
    248       reinterpret_cast<TemplateURLParsingContext*>(ctx);
    249   switch (context->GetKnownType()) {
    250     case TemplateURLParsingContext::SHORT_NAME:
    251       context->data_.short_name = context->string_;
    252       break;
    253     case TemplateURLParsingContext::IMAGE: {
    254       GURL image_url(UTF16ToUTF8(context->string_));
    255       if (image_url.SchemeIs(chrome::kDataScheme)) {
    256         // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
    257         // decode the data URL in the renderer. For now, we'll just point to the
    258         // favicon from the URL.
    259         context->derive_image_from_url_ = true;
    260       } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
    261                  (image_url.SchemeIs(chrome::kHttpScheme) ||
    262                   image_url.SchemeIs(chrome::kHttpsScheme))) {
    263         context->data_.favicon_url = image_url;
    264       }
    265       context->image_is_valid_for_favicon_ = false;
    266       break;
    267     }
    268     case TemplateURLParsingContext::INPUT_ENCODING: {
    269       std::string input_encoding = UTF16ToASCII(context->string_);
    270       if (IsValidEncodingString(input_encoding))
    271         context->data_.input_encodings.push_back(input_encoding);
    272       break;
    273     }
    274     case TemplateURLParsingContext::URL:
    275       context->ProcessURLParams();
    276       break;
    277     default:
    278       break;
    279   }
    280   context->string_.clear();
    281   context->elements_.pop_back();
    282 }
    283 
    284 // static
    285 void TemplateURLParsingContext::CharactersImpl(void* ctx,
    286                                                const xmlChar* ch,
    287                                                int len) {
    288   reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
    289       UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len));
    290 }
    291 
    292 TemplateURL* TemplateURLParsingContext::GetTemplateURL(
    293     Profile* profile,
    294     bool show_in_default_list) {
    295   // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
    296   if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() ||
    297       !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url))
    298     return NULL;
    299   if (suggestion_method_ == TemplateURLParsingContext::POST)
    300     data_.suggestions_url.clear();
    301 
    302   // If the image was a data URL, use the favicon from the search URL instead.
    303   // (see the TODO in EndElementImpl()).
    304   GURL search_url(data_.url());
    305   if (derive_image_from_url_ && data_.favicon_url.is_empty())
    306     data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
    307 
    308   data_.SetKeyword(TemplateURLService::GenerateKeyword(search_url));
    309   data_.show_in_default_list = show_in_default_list;
    310 
    311   // Bail if the search URL is empty or if either TemplateURLRef is invalid.
    312   scoped_ptr<TemplateURL> template_url(new TemplateURL(profile, data_));
    313   scoped_ptr<SearchTermsData> search_terms_data(profile ?
    314       new UIThreadSearchTermsData(profile) : new SearchTermsData());
    315   if (template_url->url().empty() ||
    316       !template_url->url_ref().IsValidUsingTermsData(*search_terms_data) ||
    317       (!template_url->suggestions_url().empty() &&
    318        !template_url->suggestions_url_ref().
    319            IsValidUsingTermsData(*search_terms_data))) {
    320     return NULL;
    321   }
    322 
    323   return template_url.release();
    324 }
    325 
    326 // static
    327 void TemplateURLParsingContext::InitMapping() {
    328   kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
    329   (*kElementNameToElementTypeMap)[kURLElement] = URL;
    330   (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
    331   (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
    332   (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
    333   (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
    334       OPEN_SEARCH_DESCRIPTION;
    335   (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
    336       OPEN_SEARCH_DESCRIPTION;
    337   (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING;
    338 }
    339 
    340 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
    341   if (!atts)
    342     return;
    343 
    344   std::string template_url;
    345   bool is_post = false;
    346   bool is_html_url = false;
    347   bool is_suggest_url = false;
    348   for (; *atts; atts += 2) {
    349     std::string name(XMLCharToString(*atts));
    350     const xmlChar* value = atts[1];
    351     if (name == kURLTypeAttribute) {
    352       std::string type = XMLCharToString(value);
    353       is_html_url = (type == kHTMLType);
    354       is_suggest_url = (type == kSuggestionType);
    355     } else if (name == kURLTemplateAttribute) {
    356       template_url = XMLCharToString(value);
    357     } else if (name == kParamMethodAttribute) {
    358       is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
    359     }
    360   }
    361 
    362   if (is_html_url && !template_url.empty()) {
    363     data_.SetURL(template_url);
    364     is_suggest_url_ = false;
    365     if (is_post)
    366       method_ = POST;
    367   } else if (is_suggest_url) {
    368     data_.suggestions_url = template_url;
    369     is_suggest_url_ = true;
    370     if (is_post)
    371       suggestion_method_ = POST;
    372   }
    373 }
    374 
    375 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
    376   if (!atts)
    377     return;
    378 
    379   int width = 0;
    380   int height = 0;
    381   std::string type;
    382   for (; *atts; atts += 2) {
    383     std::string name(XMLCharToString(*atts));
    384     const xmlChar* value = atts[1];
    385     if (name == kImageTypeAttribute) {
    386       type = XMLCharToString(value);
    387     } else if (name == kImageWidthAttribute) {
    388       base::StringToInt(XMLCharToString(value), &width);
    389     } else if (name == kImageHeightAttribute) {
    390       base::StringToInt(XMLCharToString(value), &height);
    391     }
    392   }
    393 
    394   image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
    395       (height == gfx::kFaviconSize) &&
    396       ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
    397 }
    398 
    399 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
    400   if (!atts)
    401     return;
    402 
    403   std::string key, value;
    404   for (; *atts; atts += 2) {
    405     std::string name(XMLCharToString(*atts));
    406     const xmlChar* val = atts[1];
    407     if (name == kParamNameAttribute) {
    408       key = XMLCharToString(val);
    409     } else if (name == kParamValueAttribute) {
    410       value = XMLCharToString(val);
    411     }
    412   }
    413 
    414   if (!key.empty() &&
    415       (!parameter_filter_ || parameter_filter_->KeepParameter(key, value)))
    416     extra_params_.push_back(Param(key, value));
    417 }
    418 
    419 void TemplateURLParsingContext::ProcessURLParams() {
    420   if (!parameter_filter_ && extra_params_.empty())
    421     return;
    422 
    423   GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
    424   if (url.is_empty())
    425     return;
    426 
    427   // If there is a parameter filter, parse the existing URL and remove any
    428   // unwanted parameter.
    429   std::string new_query;
    430   bool modified = false;
    431   if (parameter_filter_) {
    432     url_parse::Component query = url.parsed_for_possibly_invalid_spec().query;
    433     url_parse::Component key, value;
    434     const char* url_spec = url.spec().c_str();
    435     while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
    436       std::string key_str(url_spec, key.begin, key.len);
    437       std::string value_str(url_spec, value.begin, value.len);
    438       if (parameter_filter_->KeepParameter(key_str, value_str)) {
    439         AppendParamToQuery(key_str, value_str, &new_query);
    440       } else {
    441         modified = true;
    442       }
    443     }
    444   }
    445   if (!modified)
    446     new_query = url.query();
    447 
    448   // Add the extra parameters if any.
    449   if (!extra_params_.empty()) {
    450     modified = true;
    451     for (std::vector<Param>::const_iterator iter(extra_params_.begin());
    452          iter != extra_params_.end(); ++iter)
    453       AppendParamToQuery(iter->first, iter->second, &new_query);
    454   }
    455 
    456   if (modified) {
    457     GURL::Replacements repl;
    458     repl.SetQueryStr(new_query);
    459     url = url.ReplaceComponents(repl);
    460     if (is_suggest_url_)
    461       data_.suggestions_url = url.spec();
    462     else if (url.is_valid())
    463       data_.SetURL(url.spec());
    464   }
    465 }
    466 
    467 TemplateURLParsingContext::ElementType
    468     TemplateURLParsingContext::GetKnownType() {
    469   if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
    470     return elements_[1];
    471   // We only expect PARAM nodes under the URL node.
    472   return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
    473       elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
    474 }
    475 
    476 
    477 // TemplateURLParser ----------------------------------------------------------
    478 
    479 // static
    480 TemplateURL* TemplateURLParser::Parse(
    481     Profile* profile,
    482     bool show_in_default_list,
    483     const char* data,
    484     size_t length,
    485     TemplateURLParser::ParameterFilter* param_filter) {
    486   // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
    487   // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
    488   // If this becomes problematic we'll need to provide our own entity
    489   // type for &amp;, or strip out &#38; by hand after parsing.
    490   int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
    491   TemplateURLParsingContext context(param_filter);
    492   xmlSAXHandler sax_handler;
    493   memset(&sax_handler, 0, sizeof(sax_handler));
    494   sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
    495   sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
    496   sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
    497   int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
    498                                     static_cast<int>(length));
    499   xmlSubstituteEntitiesDefault(last_sub_entities_value);
    500 
    501   return error ? NULL : context.GetTemplateURL(profile, show_in_default_list);
    502 }
    503