Home | History | Annotate | Download | only in search_engines
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/search_engines/template_url_parser.h"
      6 
      7 #include <algorithm>
      8 #include <map>
      9 #include <vector>
     10 
     11 #include "base/logging.h"
     12 #include "base/memory/scoped_ptr.h"
     13 #include "base/string_number_conversions.h"
     14 #include "base/string_util.h"
     15 #include "base/utf_string_conversions.h"
     16 #include "chrome/browser/search_engines/template_url.h"
     17 #include "chrome/common/url_constants.h"
     18 #include "googleurl/src/gurl.h"
     19 #include "libxml/parser.h"
     20 #include "libxml/xmlwriter.h"
     21 
     22 namespace {
     23 
     24 //
     25 // NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
     26 // to that of char, the following names are all in terms of char. This avoids
     27 // having to convert to wide, then do comparisons
     28 
     29 // Defines for element names of the OSD document:
     30 static const char kURLElement[] = "Url";
     31 static const char kParamElement[] = "Param";
     32 static const char kShortNameElement[] = "ShortName";
     33 static const char kDescriptionElement[] = "Description";
     34 static const char kImageElement[] = "Image";
     35 static const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
     36 static const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
     37 static const char kLanguageElement[] = "Language";
     38 static const char kInputEncodingElement[] = "InputEncoding";
     39 
     40 // Various XML attributes used.
     41 static const char kURLTypeAttribute[] = "type";
     42 static const char kURLTemplateAttribute[] = "template";
     43 static const char kImageTypeAttribute[] = "type";
     44 static const char kImageWidthAttribute[] = "width";
     45 static const char kImageHeightAttribute[] = "height";
     46 static const char kURLIndexOffsetAttribute[] = "indexOffset";
     47 static const char kURLPageOffsetAttribute[] = "pageOffset";
     48 static const char kParamNameAttribute[] = "name";
     49 static const char kParamValueAttribute[] = "value";
     50 static const char kParamMethodAttribute[] = "method";
     51 
     52 // Mime type for search results.
     53 static const char kHTMLType[] = "text/html";
     54 
     55 // Mime type for as you type suggestions.
     56 static const char kSuggestionType[] = "application/x-suggestions+json";
     57 
     58 // Namespace identifier.
     59 static const char kOSDNS[] = "xmlns";
     60 
     61 // The namespace for documents we understand.
     62 static const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/";
     63 
     64 // Removes the namespace from the specified |name|, ex: os:Url -> Url.
     65 static void PruneNamespace(std::string* name) {
     66   size_t index = name->find_first_of(":");
     67   if (index != std::string::npos)
     68     name->erase(0, index + 1);
     69 }
     70 
     71 //
     72 // To minimize memory overhead while parsing, a SAX style parser is used.
     73 // ParsingContext is used to maintain the state we're in the document
     74 // while parsing.
     75 class ParsingContext {
     76  public:
     77   // Enum of the known element types.
     78   enum ElementType {
     79     UNKNOWN,
     80     OPEN_SEARCH_DESCRIPTION,
     81     URL,
     82     PARAM,
     83     SHORT_NAME,
     84     DESCRIPTION,
     85     IMAGE,
     86     LANGUAGE,
     87     INPUT_ENCODING,
     88   };
     89 
     90   enum Method {
     91     GET,
     92     POST
     93   };
     94 
     95   // Key/value of a Param node.
     96   typedef std::pair<std::string, std::string> Param;
     97 
     98   ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter,
     99                  TemplateURL* url)
    100       : url_(url),
    101         parameter_filter_(parameter_filter),
    102         method_(GET),
    103         suggestion_method_(GET),
    104         is_suggest_url_(false),
    105         derive_image_from_url_(false) {
    106     if (kElementNameToElementTypeMap == NULL)
    107       InitMapping();
    108   }
    109 
    110   // Invoked when an element starts.
    111   void PushElement(const std::string& element) {
    112     ElementType type;
    113     if (kElementNameToElementTypeMap->find(element) ==
    114         kElementNameToElementTypeMap->end()) {
    115       type = UNKNOWN;
    116     } else {
    117       type = (*kElementNameToElementTypeMap)[element];
    118     }
    119     elements_.push_back(type);
    120   }
    121 
    122   void PopElement() {
    123     elements_.pop_back();
    124   }
    125 
    126   // Returns the current ElementType.
    127   ElementType GetKnownType() {
    128     if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
    129       return elements_[1];
    130 
    131     // We only expect PARAM nodes under the Url node
    132     if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
    133         elements_[1] == URL && elements_[2] == PARAM)
    134       return PARAM;
    135 
    136     return UNKNOWN;
    137   }
    138 
    139   TemplateURL* template_url() { return url_; }
    140 
    141   void AddImageRef(const std::string& type, int width, int height) {
    142     if (width > 0 && height > 0)
    143       current_image_.reset(new TemplateURL::ImageRef(type, width, height));
    144   }
    145 
    146   void EndImage() {
    147     current_image_.reset();
    148   }
    149 
    150   void SetImageURL(const GURL& url) {
    151     if (current_image_.get()) {
    152       current_image_->url = url;
    153       url_->add_image_ref(*current_image_);
    154       current_image_.reset();
    155     }
    156   }
    157 
    158   void ResetString() {
    159     string_.clear();
    160   }
    161 
    162   void AppendString(const string16& string) {
    163     string_ += string;
    164   }
    165 
    166   const string16& GetString() {
    167     return string_;
    168   }
    169 
    170   void ResetExtraParams() {
    171     extra_params_.clear();
    172   }
    173 
    174   void AddExtraParams(const std::string& key, const std::string& value) {
    175     if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value))
    176       return;
    177     extra_params_.push_back(Param(key, value));
    178   }
    179 
    180   const std::vector<Param>& extra_params() const { return extra_params_; }
    181 
    182   void set_is_suggestion(bool value) { is_suggest_url_ = value; }
    183   bool is_suggestion() const { return is_suggest_url_; }
    184 
    185   TemplateURLParser::ParameterFilter* parameter_filter() const {
    186     return parameter_filter_;
    187   }
    188 
    189   void set_derive_image_from_url(bool derive_image_from_url) {
    190     derive_image_from_url_ = derive_image_from_url;
    191   }
    192 
    193   void set_method(Method method) { method_ = method; }
    194   Method method() { return method_; }
    195 
    196   void set_suggestion_method(Method method) { suggestion_method_ = method; }
    197   Method suggestion_method() { return suggestion_method_; }
    198 
    199   // Builds the image URL from the Template search URL if no image URL has been
    200   // set.
    201   void DeriveImageFromURL() {
    202     if (derive_image_from_url_ &&
    203         url_->GetFaviconURL().is_empty() && url_->url()) {
    204       GURL url(url_->url()->url());  // More url's please...
    205       url_->SetFaviconURL(TemplateURL::GenerateFaviconURL(url));
    206     }
    207   }
    208 
    209  private:
    210   static void InitMapping() {
    211     kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
    212     (*kElementNameToElementTypeMap)[kURLElement] = URL;
    213     (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
    214     (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
    215     (*kElementNameToElementTypeMap)[kDescriptionElement] = DESCRIPTION;
    216     (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
    217     (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
    218         OPEN_SEARCH_DESCRIPTION;
    219     (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
    220         OPEN_SEARCH_DESCRIPTION;
    221     (*kElementNameToElementTypeMap)[kLanguageElement] =
    222         LANGUAGE;
    223     (*kElementNameToElementTypeMap)[kInputEncodingElement] =
    224         INPUT_ENCODING;
    225   }
    226 
    227   // Key is UTF8 encoded.
    228   static std::map<std::string, ElementType>* kElementNameToElementTypeMap;
    229   // TemplateURL supplied to Read method. It's owned by the caller, so we
    230   // don't need to free it.
    231   TemplateURL* url_;
    232   std::vector<ElementType> elements_;
    233   scoped_ptr<TemplateURL::ImageRef> current_image_;
    234 
    235   // Character content for the current element.
    236   string16 string_;
    237 
    238   TemplateURLParser::ParameterFilter* parameter_filter_;
    239 
    240   // The list of parameters parsed in the Param nodes of a Url node.
    241   std::vector<Param> extra_params_;
    242 
    243   // The HTTP methods used.
    244   Method method_;
    245   Method suggestion_method_;
    246 
    247   // If true, we are currently parsing a suggest URL, otherwise it is an HTML
    248   // search.  Note that we don't need a stack as Url nodes cannot be nested.
    249   bool is_suggest_url_;
    250 
    251   // Whether we should derive the image from the URL (when images are data
    252   // URLs).
    253   bool derive_image_from_url_;
    254 
    255   DISALLOW_COPY_AND_ASSIGN(ParsingContext);
    256 };
    257 
    258 // static
    259 std::map<std::string, ParsingContext::ElementType>*
    260     ParsingContext::kElementNameToElementTypeMap = NULL;
    261 
    262 string16 XMLCharToUTF16(const xmlChar* value, int length) {
    263   return UTF8ToUTF16(std::string((const char*)value, length));
    264 }
    265 
    266 std::string XMLCharToString(const xmlChar* value) {
    267   return std::string((const char*)value);
    268 }
    269 
    270 // Returns true if input_encoding contains a valid input encoding string. This
    271 // doesn't verify that we have a valid encoding for the string, just that the
    272 // string contains characters that constitute a valid input encoding.
    273 bool IsValidEncodingString(const std::string& input_encoding) {
    274   if (input_encoding.empty())
    275     return false;
    276 
    277   if (!IsAsciiAlpha(input_encoding[0]))
    278     return false;
    279 
    280   for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
    281     char c = input_encoding[i];
    282     if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
    283         c != '-') {
    284       return false;
    285     }
    286   }
    287   return true;
    288 }
    289 
    290 void ParseURL(const xmlChar** atts, ParsingContext* context) {
    291   if (!atts)
    292     return;
    293 
    294   TemplateURL* turl = context->template_url();
    295   const xmlChar** attributes = atts;
    296   std::string template_url;
    297   bool is_post = false;
    298   bool is_html_url = false;
    299   bool is_suggest_url = false;
    300   int index_offset = 1;
    301   int page_offset = 1;
    302 
    303   while (*attributes) {
    304     std::string name(XMLCharToString(*attributes));
    305     const xmlChar* value = attributes[1];
    306     if (name == kURLTypeAttribute) {
    307       std::string type = XMLCharToString(value);
    308       is_html_url = (type == kHTMLType);
    309       is_suggest_url = (type == kSuggestionType);
    310     } else if (name == kURLTemplateAttribute) {
    311       template_url = XMLCharToString(value);
    312     } else if (name == kURLIndexOffsetAttribute) {
    313       base::StringToInt(XMLCharToString(value), &index_offset);
    314       index_offset = std::max(1, index_offset);
    315     } else if (name == kURLPageOffsetAttribute) {
    316       base::StringToInt(XMLCharToString(value), &page_offset);
    317       page_offset = std::max(1, page_offset);
    318     } else if (name == kParamMethodAttribute) {
    319       is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
    320     }
    321     attributes += 2;
    322   }
    323   if (is_html_url) {
    324     turl->SetURL(template_url, index_offset, page_offset);
    325     context->set_is_suggestion(false);
    326     if (is_post)
    327       context->set_method(ParsingContext::POST);
    328   } else if (is_suggest_url) {
    329     turl->SetSuggestionsURL(template_url, index_offset, page_offset);
    330     context->set_is_suggestion(true);
    331     if (is_post)
    332       context->set_suggestion_method(ParsingContext::POST);
    333   }
    334 }
    335 
    336 void ParseImage(const xmlChar** atts, ParsingContext* context) {
    337   if (!atts)
    338     return;
    339 
    340   const xmlChar** attributes = atts;
    341   int width = 0;
    342   int height = 0;
    343   std::string type;
    344   while (*attributes) {
    345     std::string name(XMLCharToString(*attributes));
    346     const xmlChar* value = attributes[1];
    347     if (name == kImageTypeAttribute) {
    348       type = XMLCharToString(value);
    349     } else if (name == kImageWidthAttribute) {
    350       base::StringToInt(XMLCharToString(value), &width);
    351     } else if (name == kImageHeightAttribute) {
    352       base::StringToInt(XMLCharToString(value), &height);
    353     }
    354     attributes += 2;
    355   }
    356   if (width > 0 && height > 0 && !type.empty()) {
    357     // Valid Image URL.
    358     context->AddImageRef(type, width, height);
    359   }
    360 }
    361 
    362 void ParseParam(const xmlChar** atts, ParsingContext* context) {
    363   if (!atts)
    364     return;
    365 
    366   const xmlChar** attributes = atts;
    367   std::string key, value;
    368   while (*attributes) {
    369     std::string name(XMLCharToString(*attributes));
    370     const xmlChar* val = attributes[1];
    371     if (name == kParamNameAttribute) {
    372       key = XMLCharToString(val);
    373     } else if (name == kParamValueAttribute) {
    374       value = XMLCharToString(val);
    375     }
    376     attributes += 2;
    377   }
    378   if (!key.empty())
    379     context->AddExtraParams(key, value);
    380 }
    381 
    382 static void AppendParamToQuery(const std::string& key,
    383                                const std::string& value,
    384                                std::string* query) {
    385   if (!query->empty())
    386     query->append("&");
    387   if (!key.empty()) {
    388     query->append(key);
    389     query->append("=");
    390   }
    391   query->append(value);
    392 }
    393 
    394 void ProcessURLParams(ParsingContext* context) {
    395   TemplateURL* t_url = context->template_url();
    396   const TemplateURLRef* t_url_ref =
    397       context->is_suggestion() ? t_url->suggestions_url() :
    398                                  t_url->url();
    399   if (!t_url_ref)
    400     return;
    401 
    402   if (!context->parameter_filter() && context->extra_params().empty())
    403     return;
    404 
    405   GURL url(t_url_ref->url());
    406   // If there is a parameter filter, parse the existing URL and remove any
    407   // unwanted parameter.
    408   TemplateURLParser::ParameterFilter* filter = context->parameter_filter();
    409   std::string new_query;
    410   bool modified = false;
    411   if (filter) {
    412     url_parse::Component query = url.parsed_for_possibly_invalid_spec().query;
    413     url_parse::Component key, value;
    414     const char* url_spec = url.spec().c_str();
    415     while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
    416       std::string key_str(url_spec, key.begin, key.len);
    417       std::string value_str(url_spec, value.begin, value.len);
    418       if (filter->KeepParameter(key_str, value_str)) {
    419         AppendParamToQuery(key_str, value_str, &new_query);
    420       } else {
    421         modified = true;
    422       }
    423     }
    424   }
    425   if (!modified)
    426     new_query = url.query();
    427 
    428   // Add the extra parameters if any.
    429   const std::vector<ParsingContext::Param>& params = context->extra_params();
    430   if (!params.empty()) {
    431     modified = true;
    432     std::vector<ParsingContext::Param>::const_iterator iter;
    433     for (iter = params.begin(); iter != params.end(); ++iter)
    434       AppendParamToQuery(iter->first, iter->second, &new_query);
    435   }
    436 
    437   if (modified) {
    438     GURL::Replacements repl;
    439     repl.SetQueryStr(new_query);
    440     url = url.ReplaceComponents(repl);
    441     if (context->is_suggestion()) {
    442       t_url->SetSuggestionsURL(url.spec(),
    443                                t_url_ref->index_offset(),
    444                                t_url_ref->page_offset());
    445     } else {
    446       t_url->SetURL(url.spec(),
    447                     t_url_ref->index_offset(),
    448                     t_url_ref->page_offset());
    449     }
    450   }
    451 }
    452 
    453 void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) {
    454   ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
    455   std::string node_name((const char*)name);
    456   PruneNamespace(&node_name);
    457   context->PushElement(node_name);
    458   switch (context->GetKnownType()) {
    459     case ParsingContext::URL:
    460       context->ResetExtraParams();
    461       ParseURL(atts, context);
    462       break;
    463     case ParsingContext::IMAGE:
    464       ParseImage(atts, context);
    465       break;
    466     case ParsingContext::PARAM:
    467       ParseParam(atts, context);
    468       break;
    469     default:
    470       break;
    471   }
    472   context->ResetString();
    473 }
    474 
    475 void EndElementImpl(void *ctx, const xmlChar *name) {
    476   ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
    477   switch (context->GetKnownType()) {
    478     case ParsingContext::SHORT_NAME:
    479       context->template_url()->set_short_name(context->GetString());
    480       break;
    481     case ParsingContext::DESCRIPTION:
    482       context->template_url()->set_description(context->GetString());
    483       break;
    484     case ParsingContext::IMAGE: {
    485       GURL image_url(UTF16ToUTF8(context->GetString()));
    486       if (image_url.SchemeIs(chrome::kDataScheme)) {
    487         // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
    488         // decode the data URL in the renderer. For now, we'll just point to the
    489         // favicon from the URL.
    490         context->set_derive_image_from_url(true);
    491       } else {
    492         context->SetImageURL(image_url);
    493       }
    494       context->EndImage();
    495       break;
    496     }
    497     case ParsingContext::LANGUAGE:
    498       context->template_url()->add_language(context->GetString());
    499       break;
    500     case ParsingContext::INPUT_ENCODING: {
    501       std::string input_encoding = UTF16ToASCII(context->GetString());
    502       if (IsValidEncodingString(input_encoding))
    503         context->template_url()->add_input_encoding(input_encoding);
    504       break;
    505     }
    506     case ParsingContext::URL:
    507       ProcessURLParams(context);
    508       break;
    509     default:
    510       break;
    511   }
    512   context->ResetString();
    513   context->PopElement();
    514 }
    515 
    516 void CharactersImpl(void *ctx, const xmlChar *ch, int len) {
    517   ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
    518   context->AppendString(XMLCharToUTF16(ch, len));
    519 }
    520 
    521 // Returns true if the ref is null, or the url wrapped by ref is
    522 // valid with a spec of http/https.
    523 bool IsHTTPRef(const TemplateURLRef* ref) {
    524   if (ref == NULL)
    525     return true;
    526   GURL url(ref->url());
    527   return (url.is_valid() && (url.SchemeIs(chrome::kHttpScheme) ||
    528                              url.SchemeIs(chrome::kHttpsScheme)));
    529 }
    530 
    531 // Returns true if the TemplateURL is legal. A legal TemplateURL is one
    532 // where all URLs have a spec of http/https.
    533 bool IsLegal(TemplateURL* url) {
    534   if (!IsHTTPRef(url->url()) || !IsHTTPRef(url->suggestions_url()))
    535     return false;
    536   // Make sure all the image refs are legal.
    537   const std::vector<TemplateURL::ImageRef>& image_refs = url->image_refs();
    538   for (size_t i = 0; i < image_refs.size(); i++) {
    539     GURL image_url(image_refs[i].url);
    540     if (!image_url.is_valid() ||
    541         !(image_url.SchemeIs(chrome::kHttpScheme) ||
    542           image_url.SchemeIs(chrome::kHttpsScheme))) {
    543       return false;
    544     }
    545   }
    546   return true;
    547 }
    548 
    549 }  // namespace
    550 
    551 // static
    552 bool TemplateURLParser::Parse(const unsigned char* data, size_t length,
    553                               TemplateURLParser::ParameterFilter* param_filter,
    554                               TemplateURL* url) {
    555   DCHECK(url);
    556   // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
    557   // &#38; . Unfortunately xmlSubstituteEntitiesDefault effects global state.
    558   // If this becomes problematic we'll need to provide our own entity
    559   // type for &amp;, or strip out &#34; by hand after parsing.
    560   int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
    561   ParsingContext context(param_filter, url);
    562   xmlSAXHandler sax_handler;
    563   memset(&sax_handler, 0, sizeof(sax_handler));
    564   sax_handler.startElement = &StartElementImpl;
    565   sax_handler.endElement = &EndElementImpl;
    566   sax_handler.characters = &CharactersImpl;
    567   xmlSAXUserParseMemory(&sax_handler, &context,
    568                         reinterpret_cast<const char*>(data),
    569                         static_cast<int>(length));
    570   xmlSubstituteEntitiesDefault(last_sub_entities_value);
    571   // If the image was a data URL, use the favicon from the search URL instead.
    572   // (see TODO inEndElementImpl()).
    573   context.DeriveImageFromURL();
    574 
    575   // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines
    576   //                that use POST yet.
    577   if (context.method() == ParsingContext::POST)
    578     return false;
    579   if (context.suggestion_method() == ParsingContext::POST)
    580     url->SetSuggestionsURL("", 0, 0);
    581 
    582   if (!url->short_name().empty() && !url->description().empty()) {
    583     // So far so good, make sure the urls are http.
    584     return IsLegal(url);
    585   }
    586   return false;
    587 }
    588