1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/search_engines/template_url_parser.h" 6 7 #include <algorithm> 8 #include <map> 9 #include <vector> 10 11 #include "base/logging.h" 12 #include "base/memory/scoped_ptr.h" 13 #include "base/strings/string_number_conversions.h" 14 #include "base/strings/string_util.h" 15 #include "base/strings/utf_string_conversions.h" 16 #include "chrome/browser/search_engines/search_terms_data.h" 17 #include "chrome/browser/search_engines/template_url.h" 18 #include "chrome/browser/search_engines/template_url_service.h" 19 #include "chrome/common/url_constants.h" 20 #include "libxml/parser.h" 21 #include "libxml/xmlwriter.h" 22 #include "ui/gfx/favicon_size.h" 23 #include "url/gurl.h" 24 25 namespace { 26 27 // NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds 28 // to that of char, the following names are all in terms of char. This avoids 29 // having to convert to wide, then do comparisons. 30 31 // Defines for element names of the OSD document: 32 const char kURLElement[] = "Url"; 33 const char kParamElement[] = "Param"; 34 const char kShortNameElement[] = "ShortName"; 35 const char kImageElement[] = "Image"; 36 const char kOpenSearchDescriptionElement[] = "OpenSearchDescription"; 37 const char kFirefoxSearchDescriptionElement[] = "SearchPlugin"; 38 const char kInputEncodingElement[] = "InputEncoding"; 39 40 // Various XML attributes used. 41 const char kURLTypeAttribute[] = "type"; 42 const char kURLTemplateAttribute[] = "template"; 43 const char kImageTypeAttribute[] = "type"; 44 const char kImageWidthAttribute[] = "width"; 45 const char kImageHeightAttribute[] = "height"; 46 const char kParamNameAttribute[] = "name"; 47 const char kParamValueAttribute[] = "value"; 48 const char kParamMethodAttribute[] = "method"; 49 50 // Mime type for search results. 51 const char kHTMLType[] = "text/html"; 52 53 // Mime type for as you type suggestions. 54 const char kSuggestionType[] = "application/x-suggestions+json"; 55 56 // Namespace identifier. 57 const char kOSDNS[] = "xmlns"; 58 59 // The namespace for documents we understand. 60 const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/"; 61 62 std::string XMLCharToString(const xmlChar* value) { 63 return std::string(reinterpret_cast<const char*>(value)); 64 } 65 66 // Returns true if input_encoding contains a valid input encoding string. This 67 // doesn't verify that we have a valid encoding for the string, just that the 68 // string contains characters that constitute a valid input encoding. 69 bool IsValidEncodingString(const std::string& input_encoding) { 70 if (input_encoding.empty()) 71 return false; 72 73 if (!IsAsciiAlpha(input_encoding[0])) 74 return false; 75 76 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) { 77 char c = input_encoding[i]; 78 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' && 79 c != '-') { 80 return false; 81 } 82 } 83 return true; 84 } 85 86 void AppendParamToQuery(const std::string& key, 87 const std::string& value, 88 std::string* query) { 89 if (!query->empty()) 90 query->append("&"); 91 if (!key.empty()) { 92 query->append(key); 93 query->append("="); 94 } 95 query->append(value); 96 } 97 98 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S]. 99 bool IsHTTPRef(const std::string& url) { 100 if (url.empty()) 101 return true; 102 GURL gurl(url); 103 return gurl.is_valid() && (gurl.SchemeIs(chrome::kHttpScheme) || 104 gurl.SchemeIs(chrome::kHttpsScheme)); 105 } 106 107 } // namespace 108 109 110 // TemplateURLParsingContext -------------------------------------------------- 111 112 // To minimize memory overhead while parsing, a SAX style parser is used. 113 // TemplateURLParsingContext is used to maintain the state we're in the document 114 // while parsing. 115 class TemplateURLParsingContext { 116 public: 117 // Enum of the known element types. 118 enum ElementType { 119 UNKNOWN, 120 OPEN_SEARCH_DESCRIPTION, 121 URL, 122 PARAM, 123 SHORT_NAME, 124 IMAGE, 125 INPUT_ENCODING, 126 }; 127 128 enum Method { 129 GET, 130 POST 131 }; 132 133 // Key/value of a Param node. 134 typedef std::pair<std::string, std::string> Param; 135 136 explicit TemplateURLParsingContext( 137 TemplateURLParser::ParameterFilter* parameter_filter); 138 139 static void StartElementImpl(void* ctx, 140 const xmlChar* name, 141 const xmlChar** atts); 142 static void EndElementImpl(void* ctx, const xmlChar* name); 143 static void CharactersImpl(void* ctx, const xmlChar* ch, int len); 144 145 // Returns a heap-allocated TemplateURL representing the result of parsing. 146 // This will be NULL if parsing failed or if the results were invalid for some 147 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied, 148 // a resulting TemplateURLRef was invalid, etc.). 149 TemplateURL* GetTemplateURL(Profile* profile, bool show_in_default_list); 150 151 private: 152 // Key is UTF8 encoded. 153 typedef std::map<std::string, ElementType> ElementNameToElementTypeMap; 154 155 static void InitMapping(); 156 157 void ParseURL(const xmlChar** atts); 158 void ParseImage(const xmlChar** atts); 159 void ParseParam(const xmlChar** atts); 160 void ProcessURLParams(); 161 162 // Returns the current ElementType. 163 ElementType GetKnownType(); 164 165 static ElementNameToElementTypeMap* kElementNameToElementTypeMap; 166 167 // Data that gets updated as we parse, and is converted to a TemplateURL by 168 // GetTemplateURL(). 169 TemplateURLData data_; 170 171 std::vector<ElementType> elements_; 172 bool image_is_valid_for_favicon_; 173 174 // Character content for the current element. 175 string16 string_; 176 177 TemplateURLParser::ParameterFilter* parameter_filter_; 178 179 // The list of parameters parsed in the Param nodes of a Url node. 180 std::vector<Param> extra_params_; 181 182 // The HTTP methods used. 183 Method method_; 184 Method suggestion_method_; 185 186 // If true, we are currently parsing a suggest URL, otherwise it is an HTML 187 // search. Note that we don't need a stack as URL nodes cannot be nested. 188 bool is_suggest_url_; 189 190 // Whether we should derive the image from the URL (when images are data 191 // URLs). 192 bool derive_image_from_url_; 193 194 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext); 195 }; 196 197 // static 198 TemplateURLParsingContext::ElementNameToElementTypeMap* 199 TemplateURLParsingContext::kElementNameToElementTypeMap = NULL; 200 201 TemplateURLParsingContext::TemplateURLParsingContext( 202 TemplateURLParser::ParameterFilter* parameter_filter) 203 : image_is_valid_for_favicon_(false), 204 parameter_filter_(parameter_filter), 205 method_(GET), 206 suggestion_method_(GET), 207 is_suggest_url_(false), 208 derive_image_from_url_(false) { 209 if (kElementNameToElementTypeMap == NULL) 210 InitMapping(); 211 } 212 213 // static 214 void TemplateURLParsingContext::StartElementImpl(void* ctx, 215 const xmlChar* name, 216 const xmlChar** atts) { 217 // Remove the namespace from |name|, ex: os:Url -> Url. 218 std::string node_name(XMLCharToString(name)); 219 size_t index = node_name.find_first_of(":"); 220 if (index != std::string::npos) 221 node_name.erase(0, index + 1); 222 223 TemplateURLParsingContext* context = 224 reinterpret_cast<TemplateURLParsingContext*>(ctx); 225 context->elements_.push_back( 226 context->kElementNameToElementTypeMap->count(node_name) ? 227 (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN); 228 switch (context->GetKnownType()) { 229 case TemplateURLParsingContext::URL: 230 context->extra_params_.clear(); 231 context->ParseURL(atts); 232 break; 233 case TemplateURLParsingContext::IMAGE: 234 context->ParseImage(atts); 235 break; 236 case TemplateURLParsingContext::PARAM: 237 context->ParseParam(atts); 238 break; 239 default: 240 break; 241 } 242 context->string_.clear(); 243 } 244 245 // static 246 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) { 247 TemplateURLParsingContext* context = 248 reinterpret_cast<TemplateURLParsingContext*>(ctx); 249 switch (context->GetKnownType()) { 250 case TemplateURLParsingContext::SHORT_NAME: 251 context->data_.short_name = context->string_; 252 break; 253 case TemplateURLParsingContext::IMAGE: { 254 GURL image_url(UTF16ToUTF8(context->string_)); 255 if (image_url.SchemeIs(chrome::kDataScheme)) { 256 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to 257 // decode the data URL in the renderer. For now, we'll just point to the 258 // favicon from the URL. 259 context->derive_image_from_url_ = true; 260 } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() && 261 (image_url.SchemeIs(chrome::kHttpScheme) || 262 image_url.SchemeIs(chrome::kHttpsScheme))) { 263 context->data_.favicon_url = image_url; 264 } 265 context->image_is_valid_for_favicon_ = false; 266 break; 267 } 268 case TemplateURLParsingContext::INPUT_ENCODING: { 269 std::string input_encoding = UTF16ToASCII(context->string_); 270 if (IsValidEncodingString(input_encoding)) 271 context->data_.input_encodings.push_back(input_encoding); 272 break; 273 } 274 case TemplateURLParsingContext::URL: 275 context->ProcessURLParams(); 276 break; 277 default: 278 break; 279 } 280 context->string_.clear(); 281 context->elements_.pop_back(); 282 } 283 284 // static 285 void TemplateURLParsingContext::CharactersImpl(void* ctx, 286 const xmlChar* ch, 287 int len) { 288 reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ += 289 UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len)); 290 } 291 292 TemplateURL* TemplateURLParsingContext::GetTemplateURL( 293 Profile* profile, 294 bool show_in_default_list) { 295 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107 296 if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() || 297 !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url)) 298 return NULL; 299 if (suggestion_method_ == TemplateURLParsingContext::POST) 300 data_.suggestions_url.clear(); 301 302 // If the image was a data URL, use the favicon from the search URL instead. 303 // (see the TODO in EndElementImpl()). 304 GURL search_url(data_.url()); 305 if (derive_image_from_url_ && data_.favicon_url.is_empty()) 306 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url); 307 308 data_.SetKeyword(TemplateURLService::GenerateKeyword(search_url)); 309 data_.show_in_default_list = show_in_default_list; 310 311 // Bail if the search URL is empty or if either TemplateURLRef is invalid. 312 scoped_ptr<TemplateURL> template_url(new TemplateURL(profile, data_)); 313 scoped_ptr<SearchTermsData> search_terms_data(profile ? 314 new UIThreadSearchTermsData(profile) : new SearchTermsData()); 315 if (template_url->url().empty() || 316 !template_url->url_ref().IsValidUsingTermsData(*search_terms_data) || 317 (!template_url->suggestions_url().empty() && 318 !template_url->suggestions_url_ref(). 319 IsValidUsingTermsData(*search_terms_data))) { 320 return NULL; 321 } 322 323 return template_url.release(); 324 } 325 326 // static 327 void TemplateURLParsingContext::InitMapping() { 328 kElementNameToElementTypeMap = new std::map<std::string, ElementType>; 329 (*kElementNameToElementTypeMap)[kURLElement] = URL; 330 (*kElementNameToElementTypeMap)[kParamElement] = PARAM; 331 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME; 332 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE; 333 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] = 334 OPEN_SEARCH_DESCRIPTION; 335 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] = 336 OPEN_SEARCH_DESCRIPTION; 337 (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING; 338 } 339 340 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) { 341 if (!atts) 342 return; 343 344 std::string template_url; 345 bool is_post = false; 346 bool is_html_url = false; 347 bool is_suggest_url = false; 348 for (; *atts; atts += 2) { 349 std::string name(XMLCharToString(*atts)); 350 const xmlChar* value = atts[1]; 351 if (name == kURLTypeAttribute) { 352 std::string type = XMLCharToString(value); 353 is_html_url = (type == kHTMLType); 354 is_suggest_url = (type == kSuggestionType); 355 } else if (name == kURLTemplateAttribute) { 356 template_url = XMLCharToString(value); 357 } else if (name == kParamMethodAttribute) { 358 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post"); 359 } 360 } 361 362 if (is_html_url && !template_url.empty()) { 363 data_.SetURL(template_url); 364 is_suggest_url_ = false; 365 if (is_post) 366 method_ = POST; 367 } else if (is_suggest_url) { 368 data_.suggestions_url = template_url; 369 is_suggest_url_ = true; 370 if (is_post) 371 suggestion_method_ = POST; 372 } 373 } 374 375 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) { 376 if (!atts) 377 return; 378 379 int width = 0; 380 int height = 0; 381 std::string type; 382 for (; *atts; atts += 2) { 383 std::string name(XMLCharToString(*atts)); 384 const xmlChar* value = atts[1]; 385 if (name == kImageTypeAttribute) { 386 type = XMLCharToString(value); 387 } else if (name == kImageWidthAttribute) { 388 base::StringToInt(XMLCharToString(value), &width); 389 } else if (name == kImageHeightAttribute) { 390 base::StringToInt(XMLCharToString(value), &height); 391 } 392 } 393 394 image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) && 395 (height == gfx::kFaviconSize) && 396 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon")); 397 } 398 399 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) { 400 if (!atts) 401 return; 402 403 std::string key, value; 404 for (; *atts; atts += 2) { 405 std::string name(XMLCharToString(*atts)); 406 const xmlChar* val = atts[1]; 407 if (name == kParamNameAttribute) { 408 key = XMLCharToString(val); 409 } else if (name == kParamValueAttribute) { 410 value = XMLCharToString(val); 411 } 412 } 413 414 if (!key.empty() && 415 (!parameter_filter_ || parameter_filter_->KeepParameter(key, value))) 416 extra_params_.push_back(Param(key, value)); 417 } 418 419 void TemplateURLParsingContext::ProcessURLParams() { 420 if (!parameter_filter_ && extra_params_.empty()) 421 return; 422 423 GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url()); 424 if (url.is_empty()) 425 return; 426 427 // If there is a parameter filter, parse the existing URL and remove any 428 // unwanted parameter. 429 std::string new_query; 430 bool modified = false; 431 if (parameter_filter_) { 432 url_parse::Component query = url.parsed_for_possibly_invalid_spec().query; 433 url_parse::Component key, value; 434 const char* url_spec = url.spec().c_str(); 435 while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) { 436 std::string key_str(url_spec, key.begin, key.len); 437 std::string value_str(url_spec, value.begin, value.len); 438 if (parameter_filter_->KeepParameter(key_str, value_str)) { 439 AppendParamToQuery(key_str, value_str, &new_query); 440 } else { 441 modified = true; 442 } 443 } 444 } 445 if (!modified) 446 new_query = url.query(); 447 448 // Add the extra parameters if any. 449 if (!extra_params_.empty()) { 450 modified = true; 451 for (std::vector<Param>::const_iterator iter(extra_params_.begin()); 452 iter != extra_params_.end(); ++iter) 453 AppendParamToQuery(iter->first, iter->second, &new_query); 454 } 455 456 if (modified) { 457 GURL::Replacements repl; 458 repl.SetQueryStr(new_query); 459 url = url.ReplaceComponents(repl); 460 if (is_suggest_url_) 461 data_.suggestions_url = url.spec(); 462 else if (url.is_valid()) 463 data_.SetURL(url.spec()); 464 } 465 } 466 467 TemplateURLParsingContext::ElementType 468 TemplateURLParsingContext::GetKnownType() { 469 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION) 470 return elements_[1]; 471 // We only expect PARAM nodes under the URL node. 472 return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION && 473 elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN; 474 } 475 476 477 // TemplateURLParser ---------------------------------------------------------- 478 479 // static 480 TemplateURL* TemplateURLParser::Parse( 481 Profile* profile, 482 bool show_in_default_list, 483 const char* data, 484 size_t length, 485 TemplateURLParser::ParameterFilter* param_filter) { 486 // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to 487 // & . Unfortunately xmlSubstituteEntitiesDefault affects global state. 488 // If this becomes problematic we'll need to provide our own entity 489 // type for &, or strip out & by hand after parsing. 490 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1); 491 TemplateURLParsingContext context(param_filter); 492 xmlSAXHandler sax_handler; 493 memset(&sax_handler, 0, sizeof(sax_handler)); 494 sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl; 495 sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl; 496 sax_handler.characters = &TemplateURLParsingContext::CharactersImpl; 497 int error = xmlSAXUserParseMemory(&sax_handler, &context, data, 498 static_cast<int>(length)); 499 xmlSubstituteEntitiesDefault(last_sub_entities_value); 500 501 return error ? NULL : context.GetTemplateURL(profile, show_in_default_list); 502 } 503