// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/search_engines/template_url_parser.h"

#include <algorithm>
#include <map>
#include <vector>

#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/string_number_conversions.h"
#include "base/string_util.h"
#include "base/utf_string_conversions.h"
#include "chrome/browser/search_engines/template_url.h"
#include "chrome/common/url_constants.h"
#include "googleurl/src/gurl.h"
#include "libxml/parser.h"
#include "libxml/xmlwriter.h"

namespace {

//
// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
// to that of char, the following names are all in terms of char. This avoids
// having to convert to wide and then do the comparisons.

// Element names of the OSD (OpenSearch description) document. Names are
// compared after any namespace prefix has been stripped.
static const char kURLElement[] = "Url";
static const char kParamElement[] = "Param";
static const char kShortNameElement[] = "ShortName";
static const char kDescriptionElement[] = "Description";
static const char kImageElement[] = "Image";
static const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
static const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
static const char kLanguageElement[] = "Language";
static const char kInputEncodingElement[] = "InputEncoding";

// Various XML attributes used.
41 static const char kURLTypeAttribute[] = "type"; 42 static const char kURLTemplateAttribute[] = "template"; 43 static const char kImageTypeAttribute[] = "type"; 44 static const char kImageWidthAttribute[] = "width"; 45 static const char kImageHeightAttribute[] = "height"; 46 static const char kURLIndexOffsetAttribute[] = "indexOffset"; 47 static const char kURLPageOffsetAttribute[] = "pageOffset"; 48 static const char kParamNameAttribute[] = "name"; 49 static const char kParamValueAttribute[] = "value"; 50 static const char kParamMethodAttribute[] = "method"; 51 52 // Mime type for search results. 53 static const char kHTMLType[] = "text/html"; 54 55 // Mime type for as you type suggestions. 56 static const char kSuggestionType[] = "application/x-suggestions+json"; 57 58 // Namespace identifier. 59 static const char kOSDNS[] = "xmlns"; 60 61 // The namespace for documents we understand. 62 static const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/"; 63 64 // Removes the namespace from the specified |name|, ex: os:Url -> Url. 65 static void PruneNamespace(std::string* name) { 66 size_t index = name->find_first_of(":"); 67 if (index != std::string::npos) 68 name->erase(0, index + 1); 69 } 70 71 // 72 // To minimize memory overhead while parsing, a SAX style parser is used. 73 // ParsingContext is used to maintain the state we're in the document 74 // while parsing. 75 class ParsingContext { 76 public: 77 // Enum of the known element types. 78 enum ElementType { 79 UNKNOWN, 80 OPEN_SEARCH_DESCRIPTION, 81 URL, 82 PARAM, 83 SHORT_NAME, 84 DESCRIPTION, 85 IMAGE, 86 LANGUAGE, 87 INPUT_ENCODING, 88 }; 89 90 enum Method { 91 GET, 92 POST 93 }; 94 95 // Key/value of a Param node. 
96 typedef std::pair<std::string, std::string> Param; 97 98 ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter, 99 TemplateURL* url) 100 : url_(url), 101 parameter_filter_(parameter_filter), 102 method_(GET), 103 suggestion_method_(GET), 104 is_suggest_url_(false), 105 derive_image_from_url_(false) { 106 if (kElementNameToElementTypeMap == NULL) 107 InitMapping(); 108 } 109 110 // Invoked when an element starts. 111 void PushElement(const std::string& element) { 112 ElementType type; 113 if (kElementNameToElementTypeMap->find(element) == 114 kElementNameToElementTypeMap->end()) { 115 type = UNKNOWN; 116 } else { 117 type = (*kElementNameToElementTypeMap)[element]; 118 } 119 elements_.push_back(type); 120 } 121 122 void PopElement() { 123 elements_.pop_back(); 124 } 125 126 // Returns the current ElementType. 127 ElementType GetKnownType() { 128 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION) 129 return elements_[1]; 130 131 // We only expect PARAM nodes under the Url node 132 if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION && 133 elements_[1] == URL && elements_[2] == PARAM) 134 return PARAM; 135 136 return UNKNOWN; 137 } 138 139 TemplateURL* template_url() { return url_; } 140 141 void AddImageRef(const std::string& type, int width, int height) { 142 if (width > 0 && height > 0) 143 current_image_.reset(new TemplateURL::ImageRef(type, width, height)); 144 } 145 146 void EndImage() { 147 current_image_.reset(); 148 } 149 150 void SetImageURL(const GURL& url) { 151 if (current_image_.get()) { 152 current_image_->url = url; 153 url_->add_image_ref(*current_image_); 154 current_image_.reset(); 155 } 156 } 157 158 void ResetString() { 159 string_.clear(); 160 } 161 162 void AppendString(const string16& string) { 163 string_ += string; 164 } 165 166 const string16& GetString() { 167 return string_; 168 } 169 170 void ResetExtraParams() { 171 extra_params_.clear(); 172 } 173 174 void AddExtraParams(const 
std::string& key, const std::string& value) { 175 if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value)) 176 return; 177 extra_params_.push_back(Param(key, value)); 178 } 179 180 const std::vector<Param>& extra_params() const { return extra_params_; } 181 182 void set_is_suggestion(bool value) { is_suggest_url_ = value; } 183 bool is_suggestion() const { return is_suggest_url_; } 184 185 TemplateURLParser::ParameterFilter* parameter_filter() const { 186 return parameter_filter_; 187 } 188 189 void set_derive_image_from_url(bool derive_image_from_url) { 190 derive_image_from_url_ = derive_image_from_url; 191 } 192 193 void set_method(Method method) { method_ = method; } 194 Method method() { return method_; } 195 196 void set_suggestion_method(Method method) { suggestion_method_ = method; } 197 Method suggestion_method() { return suggestion_method_; } 198 199 // Builds the image URL from the Template search URL if no image URL has been 200 // set. 201 void DeriveImageFromURL() { 202 if (derive_image_from_url_ && 203 url_->GetFaviconURL().is_empty() && url_->url()) { 204 GURL url(url_->url()->url()); // More url's please... 
205 url_->SetFaviconURL(TemplateURL::GenerateFaviconURL(url)); 206 } 207 } 208 209 private: 210 static void InitMapping() { 211 kElementNameToElementTypeMap = new std::map<std::string, ElementType>; 212 (*kElementNameToElementTypeMap)[kURLElement] = URL; 213 (*kElementNameToElementTypeMap)[kParamElement] = PARAM; 214 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME; 215 (*kElementNameToElementTypeMap)[kDescriptionElement] = DESCRIPTION; 216 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE; 217 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] = 218 OPEN_SEARCH_DESCRIPTION; 219 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] = 220 OPEN_SEARCH_DESCRIPTION; 221 (*kElementNameToElementTypeMap)[kLanguageElement] = 222 LANGUAGE; 223 (*kElementNameToElementTypeMap)[kInputEncodingElement] = 224 INPUT_ENCODING; 225 } 226 227 // Key is UTF8 encoded. 228 static std::map<std::string, ElementType>* kElementNameToElementTypeMap; 229 // TemplateURL supplied to Read method. It's owned by the caller, so we 230 // don't need to free it. 231 TemplateURL* url_; 232 std::vector<ElementType> elements_; 233 scoped_ptr<TemplateURL::ImageRef> current_image_; 234 235 // Character content for the current element. 236 string16 string_; 237 238 TemplateURLParser::ParameterFilter* parameter_filter_; 239 240 // The list of parameters parsed in the Param nodes of a Url node. 241 std::vector<Param> extra_params_; 242 243 // The HTTP methods used. 244 Method method_; 245 Method suggestion_method_; 246 247 // If true, we are currently parsing a suggest URL, otherwise it is an HTML 248 // search. Note that we don't need a stack as Url nodes cannot be nested. 249 bool is_suggest_url_; 250 251 // Whether we should derive the image from the URL (when images are data 252 // URLs). 
253 bool derive_image_from_url_; 254 255 DISALLOW_COPY_AND_ASSIGN(ParsingContext); 256 }; 257 258 // static 259 std::map<std::string, ParsingContext::ElementType>* 260 ParsingContext::kElementNameToElementTypeMap = NULL; 261 262 string16 XMLCharToUTF16(const xmlChar* value, int length) { 263 return UTF8ToUTF16(std::string((const char*)value, length)); 264 } 265 266 std::string XMLCharToString(const xmlChar* value) { 267 return std::string((const char*)value); 268 } 269 270 // Returns true if input_encoding contains a valid input encoding string. This 271 // doesn't verify that we have a valid encoding for the string, just that the 272 // string contains characters that constitute a valid input encoding. 273 bool IsValidEncodingString(const std::string& input_encoding) { 274 if (input_encoding.empty()) 275 return false; 276 277 if (!IsAsciiAlpha(input_encoding[0])) 278 return false; 279 280 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) { 281 char c = input_encoding[i]; 282 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' 
&& c != '_' && 283 c != '-') { 284 return false; 285 } 286 } 287 return true; 288 } 289 290 void ParseURL(const xmlChar** atts, ParsingContext* context) { 291 if (!atts) 292 return; 293 294 TemplateURL* turl = context->template_url(); 295 const xmlChar** attributes = atts; 296 std::string template_url; 297 bool is_post = false; 298 bool is_html_url = false; 299 bool is_suggest_url = false; 300 int index_offset = 1; 301 int page_offset = 1; 302 303 while (*attributes) { 304 std::string name(XMLCharToString(*attributes)); 305 const xmlChar* value = attributes[1]; 306 if (name == kURLTypeAttribute) { 307 std::string type = XMLCharToString(value); 308 is_html_url = (type == kHTMLType); 309 is_suggest_url = (type == kSuggestionType); 310 } else if (name == kURLTemplateAttribute) { 311 template_url = XMLCharToString(value); 312 } else if (name == kURLIndexOffsetAttribute) { 313 base::StringToInt(XMLCharToString(value), &index_offset); 314 index_offset = std::max(1, index_offset); 315 } else if (name == kURLPageOffsetAttribute) { 316 base::StringToInt(XMLCharToString(value), &page_offset); 317 page_offset = std::max(1, page_offset); 318 } else if (name == kParamMethodAttribute) { 319 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post"); 320 } 321 attributes += 2; 322 } 323 if (is_html_url) { 324 turl->SetURL(template_url, index_offset, page_offset); 325 context->set_is_suggestion(false); 326 if (is_post) 327 context->set_method(ParsingContext::POST); 328 } else if (is_suggest_url) { 329 turl->SetSuggestionsURL(template_url, index_offset, page_offset); 330 context->set_is_suggestion(true); 331 if (is_post) 332 context->set_suggestion_method(ParsingContext::POST); 333 } 334 } 335 336 void ParseImage(const xmlChar** atts, ParsingContext* context) { 337 if (!atts) 338 return; 339 340 const xmlChar** attributes = atts; 341 int width = 0; 342 int height = 0; 343 std::string type; 344 while (*attributes) { 345 std::string name(XMLCharToString(*attributes)); 346 const 
xmlChar* value = attributes[1]; 347 if (name == kImageTypeAttribute) { 348 type = XMLCharToString(value); 349 } else if (name == kImageWidthAttribute) { 350 base::StringToInt(XMLCharToString(value), &width); 351 } else if (name == kImageHeightAttribute) { 352 base::StringToInt(XMLCharToString(value), &height); 353 } 354 attributes += 2; 355 } 356 if (width > 0 && height > 0 && !type.empty()) { 357 // Valid Image URL. 358 context->AddImageRef(type, width, height); 359 } 360 } 361 362 void ParseParam(const xmlChar** atts, ParsingContext* context) { 363 if (!atts) 364 return; 365 366 const xmlChar** attributes = atts; 367 std::string key, value; 368 while (*attributes) { 369 std::string name(XMLCharToString(*attributes)); 370 const xmlChar* val = attributes[1]; 371 if (name == kParamNameAttribute) { 372 key = XMLCharToString(val); 373 } else if (name == kParamValueAttribute) { 374 value = XMLCharToString(val); 375 } 376 attributes += 2; 377 } 378 if (!key.empty()) 379 context->AddExtraParams(key, value); 380 } 381 382 static void AppendParamToQuery(const std::string& key, 383 const std::string& value, 384 std::string* query) { 385 if (!query->empty()) 386 query->append("&"); 387 if (!key.empty()) { 388 query->append(key); 389 query->append("="); 390 } 391 query->append(value); 392 } 393 394 void ProcessURLParams(ParsingContext* context) { 395 TemplateURL* t_url = context->template_url(); 396 const TemplateURLRef* t_url_ref = 397 context->is_suggestion() ? t_url->suggestions_url() : 398 t_url->url(); 399 if (!t_url_ref) 400 return; 401 402 if (!context->parameter_filter() && context->extra_params().empty()) 403 return; 404 405 GURL url(t_url_ref->url()); 406 // If there is a parameter filter, parse the existing URL and remove any 407 // unwanted parameter. 
408 TemplateURLParser::ParameterFilter* filter = context->parameter_filter(); 409 std::string new_query; 410 bool modified = false; 411 if (filter) { 412 url_parse::Component query = url.parsed_for_possibly_invalid_spec().query; 413 url_parse::Component key, value; 414 const char* url_spec = url.spec().c_str(); 415 while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) { 416 std::string key_str(url_spec, key.begin, key.len); 417 std::string value_str(url_spec, value.begin, value.len); 418 if (filter->KeepParameter(key_str, value_str)) { 419 AppendParamToQuery(key_str, value_str, &new_query); 420 } else { 421 modified = true; 422 } 423 } 424 } 425 if (!modified) 426 new_query = url.query(); 427 428 // Add the extra parameters if any. 429 const std::vector<ParsingContext::Param>& params = context->extra_params(); 430 if (!params.empty()) { 431 modified = true; 432 std::vector<ParsingContext::Param>::const_iterator iter; 433 for (iter = params.begin(); iter != params.end(); ++iter) 434 AppendParamToQuery(iter->first, iter->second, &new_query); 435 } 436 437 if (modified) { 438 GURL::Replacements repl; 439 repl.SetQueryStr(new_query); 440 url = url.ReplaceComponents(repl); 441 if (context->is_suggestion()) { 442 t_url->SetSuggestionsURL(url.spec(), 443 t_url_ref->index_offset(), 444 t_url_ref->page_offset()); 445 } else { 446 t_url->SetURL(url.spec(), 447 t_url_ref->index_offset(), 448 t_url_ref->page_offset()); 449 } 450 } 451 } 452 453 void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) { 454 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); 455 std::string node_name((const char*)name); 456 PruneNamespace(&node_name); 457 context->PushElement(node_name); 458 switch (context->GetKnownType()) { 459 case ParsingContext::URL: 460 context->ResetExtraParams(); 461 ParseURL(atts, context); 462 break; 463 case ParsingContext::IMAGE: 464 ParseImage(atts, context); 465 break; 466 case ParsingContext::PARAM: 467 
ParseParam(atts, context); 468 break; 469 default: 470 break; 471 } 472 context->ResetString(); 473 } 474 475 void EndElementImpl(void *ctx, const xmlChar *name) { 476 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); 477 switch (context->GetKnownType()) { 478 case ParsingContext::SHORT_NAME: 479 context->template_url()->set_short_name(context->GetString()); 480 break; 481 case ParsingContext::DESCRIPTION: 482 context->template_url()->set_description(context->GetString()); 483 break; 484 case ParsingContext::IMAGE: { 485 GURL image_url(UTF16ToUTF8(context->GetString())); 486 if (image_url.SchemeIs(chrome::kDataScheme)) { 487 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to 488 // decode the data URL in the renderer. For now, we'll just point to the 489 // favicon from the URL. 490 context->set_derive_image_from_url(true); 491 } else { 492 context->SetImageURL(image_url); 493 } 494 context->EndImage(); 495 break; 496 } 497 case ParsingContext::LANGUAGE: 498 context->template_url()->add_language(context->GetString()); 499 break; 500 case ParsingContext::INPUT_ENCODING: { 501 std::string input_encoding = UTF16ToASCII(context->GetString()); 502 if (IsValidEncodingString(input_encoding)) 503 context->template_url()->add_input_encoding(input_encoding); 504 break; 505 } 506 case ParsingContext::URL: 507 ProcessURLParams(context); 508 break; 509 default: 510 break; 511 } 512 context->ResetString(); 513 context->PopElement(); 514 } 515 516 void CharactersImpl(void *ctx, const xmlChar *ch, int len) { 517 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); 518 context->AppendString(XMLCharToUTF16(ch, len)); 519 } 520 521 // Returns true if the ref is null, or the url wrapped by ref is 522 // valid with a spec of http/https. 
523 bool IsHTTPRef(const TemplateURLRef* ref) { 524 if (ref == NULL) 525 return true; 526 GURL url(ref->url()); 527 return (url.is_valid() && (url.SchemeIs(chrome::kHttpScheme) || 528 url.SchemeIs(chrome::kHttpsScheme))); 529 } 530 531 // Returns true if the TemplateURL is legal. A legal TemplateURL is one 532 // where all URLs have a spec of http/https. 533 bool IsLegal(TemplateURL* url) { 534 if (!IsHTTPRef(url->url()) || !IsHTTPRef(url->suggestions_url())) 535 return false; 536 // Make sure all the image refs are legal. 537 const std::vector<TemplateURL::ImageRef>& image_refs = url->image_refs(); 538 for (size_t i = 0; i < image_refs.size(); i++) { 539 GURL image_url(image_refs[i].url); 540 if (!image_url.is_valid() || 541 !(image_url.SchemeIs(chrome::kHttpScheme) || 542 image_url.SchemeIs(chrome::kHttpsScheme))) { 543 return false; 544 } 545 } 546 return true; 547 } 548 549 } // namespace 550 551 // static 552 bool TemplateURLParser::Parse(const unsigned char* data, size_t length, 553 TemplateURLParser::ParameterFilter* param_filter, 554 TemplateURL* url) { 555 DCHECK(url); 556 // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to 557 // & . Unfortunately xmlSubstituteEntitiesDefault effects global state. 558 // If this becomes problematic we'll need to provide our own entity 559 // type for &, or strip out " by hand after parsing. 560 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1); 561 ParsingContext context(param_filter, url); 562 xmlSAXHandler sax_handler; 563 memset(&sax_handler, 0, sizeof(sax_handler)); 564 sax_handler.startElement = &StartElementImpl; 565 sax_handler.endElement = &EndElementImpl; 566 sax_handler.characters = &CharactersImpl; 567 xmlSAXUserParseMemory(&sax_handler, &context, 568 reinterpret_cast<const char*>(data), 569 static_cast<int>(length)); 570 xmlSubstituteEntitiesDefault(last_sub_entities_value); 571 // If the image was a data URL, use the favicon from the search URL instead. 
572 // (see TODO inEndElementImpl()). 573 context.DeriveImageFromURL(); 574 575 // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines 576 // that use POST yet. 577 if (context.method() == ParsingContext::POST) 578 return false; 579 if (context.suggestion_method() == ParsingContext::POST) 580 url->SetSuggestionsURL("", 0, 0); 581 582 if (!url->short_name().empty() && !url->description().empty()) { 583 // So far so good, make sure the urls are http. 584 return IsLegal(url); 585 } 586 return false; 587 } 588