1 // Copyright 2014 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "components/search_engines/template_url_parser.h" 6 7 #include <algorithm> 8 #include <map> 9 #include <vector> 10 11 #include "base/logging.h" 12 #include "base/memory/scoped_ptr.h" 13 #include "base/strings/string_number_conversions.h" 14 #include "base/strings/string_util.h" 15 #include "base/strings/utf_string_conversions.h" 16 #include "components/search_engines/template_url.h" 17 #include "libxml/parser.h" 18 #include "libxml/xmlwriter.h" 19 #include "ui/gfx/favicon_size.h" 20 #include "url/gurl.h" 21 #include "url/url_constants.h" 22 23 namespace { 24 25 // NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds 26 // to that of char, the following names are all in terms of char. This avoids 27 // having to convert to wide, then do comparisons. 28 29 // Defines for element names of the OSD document: 30 const char kURLElement[] = "Url"; 31 const char kParamElement[] = "Param"; 32 const char kShortNameElement[] = "ShortName"; 33 const char kImageElement[] = "Image"; 34 const char kOpenSearchDescriptionElement[] = "OpenSearchDescription"; 35 const char kFirefoxSearchDescriptionElement[] = "SearchPlugin"; 36 const char kInputEncodingElement[] = "InputEncoding"; 37 const char kAliasElement[] = "Alias"; 38 39 // Various XML attributes used. 40 const char kURLTypeAttribute[] = "type"; 41 const char kURLTemplateAttribute[] = "template"; 42 const char kImageTypeAttribute[] = "type"; 43 const char kImageWidthAttribute[] = "width"; 44 const char kImageHeightAttribute[] = "height"; 45 const char kParamNameAttribute[] = "name"; 46 const char kParamValueAttribute[] = "value"; 47 const char kParamMethodAttribute[] = "method"; 48 49 // Mime type for search results. 50 const char kHTMLType[] = "text/html"; 51 52 // Mime type for as you type suggestions. 53 const char kSuggestionType[] = "application/x-suggestions+json"; 54 55 std::string XMLCharToString(const xmlChar* value) { 56 return std::string(reinterpret_cast<const char*>(value)); 57 } 58 59 // Returns true if input_encoding contains a valid input encoding string. This 60 // doesn't verify that we have a valid encoding for the string, just that the 61 // string contains characters that constitute a valid input encoding. 62 bool IsValidEncodingString(const std::string& input_encoding) { 63 if (input_encoding.empty()) 64 return false; 65 66 if (!IsAsciiAlpha(input_encoding[0])) 67 return false; 68 69 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) { 70 char c = input_encoding[i]; 71 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' && 72 c != '-') { 73 return false; 74 } 75 } 76 return true; 77 } 78 79 void AppendParamToQuery(const std::string& key, 80 const std::string& value, 81 std::string* query) { 82 if (!query->empty()) 83 query->append("&"); 84 if (!key.empty()) { 85 query->append(key); 86 query->append("="); 87 } 88 query->append(value); 89 } 90 91 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S]. 92 bool IsHTTPRef(const std::string& url) { 93 if (url.empty()) 94 return true; 95 GURL gurl(url); 96 return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) || 97 gurl.SchemeIs(url::kHttpsScheme)); 98 } 99 100 } // namespace 101 102 103 // TemplateURLParsingContext -------------------------------------------------- 104 105 // To minimize memory overhead while parsing, a SAX style parser is used. 106 // TemplateURLParsingContext is used to maintain the state we're in the document 107 // while parsing. 108 class TemplateURLParsingContext { 109 public: 110 // Enum of the known element types. 111 enum ElementType { 112 UNKNOWN, 113 OPEN_SEARCH_DESCRIPTION, 114 URL, 115 PARAM, 116 SHORT_NAME, 117 IMAGE, 118 INPUT_ENCODING, 119 ALIAS, 120 }; 121 122 enum Method { 123 GET, 124 POST 125 }; 126 127 // Key/value of a Param node. 128 typedef std::pair<std::string, std::string> Param; 129 130 explicit TemplateURLParsingContext( 131 TemplateURLParser::ParameterFilter* parameter_filter); 132 133 static void StartElementImpl(void* ctx, 134 const xmlChar* name, 135 const xmlChar** atts); 136 static void EndElementImpl(void* ctx, const xmlChar* name); 137 static void CharactersImpl(void* ctx, const xmlChar* ch, int len); 138 139 // Returns a heap-allocated TemplateURL representing the result of parsing. 140 // This will be NULL if parsing failed or if the results were invalid for some 141 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied, 142 // a resulting TemplateURLRef was invalid, etc.). 143 TemplateURL* GetTemplateURL(const SearchTermsData& search_terms_data, 144 bool show_in_default_list); 145 146 private: 147 // Key is UTF8 encoded. 148 typedef std::map<std::string, ElementType> ElementNameToElementTypeMap; 149 150 static void InitMapping(); 151 152 void ParseURL(const xmlChar** atts); 153 void ParseImage(const xmlChar** atts); 154 void ParseParam(const xmlChar** atts); 155 void ProcessURLParams(); 156 157 // Returns the current ElementType. 158 ElementType GetKnownType(); 159 160 static ElementNameToElementTypeMap* kElementNameToElementTypeMap; 161 162 // Data that gets updated as we parse, and is converted to a TemplateURL by 163 // GetTemplateURL(). 164 TemplateURLData data_; 165 166 std::vector<ElementType> elements_; 167 bool image_is_valid_for_favicon_; 168 169 // Character content for the current element. 170 base::string16 string_; 171 172 TemplateURLParser::ParameterFilter* parameter_filter_; 173 174 // The list of parameters parsed in the Param nodes of a Url node. 175 std::vector<Param> extra_params_; 176 177 // The HTTP methods used. 178 Method method_; 179 Method suggestion_method_; 180 181 // If true, we are currently parsing a suggest URL, otherwise it is an HTML 182 // search. Note that we don't need a stack as URL nodes cannot be nested. 183 bool is_suggest_url_; 184 185 // If true, the user has set a keyword and we should use it. Otherwise, 186 // we generate a keyword based on the URL. 187 bool has_custom_keyword_; 188 189 // Whether we should derive the image from the URL (when images are data 190 // URLs). 191 bool derive_image_from_url_; 192 193 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext); 194 }; 195 196 // static 197 TemplateURLParsingContext::ElementNameToElementTypeMap* 198 TemplateURLParsingContext::kElementNameToElementTypeMap = NULL; 199 200 TemplateURLParsingContext::TemplateURLParsingContext( 201 TemplateURLParser::ParameterFilter* parameter_filter) 202 : image_is_valid_for_favicon_(false), 203 parameter_filter_(parameter_filter), 204 method_(GET), 205 suggestion_method_(GET), 206 is_suggest_url_(false), 207 has_custom_keyword_(false), 208 derive_image_from_url_(false) { 209 if (kElementNameToElementTypeMap == NULL) 210 InitMapping(); 211 } 212 213 // static 214 void TemplateURLParsingContext::StartElementImpl(void* ctx, 215 const xmlChar* name, 216 const xmlChar** atts) { 217 // Remove the namespace from |name|, ex: os:Url -> Url. 218 std::string node_name(XMLCharToString(name)); 219 size_t index = node_name.find_first_of(":"); 220 if (index != std::string::npos) 221 node_name.erase(0, index + 1); 222 223 TemplateURLParsingContext* context = 224 reinterpret_cast<TemplateURLParsingContext*>(ctx); 225 context->elements_.push_back( 226 context->kElementNameToElementTypeMap->count(node_name) ? 227 (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN); 228 switch (context->GetKnownType()) { 229 case TemplateURLParsingContext::URL: 230 context->extra_params_.clear(); 231 context->ParseURL(atts); 232 break; 233 case TemplateURLParsingContext::IMAGE: 234 context->ParseImage(atts); 235 break; 236 case TemplateURLParsingContext::PARAM: 237 context->ParseParam(atts); 238 break; 239 default: 240 break; 241 } 242 context->string_.clear(); 243 } 244 245 // static 246 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) { 247 TemplateURLParsingContext* context = 248 reinterpret_cast<TemplateURLParsingContext*>(ctx); 249 switch (context->GetKnownType()) { 250 case TemplateURLParsingContext::URL: 251 context->ProcessURLParams(); 252 break; 253 case TemplateURLParsingContext::SHORT_NAME: 254 context->data_.short_name = context->string_; 255 break; 256 case TemplateURLParsingContext::IMAGE: { 257 GURL image_url(base::UTF16ToUTF8(context->string_)); 258 if (image_url.SchemeIs(url::kDataScheme)) { 259 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to 260 // decode the data URL in the renderer. For now, we'll just point to the 261 // favicon from the URL. 262 context->derive_image_from_url_ = true; 263 } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() && 264 (image_url.SchemeIs(url::kHttpScheme) || 265 image_url.SchemeIs(url::kHttpsScheme))) { 266 context->data_.favicon_url = image_url; 267 } 268 context->image_is_valid_for_favicon_ = false; 269 break; 270 } 271 case TemplateURLParsingContext::INPUT_ENCODING: { 272 std::string input_encoding = base::UTF16ToASCII(context->string_); 273 if (IsValidEncodingString(input_encoding)) 274 context->data_.input_encodings.push_back(input_encoding); 275 break; 276 } 277 case TemplateURLParsingContext::ALIAS: { 278 context->data_.SetKeyword(context->string_); 279 context->has_custom_keyword_ = true; 280 break; 281 } 282 default: 283 break; 284 } 285 context->string_.clear(); 286 context->elements_.pop_back(); 287 } 288 289 // static 290 void TemplateURLParsingContext::CharactersImpl(void* ctx, 291 const xmlChar* ch, 292 int len) { 293 reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ += 294 base::UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len)); 295 } 296 297 TemplateURL* TemplateURLParsingContext::GetTemplateURL( 298 const SearchTermsData& search_terms_data, 299 bool show_in_default_list) { 300 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107 301 if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() || 302 !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url)) 303 return NULL; 304 if (suggestion_method_ == TemplateURLParsingContext::POST) 305 data_.suggestions_url.clear(); 306 307 // If the image was a data URL, use the favicon from the search URL instead. 308 // (see the TODO in EndElementImpl()). 309 GURL search_url(data_.url()); 310 if (derive_image_from_url_ && data_.favicon_url.is_empty()) 311 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url); 312 313 // Generate a keyword for this search engine if a custom one was not present 314 // in the imported data. 315 if (!has_custom_keyword_) 316 data_.SetKeyword(TemplateURL::GenerateKeyword(search_url)); 317 318 data_.show_in_default_list = show_in_default_list; 319 320 // Bail if the search URL is empty or if either TemplateURLRef is invalid. 321 scoped_ptr<TemplateURL> template_url(new TemplateURL(data_)); 322 if (template_url->url().empty() || 323 !template_url->url_ref().IsValid(search_terms_data) || 324 (!template_url->suggestions_url().empty() && 325 !template_url->suggestions_url_ref().IsValid(search_terms_data))) { 326 return NULL; 327 } 328 329 return template_url.release(); 330 } 331 332 // static 333 void TemplateURLParsingContext::InitMapping() { 334 kElementNameToElementTypeMap = new std::map<std::string, ElementType>; 335 (*kElementNameToElementTypeMap)[kURLElement] = URL; 336 (*kElementNameToElementTypeMap)[kParamElement] = PARAM; 337 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME; 338 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE; 339 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] = 340 OPEN_SEARCH_DESCRIPTION; 341 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] = 342 OPEN_SEARCH_DESCRIPTION; 343 (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING; 344 (*kElementNameToElementTypeMap)[kAliasElement] = ALIAS; 345 } 346 347 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) { 348 if (!atts) 349 return; 350 351 std::string template_url; 352 bool is_post = false; 353 bool is_html_url = false; 354 bool is_suggest_url = false; 355 for (; *atts; atts += 2) { 356 std::string name(XMLCharToString(*atts)); 357 const xmlChar* value = atts[1]; 358 if (name == kURLTypeAttribute) { 359 std::string type = XMLCharToString(value); 360 is_html_url = (type == kHTMLType); 361 is_suggest_url = (type == kSuggestionType); 362 } else if (name == kURLTemplateAttribute) { 363 template_url = XMLCharToString(value); 364 } else if (name == kParamMethodAttribute) { 365 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post"); 366 } 367 } 368 369 if (is_html_url && !template_url.empty()) { 370 data_.SetURL(template_url); 371 is_suggest_url_ = false; 372 if (is_post) 373 method_ = POST; 374 } else if (is_suggest_url) { 375 data_.suggestions_url = template_url; 376 is_suggest_url_ = true; 377 if (is_post) 378 suggestion_method_ = POST; 379 } 380 } 381 382 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) { 383 if (!atts) 384 return; 385 386 int width = 0; 387 int height = 0; 388 std::string type; 389 for (; *atts; atts += 2) { 390 std::string name(XMLCharToString(*atts)); 391 const xmlChar* value = atts[1]; 392 if (name == kImageTypeAttribute) { 393 type = XMLCharToString(value); 394 } else if (name == kImageWidthAttribute) { 395 base::StringToInt(XMLCharToString(value), &width); 396 } else if (name == kImageHeightAttribute) { 397 base::StringToInt(XMLCharToString(value), &height); 398 } 399 } 400 401 image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) && 402 (height == gfx::kFaviconSize) && 403 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon")); 404 } 405 406 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) { 407 if (!atts) 408 return; 409 410 std::string key, value; 411 for (; *atts; atts += 2) { 412 std::string name(XMLCharToString(*atts)); 413 const xmlChar* val = atts[1]; 414 if (name == kParamNameAttribute) { 415 key = XMLCharToString(val); 416 } else if (name == kParamValueAttribute) { 417 value = XMLCharToString(val); 418 } 419 } 420 421 if (!key.empty() && 422 (!parameter_filter_ || parameter_filter_->KeepParameter(key, value))) 423 extra_params_.push_back(Param(key, value)); 424 } 425 426 void TemplateURLParsingContext::ProcessURLParams() { 427 if (!parameter_filter_ && extra_params_.empty()) 428 return; 429 430 GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url()); 431 if (url.is_empty()) 432 return; 433 434 // If there is a parameter filter, parse the existing URL and remove any 435 // unwanted parameter. 436 std::string new_query; 437 bool modified = false; 438 if (parameter_filter_) { 439 url::Component query = url.parsed_for_possibly_invalid_spec().query; 440 url::Component key, value; 441 const char* url_spec = url.spec().c_str(); 442 while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) { 443 std::string key_str(url_spec, key.begin, key.len); 444 std::string value_str(url_spec, value.begin, value.len); 445 if (parameter_filter_->KeepParameter(key_str, value_str)) { 446 AppendParamToQuery(key_str, value_str, &new_query); 447 } else { 448 modified = true; 449 } 450 } 451 } 452 if (!modified) 453 new_query = url.query(); 454 455 // Add the extra parameters if any. 456 if (!extra_params_.empty()) { 457 modified = true; 458 for (std::vector<Param>::const_iterator iter(extra_params_.begin()); 459 iter != extra_params_.end(); ++iter) 460 AppendParamToQuery(iter->first, iter->second, &new_query); 461 } 462 463 if (modified) { 464 GURL::Replacements repl; 465 repl.SetQueryStr(new_query); 466 url = url.ReplaceComponents(repl); 467 if (is_suggest_url_) 468 data_.suggestions_url = url.spec(); 469 else if (url.is_valid()) 470 data_.SetURL(url.spec()); 471 } 472 } 473 474 TemplateURLParsingContext::ElementType 475 TemplateURLParsingContext::GetKnownType() { 476 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION) 477 return elements_[1]; 478 // We only expect PARAM nodes under the URL node. 479 return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION && 480 elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN; 481 } 482 483 484 // TemplateURLParser ---------------------------------------------------------- 485 486 // static 487 TemplateURL* TemplateURLParser::Parse( 488 const SearchTermsData& search_terms_data, 489 bool show_in_default_list, 490 const char* data, 491 size_t length, 492 TemplateURLParser::ParameterFilter* param_filter) { 493 // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to 494 // & . Unfortunately xmlSubstituteEntitiesDefault affects global state. 495 // If this becomes problematic we'll need to provide our own entity 496 // type for &, or strip out & by hand after parsing. 497 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1); 498 TemplateURLParsingContext context(param_filter); 499 xmlSAXHandler sax_handler; 500 memset(&sax_handler, 0, sizeof(sax_handler)); 501 sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl; 502 sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl; 503 sax_handler.characters = &TemplateURLParsingContext::CharactersImpl; 504 int error = xmlSAXUserParseMemory(&sax_handler, &context, data, 505 static_cast<int>(length)); 506 xmlSubstituteEntitiesDefault(last_sub_entities_value); 507 508 return error ? 509 NULL : context.GetTemplateURL(search_terms_data, show_in_default_list); 510 } 511