// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/child/site_isolation_policy.h"

#include "base/basictypes.h"
#include "base/command_line.h"
#include "base/lazy_instance.h"
#include "base/logging.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "content/child/child_thread.h"
#include "content/public/common/content_switches.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/http/http_response_headers.h"
#include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
#include "third_party/WebKit/public/platform/WebString.h"
#include "third_party/WebKit/public/platform/WebURL.h"
#include "third_party/WebKit/public/platform/WebURLRequest.h"
#include "third_party/WebKit/public/platform/WebURLResponse.h"
#include "third_party/WebKit/public/web/WebDocument.h"
#include "third_party/WebKit/public/web/WebFrame.h"
#include "third_party/WebKit/public/web/WebFrameClient.h"
#include "third_party/WebKit/public/web/WebSecurityOrigin.h"

using base::StringPiece;
using blink::WebDocument;
using blink::WebString;
using blink::WebURL;
using blink::WebURLResponse;
using blink::WebURLRequest;

namespace content {

namespace {

// Maintain the bookkeeping data between OnReceivedResponse and
// OnReceivedData. The key is a request id maintained by ResourceDispatcher.
// An entry is written by SiteIsolationPolicy::OnReceivedResponse() for every
// response that is a candidate for blocking; it is consumed (copied, then
// erased) by SiteIsolationPolicy::ShouldBlockResponse() when the first data
// chunk of that request is examined, and any leftover entry is dropped by
// SiteIsolationPolicy::OnRequestComplete().
static base::LazyInstance<SiteIsolationPolicy::RequestIdToMetaDataMap>
    g_metadata_map = LAZY_INSTANCE_INITIALIZER;

// Maintain the bookkeeping data for OnReceivedData. Blocking decision is made
// when OnReceivedData is called for the first time for a request, and the
// decision will remain the same for following data.
This map maintains the 47 // decision. The key is a request id maintained by ResourceDispatcher. 48 static base::LazyInstance<SiteIsolationPolicy::RequestIdToResultMap> 49 g_result_map = LAZY_INSTANCE_INITIALIZER; 50 51 // The cross-site document blocking/UMA data collection is deactivated by 52 // default, and only activated in renderer processes. 53 static bool g_policy_enabled = false; 54 55 // MIME types 56 const char kTextHtml[] = "text/html"; 57 const char kTextXml[] = "text/xml"; 58 const char xAppRssXml[] = "application/rss+xml"; 59 const char kAppXml[] = "application/xml"; 60 const char kAppJson[] = "application/json"; 61 const char kTextJson[] = "text/json"; 62 const char kTextXjson[] = "text/x-json"; 63 const char kTextPlain[] = "text/plain"; 64 65 // TODO(dsjang): this is only needed for collecting UMA stat. Will be deleted 66 // when this class is used for actual blocking. 67 bool IsRenderableStatusCode(int status_code) { 68 // Chrome only uses the content of a response with one of these status codes 69 // for CSS/JavaScript. For images, Chrome just ignores status code. 70 const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 71 301, 302, 303, 305, 306, 307}; 72 for (size_t i = 0; i < arraysize(renderable_status_code); ++i) { 73 if (renderable_status_code[i] == status_code) 74 return true; 75 } 76 return false; 77 } 78 79 bool MatchesSignature(StringPiece data, 80 const StringPiece signatures[], 81 size_t arr_size) { 82 83 size_t offset = data.find_first_not_of(" \t\r\n"); 84 // There is no not-whitespace character in this document. 
85 if (offset == base::StringPiece::npos) 86 return false; 87 88 data.remove_prefix(offset); 89 size_t length = data.length(); 90 91 for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) { 92 const StringPiece& signature = signatures[sig_index]; 93 size_t signature_length = signature.length(); 94 if (length < signature_length) 95 continue; 96 97 if (LowerCaseEqualsASCII( 98 data.begin(), data.begin() + signature_length, signature.data())) 99 return true; 100 } 101 return false; 102 } 103 104 void IncrementHistogramCount(const std::string& name) { 105 // The default value of min, max, bucket_count are copied from histogram.h. 106 base::HistogramBase* histogram_pointer = base::Histogram::FactoryGet( 107 name, 1, 100000, 50, base::HistogramBase::kUmaTargetedHistogramFlag); 108 histogram_pointer->Add(1); 109 } 110 111 void IncrementHistogramEnum(const std::string& name, 112 uint32 sample, 113 uint32 boundary_value) { 114 // The default value of min, max, bucket_count are copied from histogram.h. 115 base::HistogramBase* histogram_pointer = base::LinearHistogram::FactoryGet( 116 name, 117 1, 118 boundary_value, 119 boundary_value + 1, 120 base::HistogramBase::kUmaTargetedHistogramFlag); 121 histogram_pointer->Add(sample); 122 } 123 124 void HistogramCountBlockedResponse( 125 const std::string& bucket_prefix, 126 const SiteIsolationPolicy::ResponseMetaData& resp_data, 127 bool nosniff_block) { 128 std::string block_label(nosniff_block ? ".NoSniffBlocked" : ".Blocked"); 129 IncrementHistogramCount(bucket_prefix + block_label); 130 131 // The content is blocked if it is sniffed as HTML/JSON/XML. When 132 // the blocked response is with an error status code, it is not 133 // disruptive for the following reasons : 1) the blocked content is 134 // not a binary object (such as an image) since it is sniffed as 135 // text; 2) then, this blocking only breaks the renderer behavior 136 // only if it is either JavaScript or CSS. 
However, the renderer 137 // doesn't use the contents of JS/CSS with unaffected status code 138 // (e.g, 404). 3) the renderer is expected not to use the cross-site 139 // document content for purposes other than JS/CSS (e.g, XHR). 140 bool renderable_status_code = 141 IsRenderableStatusCode(resp_data.http_status_code); 142 143 if (renderable_status_code) { 144 IncrementHistogramEnum( 145 bucket_prefix + block_label + ".RenderableStatusCode", 146 resp_data.resource_type, 147 ResourceType::LAST_TYPE); 148 } else { 149 IncrementHistogramCount(bucket_prefix + block_label + 150 ".NonRenderableStatusCode"); 151 } 152 } 153 154 void HistogramCountNotBlockedResponse(const std::string& bucket_prefix, 155 bool sniffed_as_js) { 156 IncrementHistogramCount(bucket_prefix + ".NotBlocked"); 157 if (sniffed_as_js) 158 IncrementHistogramCount(bucket_prefix + ".NotBlocked.MaybeJS"); 159 } 160 161 } // namespace 162 163 SiteIsolationPolicy::ResponseMetaData::ResponseMetaData() {} 164 165 void SiteIsolationPolicy::SetPolicyEnabled(bool enabled) { 166 g_policy_enabled = enabled; 167 } 168 169 void SiteIsolationPolicy::OnReceivedResponse( 170 int request_id, 171 const GURL& frame_origin, 172 const GURL& response_url, 173 ResourceType::Type resource_type, 174 int origin_pid, 175 const webkit_glue::ResourceResponseInfo& info) { 176 if (!g_policy_enabled) 177 return; 178 179 // if |origin_pid| is non-zero, it means that this response is for a plugin 180 // spawned from this renderer process. We exclude responses for plugins for 181 // now, but eventually, we're going to make plugin processes directly talk to 182 // the browser process so that we don't apply cross-site document blocking to 183 // them. 184 if (origin_pid) 185 return; 186 187 UMA_HISTOGRAM_COUNTS("SiteIsolation.AllResponses", 1); 188 189 // See if this is for navigation. If it is, don't block it, under the 190 // assumption that we will put it in an appropriate process. 
191 if (ResourceType::IsFrame(resource_type)) 192 return; 193 194 if (!IsBlockableScheme(response_url)) 195 return; 196 197 if (IsSameSite(frame_origin, response_url)) 198 return; 199 200 SiteIsolationPolicy::ResponseMetaData::CanonicalMimeType canonical_mime_type = 201 GetCanonicalMimeType(info.mime_type); 202 203 if (canonical_mime_type == SiteIsolationPolicy::ResponseMetaData::Others) 204 return; 205 206 // Every CORS request should have the Access-Control-Allow-Origin header even 207 // if it is preceded by a pre-flight request. Therefore, if this is a CORS 208 // request, it has this header. response.httpHeaderField() internally uses 209 // case-insensitive matching for the header name. 210 std::string access_control_origin; 211 212 // We can use a case-insensitive header name for EnumerateHeader(). 213 info.headers->EnumerateHeader( 214 NULL, "access-control-allow-origin", &access_control_origin); 215 if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) 216 return; 217 218 // Real XSD data collection starts from here. 
219 std::string no_sniff; 220 info.headers->EnumerateHeader(NULL, "x-content-type-options", &no_sniff); 221 222 ResponseMetaData resp_data; 223 resp_data.frame_origin = frame_origin.spec(); 224 resp_data.response_url = response_url; 225 resp_data.resource_type = resource_type; 226 resp_data.canonical_mime_type = canonical_mime_type; 227 resp_data.http_status_code = info.headers->response_code(); 228 resp_data.no_sniff = LowerCaseEqualsASCII(no_sniff, "nosniff"); 229 230 (g_metadata_map.Get())[request_id] = resp_data; 231 } 232 233 bool SiteIsolationPolicy::ShouldBlockResponse( 234 int request_id, 235 const char* raw_data, 236 int raw_length, 237 std::string* alternative_data) { 238 if (!g_policy_enabled) 239 return false; 240 241 RequestIdToMetaDataMap& metadata_map = g_metadata_map.Get(); 242 RequestIdToResultMap& result_map = g_result_map.Get(); 243 244 // If there's an entry for |request_id| in blocked_map, this request's first 245 // data packet has already been examined. We can return the result here. 246 if (result_map.count(request_id) != 0) { 247 if (result_map[request_id]) { 248 // Here, the blocking result has been set for the previous run of 249 // ShouldBlockResponse(), so we set alternative data to an empty string so 250 // that ResourceDispatcher doesn't call its peer's onReceivedData() with 251 // the alternative data. 252 alternative_data->erase(); 253 return true; 254 } 255 return false; 256 } 257 258 // If result_map doesn't have an entry for |request_id|, we're receiving the 259 // first data packet for request_id. If request_id is not registered, this 260 // request is identified as a non-target of our policy. So we return true. 261 if (metadata_map.count(request_id) == 0) { 262 // We set request_id to true so that we always return true for this request. 263 result_map[request_id] = false; 264 return false; 265 } 266 267 StringPiece data(raw_data, raw_length); 268 269 // We now look at the first data packet received for request_id. 
270 ResponseMetaData resp_data = metadata_map[request_id]; 271 metadata_map.erase(request_id); 272 273 // Record the length of the first received network packet to see if it's 274 // enough for sniffing. 275 UMA_HISTOGRAM_COUNTS("SiteIsolation.XSD.DataLength", raw_length); 276 277 // Record the number of cross-site document responses with a specific mime 278 // type (text/html, text/xml, etc). 279 UMA_HISTOGRAM_ENUMERATION( 280 "SiteIsolation.XSD.MimeType", 281 resp_data.canonical_mime_type, 282 SiteIsolationPolicy::ResponseMetaData::MaxCanonicalMimeType); 283 284 // Store the result of cross-site document blocking analysis. 285 bool is_blocked = false; 286 bool sniffed_as_js = SniffForJS(data); 287 288 // Record the number of responses whose content is sniffed for what its mime 289 // type claims it to be. For example, we apply a HTML sniffer for a document 290 // tagged with text/html here. Whenever this check becomes true, we'll block 291 // the response. 292 if (resp_data.canonical_mime_type != 293 SiteIsolationPolicy::ResponseMetaData::Plain) { 294 std::string bucket_prefix; 295 bool sniffed_as_target_document = false; 296 if (resp_data.canonical_mime_type == 297 SiteIsolationPolicy::ResponseMetaData::HTML) { 298 bucket_prefix = "SiteIsolation.XSD.HTML"; 299 sniffed_as_target_document = SniffForHTML(data); 300 } else if (resp_data.canonical_mime_type == 301 SiteIsolationPolicy::ResponseMetaData::XML) { 302 bucket_prefix = "SiteIsolation.XSD.XML"; 303 sniffed_as_target_document = SniffForXML(data); 304 } else if (resp_data.canonical_mime_type == 305 SiteIsolationPolicy::ResponseMetaData::JSON) { 306 bucket_prefix = "SiteIsolation.XSD.JSON"; 307 sniffed_as_target_document = SniffForJSON(data); 308 } else { 309 NOTREACHED() << "Not a blockable mime type: " 310 << resp_data.canonical_mime_type; 311 } 312 313 if (sniffed_as_target_document) { 314 is_blocked = true; 315 HistogramCountBlockedResponse(bucket_prefix, resp_data, false); 316 } else { 317 if 
(resp_data.no_sniff) { 318 is_blocked = true; 319 HistogramCountBlockedResponse(bucket_prefix, resp_data, true); 320 } else { 321 HistogramCountNotBlockedResponse(bucket_prefix, sniffed_as_js); 322 } 323 } 324 } else { 325 // This block is for plain text documents. We apply our HTML, XML, 326 // and JSON sniffer to a text document in the order, and block it 327 // if any of them succeeds in sniffing. 328 std::string bucket_prefix; 329 if (SniffForHTML(data)) 330 bucket_prefix = "SiteIsolation.XSD.Plain.HTML"; 331 else if (SniffForXML(data)) 332 bucket_prefix = "SiteIsolation.XSD.Plain.XML"; 333 else if (SniffForJSON(data)) 334 bucket_prefix = "SiteIsolation.XSD.Plain.JSON"; 335 336 if (bucket_prefix.size() > 0) { 337 is_blocked = true; 338 HistogramCountBlockedResponse(bucket_prefix, resp_data, false); 339 } else if (resp_data.no_sniff) { 340 is_blocked = true; 341 HistogramCountBlockedResponse("SiteIsolation.XSD.Plain", resp_data, true); 342 } else { 343 HistogramCountNotBlockedResponse("SiteIsolation.XSD.Plain", 344 sniffed_as_js); 345 } 346 } 347 348 if (!CommandLine::ForCurrentProcess()->HasSwitch( 349 switches::kBlockCrossSiteDocuments)) 350 is_blocked = false; 351 result_map[request_id] = is_blocked; 352 353 if (is_blocked) { 354 alternative_data->erase(); 355 alternative_data->insert(0, " "); 356 LOG(ERROR) << resp_data.response_url 357 << " is blocked as an illegal cross-site document from " 358 << resp_data.frame_origin; 359 } 360 return is_blocked; 361 } 362 363 void SiteIsolationPolicy::OnRequestComplete(int request_id) { 364 if (!g_policy_enabled) 365 return; 366 g_metadata_map.Get().erase(request_id); 367 g_result_map.Get().erase(request_id); 368 } 369 370 SiteIsolationPolicy::ResponseMetaData::CanonicalMimeType 371 SiteIsolationPolicy::GetCanonicalMimeType(const std::string& mime_type) { 372 if (LowerCaseEqualsASCII(mime_type, kTextHtml)) { 373 return SiteIsolationPolicy::ResponseMetaData::HTML; 374 } 375 376 if (LowerCaseEqualsASCII(mime_type, 
kTextPlain)) { 377 return SiteIsolationPolicy::ResponseMetaData::Plain; 378 } 379 380 if (LowerCaseEqualsASCII(mime_type, kAppJson) || 381 LowerCaseEqualsASCII(mime_type, kTextJson) || 382 LowerCaseEqualsASCII(mime_type, kTextXjson)) { 383 return SiteIsolationPolicy::ResponseMetaData::JSON; 384 } 385 386 if (LowerCaseEqualsASCII(mime_type, kTextXml) || 387 LowerCaseEqualsASCII(mime_type, xAppRssXml) || 388 LowerCaseEqualsASCII(mime_type, kAppXml)) { 389 return SiteIsolationPolicy::ResponseMetaData::XML; 390 } 391 392 return SiteIsolationPolicy::ResponseMetaData::Others; 393 } 394 395 bool SiteIsolationPolicy::IsBlockableScheme(const GURL& url) { 396 // We exclude ftp:// from here. FTP doesn't provide a Content-Type 397 // header which our policy depends on, so we cannot protect any 398 // document from FTP servers. 399 return url.SchemeIs("http") || url.SchemeIs("https"); 400 } 401 402 bool SiteIsolationPolicy::IsSameSite(const GURL& frame_origin, 403 const GURL& response_url) { 404 405 if (!frame_origin.is_valid() || !response_url.is_valid()) 406 return false; 407 408 if (frame_origin.scheme() != response_url.scheme()) 409 return false; 410 411 // SameDomainOrHost() extracts the effective domains (public suffix plus one) 412 // from the two URLs and compare them. 413 // TODO(dsjang): use INCLUDE_PRIVATE_REGISTRIES when http://crbug.com/7988 is 414 // fixed. 415 return net::registry_controlled_domains::SameDomainOrHost( 416 frame_origin, 417 response_url, 418 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 419 } 420 421 // We don't use Webkit's existing CORS policy implementation since 422 // their policy works in terms of origins, not sites. For example, 423 // when frame is sub.a.com and it is not allowed to access a document 424 // with sub1.a.com. But under Site Isolation, it's allowed. 
425 bool SiteIsolationPolicy::IsValidCorsHeaderSet( 426 const GURL& frame_origin, 427 const GURL& website_origin, 428 const std::string& access_control_origin) { 429 // Many websites are sending back "\"*\"" instead of "*". This is 430 // non-standard practice, and not supported by Chrome. Refer to 431 // CrossOriginAccessControl::passesAccessControlCheck(). 432 433 // TODO(dsjang): * is not allowed for the response from a request 434 // with cookies. This allows for more than what the renderer will 435 // eventually be able to receive, so we won't see illegal cross-site 436 // documents allowed by this. We have to find a way to see if this 437 // response is from a cookie-tagged request or not in the future. 438 if (access_control_origin == "*") 439 return true; 440 441 // TODO(dsjang): The CORS spec only treats a fully specified URL, except for 442 // "*", but many websites are using just a domain for access_control_origin, 443 // and this is blocked by Webkit's CORS logic here : 444 // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set 445 // is_valid() to false when it is created from a URL containing * in the 446 // domain part. 447 448 GURL cors_origin(access_control_origin); 449 return IsSameSite(frame_origin, cors_origin); 450 } 451 452 // This function is a slight modification of |net::SniffForHTML|. 453 bool SiteIsolationPolicy::SniffForHTML(StringPiece data) { 454 // The content sniffer used by Chrome and Firefox are using "<!--" 455 // as one of the HTML signatures, but it also appears in valid 456 // JavaScript, considered as well-formed JS by the browser. Since 457 // we do not want to block any JS, we exclude it from our HTML 458 // signatures. This can weaken our document block policy, but we can 459 // break less websites. 460 // TODO(dsjang): parameterize |net::SniffForHTML| with an option 461 // that decides whether to include <!-- or not, so that we can 462 // remove this function. 
463 // TODO(dsjang): Once SiteIsolationPolicy is moved into the browser 464 // process, we should do single-thread checking here for the static 465 // initializer. 466 static const StringPiece kHtmlSignatures[] = { 467 StringPiece("<!DOCTYPE html"), // HTML5 spec 468 StringPiece("<script"), // HTML5 spec, Mozilla 469 StringPiece("<html"), // HTML5 spec, Mozilla 470 StringPiece("<head"), // HTML5 spec, Mozilla 471 StringPiece("<iframe"), // Mozilla 472 StringPiece("<h1"), // Mozilla 473 StringPiece("<div"), // Mozilla 474 StringPiece("<font"), // Mozilla 475 StringPiece("<table"), // Mozilla 476 StringPiece("<a"), // Mozilla 477 StringPiece("<style"), // Mozilla 478 StringPiece("<title"), // Mozilla 479 StringPiece("<b"), // Mozilla 480 StringPiece("<body"), // Mozilla 481 StringPiece("<br"), // Mozilla 482 StringPiece("<p"), // Mozilla 483 StringPiece("<?xml") // Mozilla 484 }; 485 486 while (data.length() > 0) { 487 if (MatchesSignature( 488 data, kHtmlSignatures, arraysize(kHtmlSignatures))) 489 return true; 490 491 // If we cannot find "<!--", we fail sniffing this as HTML. 492 static const StringPiece kCommentBegins[] = { StringPiece("<!--") }; 493 if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins))) 494 break; 495 496 // Search for --> and do SniffForHTML after that. If we can find the 497 // comment's end, we start HTML sniffing from there again. 498 static const char kEndComment[] = "-->"; 499 size_t offset = data.find(kEndComment); 500 if (offset == base::StringPiece::npos) 501 break; 502 503 // Proceed to the index next to the ending comment (-->). 504 data.remove_prefix(offset + strlen(kEndComment)); 505 } 506 507 return false; 508 } 509 510 bool SiteIsolationPolicy::SniffForXML(base::StringPiece data) { 511 // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for 512 // this signature. However, XML is case-sensitive. 
Don't we have to 513 // be more lenient only to block documents starting with the exact 514 // string <?xml rather than <?XML ? 515 // TODO(dsjang): Once SiteIsolationPolicy is moved into the browser 516 // process, we should do single-thread checking here for the static 517 // initializer. 518 static const StringPiece kXmlSignatures[] = { StringPiece("<?xml") }; 519 return MatchesSignature(data, kXmlSignatures, arraysize(kXmlSignatures)); 520 } 521 522 bool SiteIsolationPolicy::SniffForJSON(base::StringPiece data) { 523 // TODO(dsjang): We have to come up with a better way to sniff 524 // JSON. However, even RE cannot help us that much due to the fact 525 // that we don't do full parsing. This DFA starts with state 0, and 526 // finds {, "/' and : in that order. We're avoiding adding a 527 // dependency on a regular expression library. 528 enum { 529 kStartState, 530 kLeftBraceState, 531 kLeftQuoteState, 532 kColonState, 533 kTerminalState, 534 } state = kStartState; 535 536 size_t length = data.length(); 537 for (size_t i = 0; i < length && state < kColonState; ++i) { 538 const char c = data[i]; 539 if (c == ' ' || c == '\t' || c == '\r' || c == '\n') 540 continue; 541 542 switch (state) { 543 case kStartState: 544 if (c == '{') 545 state = kLeftBraceState; 546 else 547 state = kTerminalState; 548 break; 549 case kLeftBraceState: 550 if (c == '\"' || c == '\'') 551 state = kLeftQuoteState; 552 else 553 state = kTerminalState; 554 break; 555 case kLeftQuoteState: 556 if (c == ':') 557 state = kColonState; 558 break; 559 case kColonState: 560 case kTerminalState: 561 NOTREACHED(); 562 break; 563 } 564 } 565 return state == kColonState; 566 } 567 568 bool SiteIsolationPolicy::SniffForJS(StringPiece data) { 569 // TODO(dsjang): This is a real hack. The only purpose of this function is to 570 // try to see if there's any possibility that this data can be JavaScript 571 // (superset of JS). This function will be removed once UMA stats are 572 // gathered. 
573 574 // Search for "var " for JS detection. 575 return data.find("var ") != base::StringPiece::npos; 576 } 577 578 } // namespace content 579