1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Detecting mime types is a tricky business because we need to balance 6 // compatibility concerns with security issues. Here is a survey of how other 7 // browsers behave and then a description of how we intend to behave. 8 // 9 // HTML payload, no Content-Type header: 10 // * IE 7: Render as HTML 11 // * Firefox 2: Render as HTML 12 // * Safari 3: Render as HTML 13 // * Opera 9: Render as HTML 14 // 15 // Here the choice seems clear: 16 // => Chrome: Render as HTML 17 // 18 // HTML payload, Content-Type: "text/plain": 19 // * IE 7: Render as HTML 20 // * Firefox 2: Render as text 21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL 22 // has an HTML extension) 23 // * Opera 9: Render as text 24 // 25 // Here we choose to follow the majority (and break some compatibility with IE). 26 // Many folks dislike IE's behavior here. 27 // => Chrome: Render as text 28 // We generalize this as follows. If the Content-Type header is text/plain 29 // we won't detect dangerous mime types (those that can execute script). 30 // 31 // HTML payload, Content-Type: "application/octet-stream": 32 // * IE 7: Render as HTML 33 // * Firefox 2: Download as application/octet-stream 34 // * Safari 3: Render as HTML 35 // * Opera 9: Render as HTML 36 // 37 // We follow Firefox. 38 // => Chrome: Download as application/octet-stream 39 // One factor in this decision is that IIS 4 and 5 will send 40 // application/octet-stream for .xhtml files (because they don't recognize 41 // the extension). We did some experiments and it looks like this doesn't occur 42 // very often on the web. We choose the more secure option. 
//
// GIF payload, no Content-Type header:
// * IE 7: Render as GIF
// * Firefox 2: Render as GIF
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//   URL has a GIF extension)
// * Opera 9: Render as GIF
//
// The choice is clear.
// => Chrome: Render as GIF
// Once we decide to render HTML without a Content-Type header, there isn't much
// reason not to render GIFs.
//
// GIF payload, Content-Type: "text/plain":
// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//   Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//   URL has a GIF extension)
// * Opera 9: Render as GIF
//
// Displaying as text/plain makes little sense as the content will look like
// gibberish. Here, we could change our minds and download.
// => Chrome: Render as GIF
//
// GIF payload, Content-Type: "application/octet-stream":
// * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//   Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//   URL has a GIF extension)
// * Opera 9: Render as GIF
//
// We used to render as GIF here, but the problem is that some sites want to
// trigger downloads by sending application/octet-stream (even though they
// should be sending Content-Disposition: attachment). Although it is safe
// to render as GIF from a security perspective, we actually get better
// compatibility if we don't sniff from application/octet-stream at all.
81 // => Chrome: Download as application/octet-stream 82 // 83 // XHTML payload, Content-Type: "text/xml": 84 // * IE 7: Render as XML 85 // * Firefox 2: Render as HTML 86 // * Safari 3: Render as HTML 87 // * Opera 9: Render as HTML 88 // The layout tests rely on us rendering this as HTML. 89 // But we're conservative in XHTML detection, as this runs afoul of the 90 // "don't detect dangerous mime types" rule. 91 // 92 // Note that our definition of HTML payload is much stricter than IE's 93 // definition and roughly the same as Firefox's definition. 94 95 #include <string> 96 97 #include "net/base/mime_sniffer.h" 98 99 #include "base/basictypes.h" 100 #include "base/logging.h" 101 #include "base/metrics/histogram.h" 102 #include "base/string_util.h" 103 #include "googleurl/src/gurl.h" 104 #include "net/base/mime_util.h" 105 106 namespace net { 107 108 // The number of content bytes we need to use all our magic numbers. Feel free 109 // to increase this number if you add a longer magic number. 
110 static const size_t kBytesRequiredForMagic = 42; 111 112 struct MagicNumber { 113 const char* mime_type; 114 const char* magic; 115 size_t magic_len; 116 bool is_string; 117 }; 118 119 #define MAGIC_NUMBER(mime_type, magic) \ 120 { (mime_type), (magic), sizeof(magic)-1, false }, 121 122 // Magic strings are case insensitive and must not include '\0' characters 123 #define MAGIC_STRING(mime_type, magic) \ 124 { (mime_type), (magic), sizeof(magic)-1, true }, 125 126 static const MagicNumber kMagicNumbers[] = { 127 // Source: HTML 5 specification 128 MAGIC_NUMBER("application/pdf", "%PDF-") 129 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") 130 MAGIC_NUMBER("image/gif", "GIF87a") 131 MAGIC_NUMBER("image/gif", "GIF89a") 132 MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A") 133 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") 134 MAGIC_NUMBER("image/bmp", "BM") 135 // Source: Mozilla 136 MAGIC_NUMBER("text/plain", "#!") // Script 137 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS 138 MAGIC_NUMBER("text/plain", "From") 139 MAGIC_NUMBER("text/plain", ">From") 140 // Chrome specific 141 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") 142 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") 143 MAGIC_NUMBER("video/x-ms-asf", 144 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") 145 MAGIC_NUMBER("image/tiff", "I I") 146 MAGIC_NUMBER("image/tiff", "II*") 147 MAGIC_NUMBER("image/tiff", "MM\x00*") 148 MAGIC_NUMBER("audio/mpeg", "ID3") 149 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ") 150 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") 151 // TODO(abarth): we don't handle partial byte matches yet 152 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") 153 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") 154 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") 155 MAGIC_NUMBER("application/zip", "PK\x03\x04") 156 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") 157 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") 158 
MAGIC_NUMBER("application/octet-stream", "MZ") // EXE 159 // Sniffing for Flash: 160 // 161 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") 162 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") 163 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") 164 // 165 // Including these magic number for Flash is a trade off. 166 // 167 // Pros: 168 // * Flash is an important and popular file format 169 // 170 // Cons: 171 // * These patterns are fairly weak 172 // * If we mistakenly decide something is Flash, we will execute it 173 // in the origin of an unsuspecting site. This could be a security 174 // vulnerability if the site allows users to upload content. 175 // 176 // On balance, we do not include these patterns. 177 }; 178 179 // Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will 180 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is 181 // HTML, but we will not. 182 183 #define MAGIC_HTML_TAG(tag) \ 184 MAGIC_STRING("text/html", "<" tag) 185 186 static const MagicNumber kSniffableTags[] = { 187 // XML processing directive. Although this is not an HTML mime type, we sniff 188 // for this in the HTML phase because text/xml is just as powerful as HTML and 189 // we want to leverage our white space skipping technology. 190 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla 191 // DOCTYPEs 192 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec 193 // Sniffable tags, ordered by how often they occur in sniffable documents. 
194 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla 195 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla 196 MAGIC_HTML_TAG("!--") 197 MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla 198 MAGIC_HTML_TAG("iframe") // Mozilla 199 MAGIC_HTML_TAG("h1") // Mozilla 200 MAGIC_HTML_TAG("div") // Mozilla 201 MAGIC_HTML_TAG("font") // Mozilla 202 MAGIC_HTML_TAG("table") // Mozilla 203 MAGIC_HTML_TAG("a") // Mozilla 204 MAGIC_HTML_TAG("style") // Mozilla 205 MAGIC_HTML_TAG("title") // Mozilla 206 MAGIC_HTML_TAG("b") // Mozilla 207 MAGIC_HTML_TAG("body") // Mozilla 208 MAGIC_HTML_TAG("br") 209 MAGIC_HTML_TAG("p") // Mozilla 210 }; 211 212 static base::Histogram* UMASnifferHistogramGet(const char* name, 213 int array_size) { 214 base::Histogram* counter = 215 base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, 216 base::Histogram::kUmaTargetedHistogramFlag); 217 return counter; 218 } 219 220 // Compare content header to a magic number where magic_entry can contain '.' 221 // for single character of anything, allowing some bytes to be skipped. 222 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { 223 while (len) { 224 if ((*magic_entry != '.') && (*magic_entry != *content)) 225 return false; 226 ++magic_entry; 227 ++content; 228 --len; 229 } 230 return true; 231 } 232 233 static bool MatchMagicNumber(const char* content, size_t size, 234 const MagicNumber* magic_entry, 235 std::string* result) { 236 const size_t len = magic_entry->magic_len; 237 238 // Keep kBytesRequiredForMagic honest. 239 DCHECK_LE(len, kBytesRequiredForMagic); 240 241 // To compare with magic strings, we need to compute strlen(content), but 242 // content might not actually have a null terminator. In that case, we 243 // pretend the length is content_size. 244 const char* end = 245 static_cast<const char*>(memchr(content, '\0', size)); 246 const size_t content_strlen = 247 (end != NULL) ? 
static_cast<size_t>(end - content) : size; 248 249 bool match = false; 250 if (magic_entry->is_string) { 251 if (content_strlen >= len) { 252 // String comparisons are case-insensitive 253 match = (base::strncasecmp(magic_entry->magic, content, len) == 0); 254 } 255 } else { 256 if (size >= len) 257 match = MagicCmp(magic_entry->magic, content, len); 258 } 259 260 if (match) { 261 result->assign(magic_entry->mime_type); 262 return true; 263 } 264 return false; 265 } 266 267 static bool CheckForMagicNumbers(const char* content, size_t size, 268 const MagicNumber* magic, size_t magic_len, 269 base::Histogram* counter, 270 std::string* result) { 271 for (size_t i = 0; i < magic_len; ++i) { 272 if (MatchMagicNumber(content, size, &(magic[i]), result)) { 273 if (counter) counter->Add(static_cast<int>(i)); 274 return true; 275 } 276 } 277 return false; 278 } 279 280 // Truncates |size| to |max_size| and returns true if |size| is at least 281 // |max_size|. 282 static bool TruncateSize(const size_t max_size, size_t* size) { 283 // Keep kMaxBytesToSniff honest. 284 DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff); 285 286 if (*size >= max_size) { 287 *size = max_size; 288 return true; 289 } 290 return false; 291 } 292 293 // Returns true and sets result if the content appears to be HTML. 294 // Clears have_enough_content if more data could possibly change the result. 295 static bool SniffForHTML(const char* content, 296 size_t size, 297 bool* have_enough_content, 298 std::string* result) { 299 // For HTML, we are willing to consider up to 512 bytes. This may be overly 300 // conservative as IE only considers 256. 301 *have_enough_content &= TruncateSize(512, &size); 302 303 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, 304 // but with some modifications to better match the HTML5 spec. 
305 const char* const end = content + size; 306 const char* pos; 307 for (pos = content; pos < end; ++pos) { 308 if (!IsAsciiWhitespace(*pos)) 309 break; 310 } 311 static base::Histogram* counter(NULL); 312 if (!counter) 313 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", 314 arraysize(kSniffableTags)); 315 // |pos| now points to first non-whitespace character (or at end). 316 return CheckForMagicNumbers(pos, end - pos, 317 kSniffableTags, arraysize(kSniffableTags), 318 counter, result); 319 } 320 321 // Returns true and sets result if the content matches any of kMagicNumbers. 322 // Clears have_enough_content if more data could possibly change the result. 323 static bool SniffForMagicNumbers(const char* content, 324 size_t size, 325 bool* have_enough_content, 326 std::string* result) { 327 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); 328 329 // Check our big table of Magic Numbers 330 static base::Histogram* counter(NULL); 331 if (!counter) 332 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", 333 arraysize(kMagicNumbers)); 334 return CheckForMagicNumbers(content, size, 335 kMagicNumbers, arraysize(kMagicNumbers), 336 counter, result); 337 } 338 339 // Byte order marks 340 static const MagicNumber kMagicXML[] = { 341 // We want to be very conservative in interpreting text/xml content as 342 // XHTML -- we just want to sniff enough to make unit tests pass. 343 // So we match explicitly on this, and don't match other ways of writing 344 // it in semantically-equivalent ways. 345 MAGIC_STRING("application/xhtml+xml", 346 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") 347 MAGIC_STRING("application/atom+xml", "<feed") 348 MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 349 }; 350 351 // Returns true and sets result if the content appears to contain XHTML or a 352 // feed. 353 // Clears have_enough_content if more data could possibly change the result. 
354 // 355 // TODO(evanm): this is similar but more conservative than what Safari does, 356 // while HTML5 has a different recommendation -- what should we do? 357 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset 358 // of ASCII -- do we care? 359 static bool SniffXML(const char* content, 360 size_t size, 361 bool* have_enough_content, 362 std::string* result) { 363 // We allow at most 300 bytes of content before we expect the opening tag. 364 *have_enough_content &= TruncateSize(300, &size); 365 const char* pos = content; 366 const char* const end = content + size; 367 368 // This loop iterates through tag-looking offsets in the file. 369 // We want to skip XML processing instructions (of the form "<?xml ...") 370 // and stop at the first "plain" tag, then make a decision on the mime-type 371 // based on the name (or possibly attributes) of that tag. 372 static base::Histogram* counter(NULL); 373 if (!counter) 374 counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2", 375 arraysize(kMagicXML)); 376 const int kMaxTagIterations = 5; 377 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { 378 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); 379 if (!pos) 380 return false; 381 382 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) { 383 // Skip XML declarations. 384 ++pos; 385 continue; 386 } else if (base::strncasecmp(pos, "<!DOCTYPE", 387 sizeof("<!DOCTYPE")-1) == 0) { 388 // Skip DOCTYPE declarations. 389 ++pos; 390 continue; 391 } 392 393 if (CheckForMagicNumbers(pos, end - pos, 394 kMagicXML, arraysize(kMagicXML), 395 counter, result)) 396 return true; 397 398 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult 399 // to identify. 400 401 // If we get here, we've hit an initial tag that hasn't matched one of the 402 // above tests. Abort. 403 return true; 404 } 405 406 // We iterated too far without finding a start tag. 
407 // If we have more content to look at, we aren't going to change our mind by 408 // seeing more bytes from the network. 409 return pos < end; 410 } 411 412 // Byte order marks 413 static const MagicNumber kByteOrderMark[] = { 414 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE 415 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE 416 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 417 }; 418 419 // Whether a given byte looks like it might be part of binary content. 420 // Source: HTML5 spec 421 static char kByteLooksBinary[] = { 422 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F 423 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F 424 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F 425 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F 426 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F 427 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F 428 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F 429 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F 430 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F 431 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F 432 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF 433 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF 434 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF 435 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF 436 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF 437 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF 438 }; 439 440 // Returns true and sets result to "application/octet-stream" if the content 441 // appears to be binary data. Otherwise, returns false and sets "text/plain". 442 // Clears have_enough_content if more data could possibly change the result. 
443 static bool SniffBinary(const char* content, 444 size_t size, 445 bool* have_enough_content, 446 std::string* result) { 447 // There is no concensus about exactly how to sniff for binary content. 448 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. 449 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. 450 // Here, we side with FF, but with a smaller buffer. This size was chosen 451 // because it is small enough to comfortably fit into a single packet (after 452 // allowing for headers) and yet large enough to account for binary formats 453 // that have a significant amount of ASCII at the beginning (crbug.com/15314). 454 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); 455 456 // First, we look for a BOM. 457 static base::Histogram* counter(NULL); 458 if (!counter) 459 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", 460 arraysize(kByteOrderMark)); 461 std::string unused; 462 if (CheckForMagicNumbers(content, size, 463 kByteOrderMark, arraysize(kByteOrderMark), 464 counter, &unused)) { 465 // If there is BOM, we think the buffer is not binary. 466 result->assign("text/plain"); 467 return false; 468 } 469 470 // Next we look to see if any of the bytes "look binary." 471 for (size_t i = 0; i < size; ++i) { 472 // If we a see a binary-looking byte, we think the content is binary. 473 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { 474 result->assign("application/octet-stream"); 475 return true; 476 } 477 } 478 479 // No evidence either way. Default to non-binary and, if truncated, clear 480 // have_enough_content because there could be a binary looking byte in the 481 // truncated data. 482 *have_enough_content &= is_truncated; 483 result->assign("text/plain"); 484 return false; 485 } 486 487 static bool IsUnknownMimeType(const std::string& mime_type) { 488 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. 
489 // If we do, please be careful not to alter the semantics at all. 490 static const char* kUnknownMimeTypes[] = { 491 // Empty mime types are as unknown as they get. 492 "", 493 // The unknown/unknown type is popular and uninformative 494 "unknown/unknown", 495 // The second most popular unknown mime type is application/unknown 496 "application/unknown", 497 // Firefox rejects a mime type if it is exactly */* 498 "*/*", 499 }; 500 static base::Histogram* counter(NULL); 501 if (!counter) 502 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", 503 arraysize(kUnknownMimeTypes) + 1); 504 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { 505 if (mime_type == kUnknownMimeTypes[i]) { 506 counter->Add(i); 507 return true; 508 } 509 } 510 if (mime_type.find('/') == std::string::npos) { 511 // Firefox rejects a mime type if it does not contain a slash 512 counter->Add(arraysize(kUnknownMimeTypes)); 513 return true; 514 } 515 return false; 516 } 517 518 // Returns true and sets result if the content appears to be a crx (chrome 519 // extension) file. 520 // Clears have_enough_content if more data could possibly change the result. 521 static bool SniffCRX(const char* content, 522 size_t size, 523 const GURL& url, 524 const std::string& type_hint, 525 bool* have_enough_content, 526 std::string* result) { 527 static base::Histogram* counter(NULL); 528 if (!counter) 529 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); 530 531 // Technically, the crx magic number is just Cr24, but the bytes after that 532 // are a version number which changes infrequently. Including it in the 533 // sniffing gives us less room for error. If the version number ever changes, 534 // we can just add an entry to this list. 535 // 536 // TODO(aa): If we ever have another magic number, we'll want to pass a 537 // histogram into CheckForMagicNumbers(), below, to see which one matched. 
538 static const struct MagicNumber kCRXMagicNumbers[] = { 539 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") 540 }; 541 542 // Only consider files that have the extension ".crx". 543 static const char kCRXExtension[] = ".crx"; 544 // Ignore null by subtracting 1. 545 static const int kExtensionLength = arraysize(kCRXExtension) - 1; 546 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == 547 url.path().size() - kExtensionLength) { 548 counter->Add(1); 549 } else { 550 return false; 551 } 552 553 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); 554 if (CheckForMagicNumbers(content, size, 555 kCRXMagicNumbers, arraysize(kCRXMagicNumbers), 556 NULL, result)) { 557 counter->Add(2); 558 } else { 559 return false; 560 } 561 562 return true; 563 } 564 565 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { 566 static base::Histogram* should_sniff_counter(NULL); 567 if (!should_sniff_counter) 568 should_sniff_counter = 569 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); 570 // We are willing to sniff the mime type for HTTP, HTTPS, and FTP 571 bool sniffable_scheme = url.is_empty() || 572 url.SchemeIs("http") || 573 url.SchemeIs("https") || 574 url.SchemeIs("ftp") || 575 url.SchemeIsFile(); 576 if (!sniffable_scheme) { 577 should_sniff_counter->Add(1); 578 return false; 579 } 580 581 static const char* kSniffableTypes[] = { 582 // Many web servers are misconfigured to send text/plain for many 583 // different types of content. 584 "text/plain", 585 // We want to sniff application/octet-stream for 586 // application/x-chrome-extension, but nothing else. 587 "application/octet-stream", 588 // XHTML and Atom/RSS feeds are often served as plain xml instead of 589 // their more specific mime types. 
590 "text/xml", 591 "application/xml", 592 }; 593 static base::Histogram* counter(NULL); 594 if (!counter) 595 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", 596 arraysize(kSniffableTypes) + 1); 597 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { 598 if (mime_type == kSniffableTypes[i]) { 599 counter->Add(i); 600 should_sniff_counter->Add(2); 601 return true; 602 } 603 } 604 if (IsUnknownMimeType(mime_type)) { 605 // The web server didn't specify a content type or specified a mime 606 // type that we ignore. 607 counter->Add(arraysize(kSniffableTypes)); 608 should_sniff_counter->Add(2); 609 return true; 610 } 611 should_sniff_counter->Add(1); 612 return false; 613 } 614 615 bool SniffMimeType(const char* content, size_t content_size, 616 const GURL& url, const std::string& type_hint, 617 std::string* result) { 618 DCHECK_LT(content_size, 1000000U); // sanity check 619 DCHECK(content); 620 DCHECK(result); 621 622 // By default, we assume we have enough content. 623 // Each sniff routine may unset this if it wasn't provided enough content. 624 bool have_enough_content = true; 625 626 // By default, we'll return the type hint. 627 // Each sniff routine may modify this if it has a better guess.. 628 result->assign(type_hint); 629 630 // Cache information about the type_hint 631 const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint); 632 633 // First check for HTML 634 if (hint_is_unknown_mime_type) { 635 // We're only willing to sniff HTML if the server has not supplied a mime 636 // type, or if the type it did supply indicates that it doesn't know what 637 // the type should be. 638 if (SniffForHTML(content, content_size, &have_enough_content, result)) 639 return true; // We succeeded in sniffing HTML. No more content needed. 640 } 641 642 // We're only willing to sniff for binary in 3 cases: 643 // 1. The server has not supplied a mime type. 644 // 2. 
The type it did supply indicates that it doesn't know what the type 645 // should be. 646 // 3. The type is "text/plain" which is the default on some web servers and 647 // could be indicative of a mis-configuration that we shield the user from. 648 const bool hint_is_text_plain = (type_hint == "text/plain"); 649 if (hint_is_unknown_mime_type || hint_is_text_plain) { 650 if (!SniffBinary(content, content_size, &have_enough_content, result)) { 651 // If the server said the content was text/plain and it doesn't appear 652 // to be binary, then we trust it. 653 if (hint_is_text_plain) { 654 return have_enough_content; 655 } 656 } 657 } 658 659 // If we have plain XML, sniff XML subtypes. 660 if (type_hint == "text/xml" || type_hint == "application/xml") { 661 // We're not interested in sniffing these types for images and the like. 662 // Instead, we're looking explicitly for a feed. If we don't find one 663 // we're done and return early. 664 if (SniffXML(content, content_size, &have_enough_content, result)) 665 return true; 666 return have_enough_content; 667 } 668 669 // CRX files (chrome extensions) have a special sniffing algorithm. It is 670 // tighter than the others because we don't have to match legacy behavior. 671 if (SniffCRX(content, content_size, url, type_hint, 672 &have_enough_content, result)) 673 return true; 674 675 // We're not interested in sniffing for magic numbers when the type_hint 676 // is application/octet-stream. Time to bail out. 677 if (type_hint == "application/octet-stream") 678 return have_enough_content; 679 680 // Now we look in our large table of magic numbers to see if we can find 681 // anything that matches the content. 682 if (SniffForMagicNumbers(content, content_size, 683 &have_enough_content, result)) 684 return true; // We've matched a magic number. No more content needed. 685 686 return have_enough_content; 687 } 688 689 } // namespace net 690