Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Detecting mime types is a tricky business because we need to balance
      6 // compatibility concerns with security issues.  Here is a survey of how other
      7 // browsers behave and then a description of how we intend to behave.
      8 //
      9 // HTML payload, no Content-Type header:
     10 // * IE 7: Render as HTML
     11 // * Firefox 2: Render as HTML
     12 // * Safari 3: Render as HTML
     13 // * Opera 9: Render as HTML
     14 //
     15 // Here the choice seems clear:
     16 // => Chrome: Render as HTML
     17 //
     18 // HTML payload, Content-Type: "text/plain":
     19 // * IE 7: Render as HTML
     20 // * Firefox 2: Render as text
     21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
     22 //                                   has an HTML extension)
     23 // * Opera 9: Render as text
     24 //
     25 // Here we choose to follow the majority (and break some compatibility with IE).
     26 // Many folks dislike IE's behavior here.
     27 // => Chrome: Render as text
     28 // We generalize this as follows.  If the Content-Type header is text/plain
     29 // we won't detect dangerous mime types (those that can execute script).
     30 //
     31 // HTML payload, Content-Type: "application/octet-stream":
     32 // * IE 7: Render as HTML
     33 // * Firefox 2: Download as application/octet-stream
     34 // * Safari 3: Render as HTML
     35 // * Opera 9: Render as HTML
     36 //
     37 // We follow Firefox.
     38 // => Chrome: Download as application/octet-stream
     39 // One factor in this decision is that IIS 4 and 5 will send
     40 // application/octet-stream for .xhtml files (because they don't recognize
     41 // the extension).  We did some experiments and it looks like this doesn't occur
     42 // very often on the web.  We choose the more secure option.
     43 //
     44 // GIF payload, no Content-Type header:
     45 // * IE 7: Render as GIF
     46 // * Firefox 2: Render as GIF
     47 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
     48 //                                        URL has a GIF extension)
     49 // * Opera 9: Render as GIF
     50 //
     51 // The choice is clear.
     52 // => Chrome: Render as GIF
     53 // Once we decide to render HTML without a Content-Type header, there isn't much
     54 // reason not to render GIFs.
     55 //
     56 // GIF payload, Content-Type: "text/plain":
     57 // * IE 7: Render as GIF
     58 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
     59 //                              Download as GIF if the URL has a GIF extension)
     60 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
     61 //                                        URL has a GIF extension)
     62 // * Opera 9: Render as GIF
     63 //
     64 // Displaying as text/plain makes little sense as the content will look like
     65 // gibberish.  Here, we could change our minds and download.
     66 // => Chrome: Render as GIF
     67 //
     68 // GIF payload, Content-Type: "application/octet-stream":
     69 // * IE 7: Render as GIF
     70 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
     71 //                              Download as GIF if the URL has a GIF extension)
     72 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
     73 //                                        URL has a GIF extension)
     74 // * Opera 9: Render as GIF
     75 //
     76 // We used to render as GIF here, but the problem is that some sites want to
     77 // trigger downloads by sending application/octet-stream (even though they
     78 // should be sending Content-Disposition: attachment).  Although it is safe
     79 // to render as GIF from a security perspective, we actually get better
     80 // compatibility if we don't sniff from application/octet-stream at all.
     81 // => Chrome: Download as application/octet-stream
     82 //
     83 // XHTML payload, Content-Type: "text/xml":
     84 // * IE 7: Render as XML
     85 // * Firefox 2: Render as HTML
     86 // * Safari 3: Render as HTML
     87 // * Opera 9: Render as HTML
     88 // The layout tests rely on us rendering this as HTML.
     89 // But we're conservative in XHTML detection, as this runs afoul of the
     90 // "don't detect dangerous mime types" rule.
     91 //
     92 // Note that our definition of HTML payload is much stricter than IE's
     93 // definition and roughly the same as Firefox's definition.
     94 
     95 #include <string>
     96 
     97 #include "net/base/mime_sniffer.h"
     98 
     99 #include "base/basictypes.h"
    100 #include "base/logging.h"
    101 #include "base/metrics/histogram.h"
    102 #include "base/string_util.h"
    103 #include "googleurl/src/gurl.h"
    104 #include "net/base/mime_util.h"
    105 
    106 namespace net {
    107 
// The number of content bytes we need to use all our magic numbers.  Feel free
// to increase this number if you add a longer magic number.
// MatchMagicNumber() DCHECKs every entry's length against this value, and
// SniffForMagicNumbers() / SniffCRX() truncate their input to it.
static const size_t kBytesRequiredForMagic = 42;

// One entry of a sniffing table: a pattern expected at the start of the
// content, and the mime type reported when the pattern matches.
struct MagicNumber {
  // Mime type assigned to the result when this entry matches.
  const char* mime_type;
  // Pattern compared against the beginning of the content.
  const char* magic;
  // Length of |magic| in bytes, not counting the terminating NUL.
  size_t magic_len;
  // True: compared case-insensitively as text.  False: compared byte by byte,
  // with '.' in |magic| matching any content byte (see MagicCmp()).
  bool is_string;
};

// Builds a byte-wise (is_string == false) table entry.  sizeof(magic)-1 drops
// the literal's terminating NUL, so |magic| must be a string literal.  The
// expansion ends with a comma, so entries are listed without separators.
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, false },

// Builds a text (is_string == true) table entry.
// Magic strings are case insensitive and must not include '\0' characters
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, true },
    125 
// Table of byte signatures tried by SniffForMagicNumbers().  Entries are
// tested in order and the first match wins, so a longer pattern has to come
// before any shorter pattern that is a prefix of it (e.g. "%!PS-Adobe-"
// before "%!").  The index of the matching entry is what gets recorded in the
// "mime_sniffer.kMagicNumbers2" histogram.
// All entries here are MAGIC_NUMBERs, so a '.' inside a pattern matches any
// content byte (see MagicCmp()).
static const MagicNumber kMagicNumbers[] = {
  // Source: HTML 5 specification
  MAGIC_NUMBER("application/pdf", "%PDF-")
  MAGIC_NUMBER("application/postscript", "%!PS-Adobe-")
  MAGIC_NUMBER("image/gif", "GIF87a")
  MAGIC_NUMBER("image/gif", "GIF89a")
  MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A")
  MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF")
  MAGIC_NUMBER("image/bmp", "BM")
  // Source: Mozilla
  MAGIC_NUMBER("text/plain", "#!")  // Script
  MAGIC_NUMBER("text/plain", "%!")  // Script, similar to PS
  MAGIC_NUMBER("text/plain", "From")
  MAGIC_NUMBER("text/plain", ">From")
  // Chrome specific
  MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08")
  // NOTE(review): \x2E is '.', which MagicCmp() treats as a wildcard, so this
  // entry effectively matches any first byte followed by "RMF" -- confirm
  // whether that is intended.
  MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46")
  MAGIC_NUMBER("video/x-ms-asf",
      "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C")
  MAGIC_NUMBER("image/tiff", "I I")
  MAGIC_NUMBER("image/tiff", "II*")
  MAGIC_NUMBER("image/tiff", "MM\x00*")
  MAGIC_NUMBER("audio/mpeg", "ID3")
  // The four '.' bytes are wildcards that skip over bytes 4-7 of the content.
  MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ")
  MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3")
  // TODO(abarth): we don't handle partial byte matches yet
  // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB")
  // MAGIC_NUMBER("audio/mpeg", "\xFF\xE")
  // MAGIC_NUMBER("audio/mpeg", "\xFF\xF")
  MAGIC_NUMBER("application/zip", "PK\x03\x04")
  MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00")
  MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A")
  MAGIC_NUMBER("application/octet-stream", "MZ")  // EXE
  // Sniffing for Flash:
  //
  //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
  //
  // Including these magic number for Flash is a trade off.
  //
  // Pros:
  //   * Flash is an important and popular file format
  //
  // Cons:
  //   * These patterns are fairly weak
  //   * If we mistakenly decide something is Flash, we will execute it
  //     in the origin of an unsuspecting site.  This could be a security
  //     vulnerability if the site allows users to upload content.
  //
  // On balance, we do not include these patterns.
};
    178 
// Our HTML sniffer differs slightly from Mozilla.  For example, Mozilla will
// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
// HTML, but we will not.

// Builds a case-insensitive "text/html" entry for the opening of |tag|, i.e.
// the pattern is "<" immediately followed by the tag name.
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)

// Patterns checked by SniffForHTML() against the first non-whitespace bytes
// of the content.  Only the prefix is compared, so an entry such as "<a" also
// matches any longer tag name that starts with the same letters.
// NOTE(review): because the first match wins, "<b" is hit before "<body" and
// "<br"; the sniffed type is the same, but those two entries look unreachable
// as far as the per-entry histogram is concerned -- confirm.
static const MagicNumber kSniffableTags[] = {
  // XML processing directive.  Although this is not an HTML mime type, we sniff
  // for this in the HTML phase because text/xml is just as powerful as HTML and
  // we want to leverage our white space skipping technology.
  MAGIC_NUMBER("text/xml", "<?xml")  // Mozilla
  // DOCTYPEs
  MAGIC_HTML_TAG("!DOCTYPE html")  // HTML5 spec
  // Sniffable tags, ordered by how often they occur in sniffable documents.
  MAGIC_HTML_TAG("script")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("html")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("!--")
  MAGIC_HTML_TAG("head")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("iframe")  // Mozilla
  MAGIC_HTML_TAG("h1")  // Mozilla
  MAGIC_HTML_TAG("div")  // Mozilla
  MAGIC_HTML_TAG("font")  // Mozilla
  MAGIC_HTML_TAG("table")  // Mozilla
  MAGIC_HTML_TAG("a")  // Mozilla
  MAGIC_HTML_TAG("style")  // Mozilla
  MAGIC_HTML_TAG("title")  // Mozilla
  MAGIC_HTML_TAG("b")  // Mozilla
  MAGIC_HTML_TAG("body")  // Mozilla
  MAGIC_HTML_TAG("br")
  MAGIC_HTML_TAG("p")  // Mozilla
};
    211 
    212 static base::Histogram* UMASnifferHistogramGet(const char* name,
    213                                                int array_size) {
    214   base::Histogram* counter =
    215       base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size,
    216       base::Histogram::kUmaTargetedHistogramFlag);
    217   return counter;
    218 }
    219 
    220 // Compare content header to a magic number where magic_entry can contain '.'
    221 // for single character of anything, allowing some bytes to be skipped.
    222 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
    223   while (len) {
    224     if ((*magic_entry != '.') && (*magic_entry != *content))
    225       return false;
    226     ++magic_entry;
    227     ++content;
    228     --len;
    229   }
    230   return true;
    231 }
    232 
    233 static bool MatchMagicNumber(const char* content, size_t size,
    234                              const MagicNumber* magic_entry,
    235                              std::string* result) {
    236   const size_t len = magic_entry->magic_len;
    237 
    238   // Keep kBytesRequiredForMagic honest.
    239   DCHECK_LE(len, kBytesRequiredForMagic);
    240 
    241   // To compare with magic strings, we need to compute strlen(content), but
    242   // content might not actually have a null terminator.  In that case, we
    243   // pretend the length is content_size.
    244   const char* end =
    245       static_cast<const char*>(memchr(content, '\0', size));
    246   const size_t content_strlen =
    247       (end != NULL) ? static_cast<size_t>(end - content) : size;
    248 
    249   bool match = false;
    250   if (magic_entry->is_string) {
    251     if (content_strlen >= len) {
    252       // String comparisons are case-insensitive
    253       match = (base::strncasecmp(magic_entry->magic, content, len) == 0);
    254     }
    255   } else {
    256     if (size >= len)
    257       match = MagicCmp(magic_entry->magic, content, len);
    258   }
    259 
    260   if (match) {
    261     result->assign(magic_entry->mime_type);
    262     return true;
    263   }
    264   return false;
    265 }
    266 
    267 static bool CheckForMagicNumbers(const char* content, size_t size,
    268                                  const MagicNumber* magic, size_t magic_len,
    269                                  base::Histogram* counter,
    270                                  std::string* result) {
    271   for (size_t i = 0; i < magic_len; ++i) {
    272     if (MatchMagicNumber(content, size, &(magic[i]), result)) {
    273       if (counter) counter->Add(static_cast<int>(i));
    274       return true;
    275     }
    276   }
    277   return false;
    278 }
    279 
    280 // Truncates |size| to |max_size| and returns true if |size| is at least
    281 // |max_size|.
    282 static bool TruncateSize(const size_t max_size, size_t* size) {
    283   // Keep kMaxBytesToSniff honest.
    284   DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
    285 
    286   if (*size >= max_size) {
    287     *size = max_size;
    288     return true;
    289   }
    290   return false;
    291 }
    292 
    293 // Returns true and sets result if the content appears to be HTML.
    294 // Clears have_enough_content if more data could possibly change the result.
    295 static bool SniffForHTML(const char* content,
    296                          size_t size,
    297                          bool* have_enough_content,
    298                          std::string* result) {
    299   // For HTML, we are willing to consider up to 512 bytes. This may be overly
    300   // conservative as IE only considers 256.
    301   *have_enough_content &= TruncateSize(512, &size);
    302 
    303   // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
    304   // but with some modifications to better match the HTML5 spec.
    305   const char* const end = content + size;
    306   const char* pos;
    307   for (pos = content; pos < end; ++pos) {
    308     if (!IsAsciiWhitespace(*pos))
    309       break;
    310   }
    311   static base::Histogram* counter(NULL);
    312   if (!counter)
    313     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2",
    314                                      arraysize(kSniffableTags));
    315   // |pos| now points to first non-whitespace character (or at end).
    316   return CheckForMagicNumbers(pos, end - pos,
    317                               kSniffableTags, arraysize(kSniffableTags),
    318                               counter, result);
    319 }
    320 
    321 // Returns true and sets result if the content matches any of kMagicNumbers.
    322 // Clears have_enough_content if more data could possibly change the result.
    323 static bool SniffForMagicNumbers(const char* content,
    324                                  size_t size,
    325                                  bool* have_enough_content,
    326                                  std::string* result) {
    327   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
    328 
    329   // Check our big table of Magic Numbers
    330   static base::Histogram* counter(NULL);
    331   if (!counter)
    332     counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
    333                                      arraysize(kMagicNumbers));
    334   return CheckForMagicNumbers(content, size,
    335                               kMagicNumbers, arraysize(kMagicNumbers),
    336                               counter, result);
    337 }
    338 
// Opening tags that SniffXML() uses to refine generic XML into a more
// specific mime type.  All entries are case-insensitive magic strings.
static const MagicNumber kMagicXML[] = {
  // We want to be very conservative in interpreting text/xml content as
  // XHTML -- we just want to sniff enough to make unit tests pass.
  // So we match explicitly on this, and don't match other ways of writing
  // it in semantically-equivalent ways.
  MAGIC_STRING("application/xhtml+xml",
               "<html xmlns=\"http://www.w3.org/1999/xhtml\"")
  MAGIC_STRING("application/atom+xml", "<feed")
  MAGIC_STRING("application/rss+xml", "<rss")
};
    350 
    351 // Returns true and sets result if the content appears to contain XHTML or a
    352 // feed.
    353 // Clears have_enough_content if more data could possibly change the result.
    354 //
    355 // TODO(evanm): this is similar but more conservative than what Safari does,
    356 // while HTML5 has a different recommendation -- what should we do?
    357 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
    358 // of ASCII -- do we care?
    359 static bool SniffXML(const char* content,
    360                      size_t size,
    361                      bool* have_enough_content,
    362                      std::string* result) {
    363   // We allow at most 300 bytes of content before we expect the opening tag.
    364   *have_enough_content &= TruncateSize(300, &size);
    365   const char* pos = content;
    366   const char* const end = content + size;
    367 
    368   // This loop iterates through tag-looking offsets in the file.
    369   // We want to skip XML processing instructions (of the form "<?xml ...")
    370   // and stop at the first "plain" tag, then make a decision on the mime-type
    371   // based on the name (or possibly attributes) of that tag.
    372   static base::Histogram* counter(NULL);
    373   if (!counter)
    374     counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2",
    375                                      arraysize(kMagicXML));
    376   const int kMaxTagIterations = 5;
    377   for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
    378     pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos));
    379     if (!pos)
    380       return false;
    381 
    382     if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) {
    383       // Skip XML declarations.
    384       ++pos;
    385       continue;
    386     } else if (base::strncasecmp(pos, "<!DOCTYPE",
    387                                  sizeof("<!DOCTYPE")-1) == 0) {
    388       // Skip DOCTYPE declarations.
    389       ++pos;
    390       continue;
    391     }
    392 
    393     if (CheckForMagicNumbers(pos, end - pos,
    394                              kMagicXML, arraysize(kMagicXML),
    395                              counter, result))
    396       return true;
    397 
    398     // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
    399     // to identify.
    400 
    401     // If we get here, we've hit an initial tag that hasn't matched one of the
    402     // above tests.  Abort.
    403     return true;
    404   }
    405 
    406   // We iterated too far without finding a start tag.
    407   // If we have more content to look at, we aren't going to change our mind by
    408   // seeing more bytes from the network.
    409   return pos < end;
    410 }
    411 
// Byte order marks.  SniffBinary() checks these first: content that starts
// with any of them is treated as text rather than binary.  The mime type
// stored in each entry is discarded by SniffBinary(), which assigns
// "text/plain" itself.
static const MagicNumber kByteOrderMark[] = {
  MAGIC_NUMBER("text/plain", "\xFE\xFF")  // UTF-16BE
  MAGIC_NUMBER("text/plain", "\xFF\xFE")  // UTF-16LE
  MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF")  // UTF-8
};
    418 
    419 // Whether a given byte looks like it might be part of binary content.
    420 // Source: HTML5 spec
    421 static char kByteLooksBinary[] = {
    422   1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  // 0x00 - 0x0F
    423   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  // 0x10 - 0x1F
    424   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
    425   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
    426   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
    427   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
    428   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
    429   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
    430   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
    431   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
    432   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
    433   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
    434   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
    435   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
    436   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
    437   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
    438 };
    439 
    440 // Returns true and sets result to "application/octet-stream" if the content
    441 // appears to be binary data. Otherwise, returns false and sets "text/plain".
    442 // Clears have_enough_content if more data could possibly change the result.
    443 static bool SniffBinary(const char* content,
    444                         size_t size,
    445                         bool* have_enough_content,
    446                         std::string* result) {
    447   // There is no concensus about exactly how to sniff for binary content.
    448   // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
    449   // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
    450   // Here, we side with FF, but with a smaller buffer. This size was chosen
    451   // because it is small enough to comfortably fit into a single packet (after
    452   // allowing for headers) and yet large enough to account for binary formats
    453   // that have a significant amount of ASCII at the beginning (crbug.com/15314).
    454   const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
    455 
    456   // First, we look for a BOM.
    457   static base::Histogram* counter(NULL);
    458   if (!counter)
    459     counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
    460                                      arraysize(kByteOrderMark));
    461   std::string unused;
    462   if (CheckForMagicNumbers(content, size,
    463                            kByteOrderMark, arraysize(kByteOrderMark),
    464                            counter, &unused)) {
    465     // If there is BOM, we think the buffer is not binary.
    466     result->assign("text/plain");
    467     return false;
    468   }
    469 
    470   // Next we look to see if any of the bytes "look binary."
    471   for (size_t i = 0; i < size; ++i) {
    472     // If we a see a binary-looking byte, we think the content is binary.
    473     if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) {
    474       result->assign("application/octet-stream");
    475       return true;
    476     }
    477   }
    478 
    479   // No evidence either way. Default to non-binary and, if truncated, clear
    480   // have_enough_content because there could be a binary looking byte in the
    481   // truncated data.
    482   *have_enough_content &= is_truncated;
    483   result->assign("text/plain");
    484   return false;
    485 }
    486 
    487 static bool IsUnknownMimeType(const std::string& mime_type) {
    488   // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
    489   // If we do, please be careful not to alter the semantics at all.
    490   static const char* kUnknownMimeTypes[] = {
    491     // Empty mime types are as unknown as they get.
    492     "",
    493     // The unknown/unknown type is popular and uninformative
    494     "unknown/unknown",
    495     // The second most popular unknown mime type is application/unknown
    496     "application/unknown",
    497     // Firefox rejects a mime type if it is exactly */*
    498     "*/*",
    499   };
    500   static base::Histogram* counter(NULL);
    501   if (!counter)
    502     counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2",
    503                                      arraysize(kUnknownMimeTypes) + 1);
    504   for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) {
    505     if (mime_type == kUnknownMimeTypes[i]) {
    506       counter->Add(i);
    507       return true;
    508     }
    509   }
    510   if (mime_type.find('/') == std::string::npos) {
    511     // Firefox rejects a mime type if it does not contain a slash
    512     counter->Add(arraysize(kUnknownMimeTypes));
    513     return true;
    514   }
    515   return false;
    516 }
    517 
    518 // Returns true and sets result if the content appears to be a crx (chrome
    519 // extension) file.
    520 // Clears have_enough_content if more data could possibly change the result.
    521 static bool SniffCRX(const char* content,
    522                      size_t size,
    523                      const GURL& url,
    524                      const std::string& type_hint,
    525                      bool* have_enough_content,
    526                      std::string* result) {
    527   static base::Histogram* counter(NULL);
    528   if (!counter)
    529     counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
    530 
    531   // Technically, the crx magic number is just Cr24, but the bytes after that
    532   // are a version number which changes infrequently. Including it in the
    533   // sniffing gives us less room for error. If the version number ever changes,
    534   // we can just add an entry to this list.
    535   //
    536   // TODO(aa): If we ever have another magic number, we'll want to pass a
    537   // histogram into CheckForMagicNumbers(), below, to see which one matched.
    538   static const struct MagicNumber kCRXMagicNumbers[] = {
    539     MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")
    540   };
    541 
    542   // Only consider files that have the extension ".crx".
    543   static const char kCRXExtension[] = ".crx";
    544   // Ignore null by subtracting 1.
    545   static const int kExtensionLength = arraysize(kCRXExtension) - 1;
    546   if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
    547       url.path().size() - kExtensionLength) {
    548     counter->Add(1);
    549   } else {
    550     return false;
    551   }
    552 
    553   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
    554   if (CheckForMagicNumbers(content, size,
    555                            kCRXMagicNumbers, arraysize(kCRXMagicNumbers),
    556                            NULL, result)) {
    557     counter->Add(2);
    558   } else {
    559     return false;
    560   }
    561 
    562   return true;
    563 }
    564 
    565 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
    566   static base::Histogram* should_sniff_counter(NULL);
    567   if (!should_sniff_counter)
    568     should_sniff_counter =
    569         UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3);
    570   // We are willing to sniff the mime type for HTTP, HTTPS, and FTP
    571   bool sniffable_scheme = url.is_empty() ||
    572                           url.SchemeIs("http") ||
    573                           url.SchemeIs("https") ||
    574                           url.SchemeIs("ftp") ||
    575                           url.SchemeIsFile();
    576   if (!sniffable_scheme) {
    577     should_sniff_counter->Add(1);
    578     return false;
    579   }
    580 
    581   static const char* kSniffableTypes[] = {
    582     // Many web servers are misconfigured to send text/plain for many
    583     // different types of content.
    584     "text/plain",
    585     // We want to sniff application/octet-stream for
    586     // application/x-chrome-extension, but nothing else.
    587     "application/octet-stream",
    588     // XHTML and Atom/RSS feeds are often served as plain xml instead of
    589     // their more specific mime types.
    590     "text/xml",
    591     "application/xml",
    592   };
    593   static base::Histogram* counter(NULL);
    594   if (!counter)
    595     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2",
    596                                      arraysize(kSniffableTypes) + 1);
    597   for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) {
    598     if (mime_type == kSniffableTypes[i]) {
    599       counter->Add(i);
    600       should_sniff_counter->Add(2);
    601       return true;
    602     }
    603   }
    604   if (IsUnknownMimeType(mime_type)) {
    605     // The web server didn't specify a content type or specified a mime
    606     // type that we ignore.
    607     counter->Add(arraysize(kSniffableTypes));
    608     should_sniff_counter->Add(2);
    609     return true;
    610   }
    611   should_sniff_counter->Add(1);
    612   return false;
    613 }
    614 
    615 bool SniffMimeType(const char* content, size_t content_size,
    616                    const GURL& url, const std::string& type_hint,
    617                    std::string* result) {
    618   DCHECK_LT(content_size, 1000000U);  // sanity check
    619   DCHECK(content);
    620   DCHECK(result);
    621 
    622   // By default, we assume we have enough content.
    623   // Each sniff routine may unset this if it wasn't provided enough content.
    624   bool have_enough_content = true;
    625 
    626   // By default, we'll return the type hint.
    627   // Each sniff routine may modify this if it has a better guess..
    628   result->assign(type_hint);
    629 
    630   // Cache information about the type_hint
    631   const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
    632 
    633   // First check for HTML
    634   if (hint_is_unknown_mime_type) {
    635     // We're only willing to sniff HTML if the server has not supplied a mime
    636     // type, or if the type it did supply indicates that it doesn't know what
    637     // the type should be.
    638     if (SniffForHTML(content, content_size, &have_enough_content, result))
    639       return true;  // We succeeded in sniffing HTML.  No more content needed.
    640   }
    641 
    642   // We're only willing to sniff for binary in 3 cases:
    643   // 1. The server has not supplied a mime type.
    644   // 2. The type it did supply indicates that it doesn't know what the type
    645   //    should be.
    646   // 3. The type is "text/plain" which is the default on some web servers and
    647   //    could be indicative of a mis-configuration that we shield the user from.
    648   const bool hint_is_text_plain = (type_hint == "text/plain");
    649   if (hint_is_unknown_mime_type || hint_is_text_plain) {
    650     if (!SniffBinary(content, content_size, &have_enough_content, result)) {
    651       // If the server said the content was text/plain and it doesn't appear
    652       // to be binary, then we trust it.
    653       if (hint_is_text_plain) {
    654         return have_enough_content;
    655       }
    656     }
    657   }
    658 
    659   // If we have plain XML, sniff XML subtypes.
    660   if (type_hint == "text/xml" || type_hint == "application/xml") {
    661     // We're not interested in sniffing these types for images and the like.
    662     // Instead, we're looking explicitly for a feed.  If we don't find one
    663     // we're done and return early.
    664     if (SniffXML(content, content_size, &have_enough_content, result))
    665       return true;
    666     return have_enough_content;
    667   }
    668 
    669   // CRX files (chrome extensions) have a special sniffing algorithm. It is
    670   // tighter than the others because we don't have to match legacy behavior.
    671   if (SniffCRX(content, content_size, url, type_hint,
    672                &have_enough_content, result))
    673     return true;
    674 
    675   // We're not interested in sniffing for magic numbers when the type_hint
    676   // is application/octet-stream.  Time to bail out.
    677   if (type_hint == "application/octet-stream")
    678     return have_enough_content;
    679 
    680   // Now we look in our large table of magic numbers to see if we can find
    681   // anything that matches the content.
    682   if (SniffForMagicNumbers(content, content_size,
    683                            &have_enough_content, result))
    684     return true;  // We've matched a magic number.  No more content needed.
    685 
    686   return have_enough_content;
    687 }
    688 
    689 }  // namespace net
    690