Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Detecting mime types is a tricky business because we need to balance
      6 // compatibility concerns with security issues.  Here is a survey of how other
      7 // browsers behave and then a description of how we intend to behave.
      8 //
      9 // HTML payload, no Content-Type header:
     10 // * IE 7: Render as HTML
     11 // * Firefox 2: Render as HTML
     12 // * Safari 3: Render as HTML
     13 // * Opera 9: Render as HTML
     14 //
     15 // Here the choice seems clear:
     16 // => Chrome: Render as HTML
     17 //
     18 // HTML payload, Content-Type: "text/plain":
     19 // * IE 7: Render as HTML
     20 // * Firefox 2: Render as text
     21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
     22 //                                   has an HTML extension)
     23 // * Opera 9: Render as text
     24 //
     25 // Here we choose to follow the majority (and break some compatibility with IE).
     26 // Many folks dislike IE's behavior here.
     27 // => Chrome: Render as text
     28 // We generalize this as follows.  If the Content-Type header is text/plain
     29 // we won't detect dangerous mime types (those that can execute script).
     30 //
     31 // HTML payload, Content-Type: "application/octet-stream":
     32 // * IE 7: Render as HTML
     33 // * Firefox 2: Download as application/octet-stream
     34 // * Safari 3: Render as HTML
     35 // * Opera 9: Render as HTML
     36 //
     37 // We follow Firefox.
     38 // => Chrome: Download as application/octet-stream
     39 // One factor in this decision is that IIS 4 and 5 will send
     40 // application/octet-stream for .xhtml files (because they don't recognize
     41 // the extension).  We did some experiments and it looks like this doesn't occur
     42 // very often on the web.  We choose the more secure option.
     43 //
     44 // GIF payload, no Content-Type header:
     45 // * IE 7: Render as GIF
     46 // * Firefox 2: Render as GIF
     47 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
     48 //                                        URL has a GIF extension)
     49 // * Opera 9: Render as GIF
     50 //
     51 // The choice is clear.
     52 // => Chrome: Render as GIF
     53 // Once we decide to render HTML without a Content-Type header, there isn't much
     54 // reason not to render GIFs.
     55 //
     56 // GIF payload, Content-Type: "text/plain":
     57 // * IE 7: Render as GIF
     58 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
     59 //                              Download as GIF if the URL has a GIF extension)
     60 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
     61 //                                        URL has a GIF extension)
     62 // * Opera 9: Render as GIF
     63 //
     64 // Displaying as text/plain makes little sense as the content will look like
     65 // gibberish.  Here, we could change our minds and download.
     66 // => Chrome: Render as GIF
     67 //
     68 // GIF payload, Content-Type: "application/octet-stream":
     69 // * IE 7: Render as GIF
     70 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
     71 //                              Download as GIF if the URL has a GIF extension)
     72 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
     73 //                                        URL has a GIF extension)
     74 // * Opera 9: Render as GIF
     75 //
     76 // We used to render as GIF here, but the problem is that some sites want to
     77 // trigger downloads by sending application/octet-stream (even though they
     78 // should be sending Content-Disposition: attachment).  Although it is safe
     79 // to render as GIF from a security perspective, we actually get better
     80 // compatibility if we don't sniff from application/octet-stream at all.
     81 // => Chrome: Download as application/octet-stream
     82 //
     83 // XHTML payload, Content-Type: "text/xml":
     84 // * IE 7: Render as XML
     85 // * Firefox 2: Render as HTML
     86 // * Safari 3: Render as HTML
     87 // * Opera 9: Render as HTML
     88 // The layout tests rely on us rendering this as HTML.
     89 // But we're conservative in XHTML detection, as this runs afoul of the
     90 // "don't detect dangerous mime types" rule.
     91 //
     92 // Note that our definition of HTML payload is much stricter than IE's
     93 // definition and roughly the same as Firefox's definition.
     94 
     95 #include <string>
     96 
     97 #include "net/base/mime_sniffer.h"
     98 
     99 #include "base/basictypes.h"
    100 #include "base/logging.h"
    101 #include "base/metrics/histogram.h"
    102 #include "base/strings/string_util.h"
    103 #include "net/base/mime_util.h"
    104 #include "url/gurl.h"
    105 
    106 namespace net {
    107 
    108 // The number of content bytes we need to use all our magic numbers.  Feel free
    109 // to increase this number if you add a longer magic number.
    110 static const size_t kBytesRequiredForMagic = 42;
    111 
    112 struct MagicNumber {
    113   const char* mime_type;
    114   const char* magic;
    115   size_t magic_len;
    116   bool is_string;
    117   const char* mask;  // if set, must have same length as |magic|
    118 };
    119 
    120 #define MAGIC_NUMBER(mime_type, magic) \
    121   { (mime_type), (magic), sizeof(magic)-1, false, NULL },
    122 
    123 template <int MagicSize, int MaskSize>
    124 class VerifySizes {
    125   COMPILE_ASSERT(MagicSize == MaskSize, sizes_must_be_equal);
    126  public:
    127   enum { SIZES = MagicSize };
    128 };
    129 
    130 #define verified_sizeof(magic, mask) \
    131 VerifySizes<sizeof(magic), sizeof(mask)>::SIZES
    132 
    133 #define MAGIC_MASK(mime_type, magic, mask) \
    134   { (mime_type), (magic), verified_sizeof(magic, mask)-1, false, (mask) },
    135 
    136 // Magic strings are case insensitive and must not include '\0' characters
    137 #define MAGIC_STRING(mime_type, magic) \
    138   { (mime_type), (magic), sizeof(magic)-1, true, NULL },
    139 
    140 static const MagicNumber kMagicNumbers[] = {
    141   // Source: HTML 5 specification
    142   MAGIC_NUMBER("application/pdf", "%PDF-")
    143   MAGIC_NUMBER("application/postscript", "%!PS-Adobe-")
    144   MAGIC_NUMBER("image/gif", "GIF87a")
    145   MAGIC_NUMBER("image/gif", "GIF89a")
    146   MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A")
    147   MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF")
    148   MAGIC_NUMBER("image/bmp", "BM")
    149   // Source: Mozilla
    150   MAGIC_NUMBER("text/plain", "#!")  // Script
    151   MAGIC_NUMBER("text/plain", "%!")  // Script, similar to PS
    152   MAGIC_NUMBER("text/plain", "From")
    153   MAGIC_NUMBER("text/plain", ">From")
    154   // Chrome specific
    155   MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08")
    156   MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46")
    157   MAGIC_NUMBER("video/x-ms-asf",
    158       "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C")
    159   MAGIC_NUMBER("image/tiff", "I I")
    160   MAGIC_NUMBER("image/tiff", "II*")
    161   MAGIC_NUMBER("image/tiff", "MM\x00*")
    162   MAGIC_NUMBER("audio/mpeg", "ID3")
    163   MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ")
    164   MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3")
    165   // TODO(abarth): we don't handle partial byte matches yet
    166   // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB")
    167   // MAGIC_NUMBER("audio/mpeg", "\xFF\xE")
    168   // MAGIC_NUMBER("audio/mpeg", "\xFF\xF")
    169   MAGIC_NUMBER("application/zip", "PK\x03\x04")
    170   MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00")
    171   MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A")
    172   MAGIC_NUMBER("application/octet-stream", "MZ")  // EXE
    173   // Sniffing for Flash:
    174   //
    175   //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
    176   //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV")
    177   //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
    178   //
    179   // Including these magic number for Flash is a trade off.
    180   //
    181   // Pros:
    182   //   * Flash is an important and popular file format
    183   //
    184   // Cons:
    185   //   * These patterns are fairly weak
    186   //   * If we mistakenly decide something is Flash, we will execute it
    187   //     in the origin of an unsuspecting site.  This could be a security
    188   //     vulnerability if the site allows users to upload content.
    189   //
    190   // On balance, we do not include these patterns.
    191 };
    192 
    193 // The number of content bytes we need to use all our Microsoft Office magic
    194 // numbers.
    195 static const size_t kBytesRequiredForOfficeMagic = 8;
    196 
    197 static const MagicNumber kOfficeMagicNumbers[] = {
    198   MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1")
    199   MAGIC_NUMBER("OOXML", "PK\x03\x04")
    200 };
    201 
    202 enum OfficeDocType {
    203   DOC_TYPE_WORD,
    204   DOC_TYPE_EXCEL,
    205   DOC_TYPE_POWERPOINT,
    206   DOC_TYPE_NONE
    207 };
    208 
    209 struct OfficeExtensionType {
    210   OfficeDocType doc_type;
    211   const char* extension;
    212   size_t extension_len;
    213 };
    214 
    215 #define OFFICE_EXTENSION(type, extension) \
    216   { (type), (extension), sizeof(extension) - 1 },
    217 
    218 static const OfficeExtensionType kOfficeExtensionTypes[] = {
    219   OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc")
    220   OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls")
    221   OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt")
    222   OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx")
    223   OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx")
    224   OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx")
    225 };
    226 
    227 static const MagicNumber kExtraMagicNumbers[] = {
    228   MAGIC_NUMBER("image/x-xbitmap", "#define")
    229   MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00")
    230   MAGIC_NUMBER("image/svg+xml", "<?xml_version=")
    231   MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ")
    232   MAGIC_NUMBER("video/avi", "RIFF....AVI LIST")
    233   MAGIC_NUMBER("audio/ogg", "OggS")
    234   MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0")
    235   MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0")
    236   MAGIC_NUMBER("video/3gpp", "....ftyp3g")
    237   MAGIC_NUMBER("video/3gpp", "....ftypavcl")
    238   MAGIC_NUMBER("video/mp4", "....ftyp")
    239   MAGIC_NUMBER("video/quicktime", "MOVI")
    240   MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
    241   MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
    242   MAGIC_NUMBER("video/x-flv", "FLV")
    243   MAGIC_NUMBER("audio/x-flac", "fLaC")
    244 
    245   // RAW image types.
    246   MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR")
    247   MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR")
    248   MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM")
    249   MAGIC_NUMBER("image/x-olympus-orf", "MMOR")  // big-endian
    250   MAGIC_NUMBER("image/x-olympus-orf", "IIRO")  // little-endian
    251   MAGIC_NUMBER("image/x-olympus-orf", "IIRS")  // little-endian
    252   MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ")
    253   MAGIC_NUMBER("image/x-panasonic-raw",
    254                "IIU\x00\x08\x00\x00\x00")  // Panasonic .raw
    255   MAGIC_NUMBER("image/x-panasonic-raw",
    256                "IIU\x00\x18\x00\x00\x00")  // Panasonic .rw2
    257   MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw")
    258   MAGIC_NUMBER("image/x-x3f", "FOVb")
    259 };
    260 
    261 // Our HTML sniffer differs slightly from Mozilla.  For example, Mozilla will
    262 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
    263 // HTML, but we will not.
    264 
    265 #define MAGIC_HTML_TAG(tag) \
    266   MAGIC_STRING("text/html", "<" tag)
    267 
    268 static const MagicNumber kSniffableTags[] = {
    269   // XML processing directive.  Although this is not an HTML mime type, we sniff
    270   // for this in the HTML phase because text/xml is just as powerful as HTML and
    271   // we want to leverage our white space skipping technology.
    272   MAGIC_NUMBER("text/xml", "<?xml")  // Mozilla
    273   // DOCTYPEs
    274   MAGIC_HTML_TAG("!DOCTYPE html")  // HTML5 spec
    275   // Sniffable tags, ordered by how often they occur in sniffable documents.
    276   MAGIC_HTML_TAG("script")  // HTML5 spec, Mozilla
    277   MAGIC_HTML_TAG("html")  // HTML5 spec, Mozilla
    278   MAGIC_HTML_TAG("!--")
    279   MAGIC_HTML_TAG("head")  // HTML5 spec, Mozilla
    280   MAGIC_HTML_TAG("iframe")  // Mozilla
    281   MAGIC_HTML_TAG("h1")  // Mozilla
    282   MAGIC_HTML_TAG("div")  // Mozilla
    283   MAGIC_HTML_TAG("font")  // Mozilla
    284   MAGIC_HTML_TAG("table")  // Mozilla
    285   MAGIC_HTML_TAG("a")  // Mozilla
    286   MAGIC_HTML_TAG("style")  // Mozilla
    287   MAGIC_HTML_TAG("title")  // Mozilla
    288   MAGIC_HTML_TAG("b")  // Mozilla
    289   MAGIC_HTML_TAG("body")  // Mozilla
    290   MAGIC_HTML_TAG("br")
    291   MAGIC_HTML_TAG("p")  // Mozilla
    292 };
    293 
    294 static base::HistogramBase* UMASnifferHistogramGet(const char* name,
    295                                                    int array_size) {
    296   base::HistogramBase* counter =
    297       base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size,
    298           base::HistogramBase::kUmaTargetedHistogramFlag);
    299   return counter;
    300 }
    301 
    302 // Compare content header to a magic number where magic_entry can contain '.'
    303 // for single character of anything, allowing some bytes to be skipped.
    304 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
    305   while (len) {
    306     if ((*magic_entry != '.') && (*magic_entry != *content))
    307       return false;
    308     ++magic_entry;
    309     ++content;
    310     --len;
    311   }
    312   return true;
    313 }
    314 
    315 // Like MagicCmp() except that it ANDs each byte with a mask before
    316 // the comparison, because there are some bits we don't care about.
    317 static bool MagicMaskCmp(const char* magic_entry,
    318                          const char* content,
    319                          size_t len,
    320                          const char* mask) {
    321   while (len) {
    322     if ((*magic_entry != '.') && (*magic_entry != (*mask & *content)))
    323       return false;
    324     ++magic_entry;
    325     ++content;
    326     ++mask;
    327     --len;
    328   }
    329   return true;
    330 }
    331 
    332 static bool MatchMagicNumber(const char* content,
    333                              size_t size,
    334                              const MagicNumber& magic_entry,
    335                              std::string* result) {
    336   const size_t len = magic_entry.magic_len;
    337 
    338   // Keep kBytesRequiredForMagic honest.
    339   DCHECK_LE(len, kBytesRequiredForMagic);
    340 
    341   // To compare with magic strings, we need to compute strlen(content), but
    342   // content might not actually have a null terminator.  In that case, we
    343   // pretend the length is content_size.
    344   const char* end = static_cast<const char*>(memchr(content, '\0', size));
    345   const size_t content_strlen =
    346       (end != NULL) ? static_cast<size_t>(end - content) : size;
    347 
    348   bool match = false;
    349   if (magic_entry.is_string) {
    350     if (content_strlen >= len) {
    351       // String comparisons are case-insensitive
    352       match = (base::strncasecmp(magic_entry.magic, content, len) == 0);
    353     }
    354   } else {
    355     if (size >= len) {
    356       if (!magic_entry.mask) {
    357         match = MagicCmp(magic_entry.magic, content, len);
    358       } else {
    359         match = MagicMaskCmp(magic_entry.magic, content, len, magic_entry.mask);
    360       }
    361     }
    362   }
    363 
    364   if (match) {
    365     result->assign(magic_entry.mime_type);
    366     return true;
    367   }
    368   return false;
    369 }
    370 
    371 static bool CheckForMagicNumbers(const char* content, size_t size,
    372                                  const MagicNumber* magic, size_t magic_len,
    373                                  base::HistogramBase* counter,
    374                                  std::string* result) {
    375   for (size_t i = 0; i < magic_len; ++i) {
    376     if (MatchMagicNumber(content, size, magic[i], result)) {
    377       if (counter) counter->Add(static_cast<int>(i));
    378       return true;
    379     }
    380   }
    381   return false;
    382 }
    383 
    384 // Truncates |size| to |max_size| and returns true if |size| is at least
    385 // |max_size|.
    386 static bool TruncateSize(const size_t max_size, size_t* size) {
    387   // Keep kMaxBytesToSniff honest.
    388   DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
    389 
    390   if (*size >= max_size) {
    391     *size = max_size;
    392     return true;
    393   }
    394   return false;
    395 }
    396 
    397 // Returns true and sets result if the content appears to be HTML.
    398 // Clears have_enough_content if more data could possibly change the result.
    399 static bool SniffForHTML(const char* content,
    400                          size_t size,
    401                          bool* have_enough_content,
    402                          std::string* result) {
    403   // For HTML, we are willing to consider up to 512 bytes. This may be overly
    404   // conservative as IE only considers 256.
    405   *have_enough_content &= TruncateSize(512, &size);
    406 
    407   // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
    408   // but with some modifications to better match the HTML5 spec.
    409   const char* const end = content + size;
    410   const char* pos;
    411   for (pos = content; pos < end; ++pos) {
    412     if (!IsAsciiWhitespace(*pos))
    413       break;
    414   }
    415   static base::HistogramBase* counter(NULL);
    416   if (!counter) {
    417     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2",
    418                                      arraysize(kSniffableTags));
    419   }
    420   // |pos| now points to first non-whitespace character (or at end).
    421   return CheckForMagicNumbers(pos, end - pos,
    422                               kSniffableTags, arraysize(kSniffableTags),
    423                               counter, result);
    424 }
    425 
    426 // Returns true and sets result if the content matches any of kMagicNumbers.
    427 // Clears have_enough_content if more data could possibly change the result.
    428 static bool SniffForMagicNumbers(const char* content,
    429                                  size_t size,
    430                                  bool* have_enough_content,
    431                                  std::string* result) {
    432   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
    433 
    434   // Check our big table of Magic Numbers
    435   static base::HistogramBase* counter(NULL);
    436   if (!counter) {
    437     counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
    438                                      arraysize(kMagicNumbers));
    439   }
    440   return CheckForMagicNumbers(content, size,
    441                               kMagicNumbers, arraysize(kMagicNumbers),
    442                               counter, result);
    443 }
    444 
    445 // Returns true and sets result if the content matches any of
    446 // kOfficeMagicNumbers, and the URL has the proper extension.
    447 // Clears |have_enough_content| if more data could possibly change the result.
    448 static bool SniffForOfficeDocs(const char* content,
    449                                size_t size,
    450                                const GURL& url,
    451                                bool* have_enough_content,
    452                                std::string* result) {
    453   *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size);
    454 
    455   // Check our table of magic numbers for Office file types.
    456   std::string office_version;
    457   if (!CheckForMagicNumbers(content, size,
    458                             kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers),
    459                             NULL, &office_version))
    460     return false;
    461 
    462   OfficeDocType type = DOC_TYPE_NONE;
    463   for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) {
    464     std::string url_path = url.path();
    465 
    466     if (url_path.length() < kOfficeExtensionTypes[i].extension_len)
    467       continue;
    468 
    469     const char* extension =
    470         &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len];
    471 
    472     if (0 == base::strncasecmp(extension, kOfficeExtensionTypes[i].extension,
    473                                kOfficeExtensionTypes[i].extension_len)) {
    474       type = kOfficeExtensionTypes[i].doc_type;
    475       break;
    476     }
    477   }
    478 
    479   if (type == DOC_TYPE_NONE)
    480     return false;
    481 
    482   if (office_version == "CFB") {
    483     switch (type) {
    484       case DOC_TYPE_WORD:
    485         *result = "application/msword";
    486         return true;
    487       case DOC_TYPE_EXCEL:
    488         *result = "application/vnd.ms-excel";
    489         return true;
    490       case DOC_TYPE_POWERPOINT:
    491         *result = "application/vnd.ms-powerpoint";
    492         return true;
    493       case DOC_TYPE_NONE:
    494         NOTREACHED();
    495         return false;
    496     }
    497   } else if (office_version == "OOXML") {
    498     switch (type) {
    499       case DOC_TYPE_WORD:
    500         *result = "application/vnd.openxmlformats-officedocument."
    501                   "wordprocessingml.document";
    502         return true;
    503       case DOC_TYPE_EXCEL:
    504         *result = "application/vnd.openxmlformats-officedocument."
    505                   "spreadsheetml.sheet";
    506         return true;
    507       case DOC_TYPE_POWERPOINT:
    508         *result = "application/vnd.openxmlformats-officedocument."
    509                   "presentationml.presentation";
    510         return true;
    511       case DOC_TYPE_NONE:
    512         NOTREACHED();
    513         return false;
    514     }
    515   }
    516 
    517   NOTREACHED();
    518   return false;
    519 }
    520 
    521 static bool IsOfficeType(const std::string& type_hint) {
    522   return (type_hint == "application/msword" ||
    523           type_hint == "application/vnd.ms-excel" ||
    524           type_hint == "application/vnd.ms-powerpoint" ||
    525           type_hint == "application/vnd.openxmlformats-officedocument."
    526                        "wordprocessingml.document" ||
    527           type_hint == "application/vnd.openxmlformats-officedocument."
    528                        "spreadsheetml.sheet" ||
    529           type_hint == "application/vnd.openxmlformats-officedocument."
    530                        "presentationml.presentation" ||
    531           type_hint == "application/vnd.ms-excel.sheet.macroenabled.12" ||
    532           type_hint == "application/vnd.ms-word.document.macroenabled.12" ||
    533           type_hint == "application/vnd.ms-powerpoint.presentation."
    534                        "macroenabled.12" ||
    535           type_hint == "application/mspowerpoint" ||
    536           type_hint == "application/msexcel" ||
    537           type_hint == "application/vnd.ms-word" ||
    538           type_hint == "application/vnd.ms-word.document.12" ||
    539           type_hint == "application/vnd.msword");
    540 }
    541 
    542 // This function checks for files that have a Microsoft Office MIME type
    543 // set, but are not actually Office files.
    544 //
    545 // If this is not actually an Office file, |*result| is set to
    546 // "application/octet-stream", otherwise it is not modified.
    547 //
    548 // Returns false if additional data is required to determine the file type, or
    549 // true if there is enough data to make a decision.
    550 static bool SniffForInvalidOfficeDocs(const char* content,
    551                                       size_t size,
    552                                       const GURL& url,
    553                                       std::string* result) {
    554   if (!TruncateSize(kBytesRequiredForOfficeMagic, &size))
    555     return false;
    556 
    557   // Check our table of magic numbers for Office file types.  If it does not
    558   // match one, the MIME type was invalid.  Set it instead to a safe value.
    559   std::string office_version;
    560   if (!CheckForMagicNumbers(content, size,
    561                             kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers),
    562                             NULL, &office_version)) {
    563     *result = "application/octet-stream";
    564   }
    565 
    566   // We have enough information to determine if this was a Microsoft Office
    567   // document or not, so sniffing is completed.
    568   return true;
    569 }
    570 
    571 // Byte order marks
    572 static const MagicNumber kMagicXML[] = {
    573   // We want to be very conservative in interpreting text/xml content as
    574   // XHTML -- we just want to sniff enough to make unit tests pass.
    575   // So we match explicitly on this, and don't match other ways of writing
    576   // it in semantically-equivalent ways.
    577   MAGIC_STRING("application/xhtml+xml",
    578                "<html xmlns=\"http://www.w3.org/1999/xhtml\"")
    579   MAGIC_STRING("application/atom+xml", "<feed")
    580   MAGIC_STRING("application/rss+xml", "<rss")  // UTF-8
    581 };
    582 
    583 // Returns true and sets result if the content appears to contain XHTML or a
    584 // feed.
    585 // Clears have_enough_content if more data could possibly change the result.
    586 //
    587 // TODO(evanm): this is similar but more conservative than what Safari does,
    588 // while HTML5 has a different recommendation -- what should we do?
    589 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
    590 // of ASCII -- do we care?
    591 static bool SniffXML(const char* content,
    592                      size_t size,
    593                      bool* have_enough_content,
    594                      std::string* result) {
    595   // We allow at most 300 bytes of content before we expect the opening tag.
    596   *have_enough_content &= TruncateSize(300, &size);
    597   const char* pos = content;
    598   const char* const end = content + size;
    599 
    600   // This loop iterates through tag-looking offsets in the file.
    601   // We want to skip XML processing instructions (of the form "<?xml ...")
    602   // and stop at the first "plain" tag, then make a decision on the mime-type
    603   // based on the name (or possibly attributes) of that tag.
    604   static base::HistogramBase* counter(NULL);
    605   if (!counter) {
    606     counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2",
    607                                      arraysize(kMagicXML));
    608   }
    609   const int kMaxTagIterations = 5;
    610   for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
    611     pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos));
    612     if (!pos)
    613       return false;
    614 
    615     if (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0) {
    616       // Skip XML declarations.
    617       ++pos;
    618       continue;
    619     } else if (base::strncasecmp(pos, "<!DOCTYPE",
    620                                  sizeof("<!DOCTYPE") - 1) == 0) {
    621       // Skip DOCTYPE declarations.
    622       ++pos;
    623       continue;
    624     }
    625 
    626     if (CheckForMagicNumbers(pos, end - pos,
    627                              kMagicXML, arraysize(kMagicXML),
    628                              counter, result))
    629       return true;
    630 
    631     // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
    632     // to identify.
    633 
    634     // If we get here, we've hit an initial tag that hasn't matched one of the
    635     // above tests.  Abort.
    636     return true;
    637   }
    638 
    639   // We iterated too far without finding a start tag.
    640   // If we have more content to look at, we aren't going to change our mind by
    641   // seeing more bytes from the network.
    642   return pos < end;
    643 }
    644 
    645 // Byte order marks
    646 static const MagicNumber kByteOrderMark[] = {
    647   MAGIC_NUMBER("text/plain", "\xFE\xFF")  // UTF-16BE
    648   MAGIC_NUMBER("text/plain", "\xFF\xFE")  // UTF-16LE
    649   MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF")  // UTF-8
    650 };
    651 
    652 // Whether a given byte looks like it might be part of binary content.
    653 // Source: HTML5 spec
    654 static char kByteLooksBinary[] = {
    655   1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  // 0x00 - 0x0F
    656   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  // 0x10 - 0x1F
    657   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
    658   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
    659   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
    660   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
    661   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
    662   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
    663   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
    664   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
    665   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
    666   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
    667   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
    668   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
    669   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
    670   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
    671 };
    672 
    673 // Returns true and sets result to "application/octet-stream" if the content
    674 // appears to be binary data. Otherwise, returns false and sets "text/plain".
    675 // Clears have_enough_content if more data could possibly change the result.
    676 static bool SniffBinary(const char* content,
    677                         size_t size,
    678                         bool* have_enough_content,
    679                         std::string* result) {
    680   // There is no concensus about exactly how to sniff for binary content.
    681   // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
    682   // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
    683   // Here, we side with FF, but with a smaller buffer. This size was chosen
    684   // because it is small enough to comfortably fit into a single packet (after
    685   // allowing for headers) and yet large enough to account for binary formats
    686   // that have a significant amount of ASCII at the beginning (crbug.com/15314).
    687   const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
    688 
    689   // First, we look for a BOM.
    690   static base::HistogramBase* counter(NULL);
    691   if (!counter) {
    692     counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
    693                                      arraysize(kByteOrderMark));
    694   }
    695   std::string unused;
    696   if (CheckForMagicNumbers(content, size,
    697                            kByteOrderMark, arraysize(kByteOrderMark),
    698                            counter, &unused)) {
    699     // If there is BOM, we think the buffer is not binary.
    700     result->assign("text/plain");
    701     return false;
    702   }
    703 
    704   // Next we look to see if any of the bytes "look binary."
    705   for (size_t i = 0; i < size; ++i) {
    706     // If we a see a binary-looking byte, we think the content is binary.
    707     if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) {
    708       result->assign("application/octet-stream");
    709       return true;
    710     }
    711   }
    712 
    713   // No evidence either way. Default to non-binary and, if truncated, clear
    714   // have_enough_content because there could be a binary looking byte in the
    715   // truncated data.
    716   *have_enough_content &= is_truncated;
    717   result->assign("text/plain");
    718   return false;
    719 }
    720 
    721 static bool IsUnknownMimeType(const std::string& mime_type) {
    722   // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
    723   // If we do, please be careful not to alter the semantics at all.
    724   static const char* kUnknownMimeTypes[] = {
    725     // Empty mime types are as unknown as they get.
    726     "",
    727     // The unknown/unknown type is popular and uninformative
    728     "unknown/unknown",
    729     // The second most popular unknown mime type is application/unknown
    730     "application/unknown",
    731     // Firefox rejects a mime type if it is exactly */*
    732     "*/*",
    733   };
    734   static base::HistogramBase* counter(NULL);
    735   if (!counter) {
    736     counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2",
    737                                      arraysize(kUnknownMimeTypes) + 1);
    738   }
    739   for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) {
    740     if (mime_type == kUnknownMimeTypes[i]) {
    741       counter->Add(i);
    742       return true;
    743     }
    744   }
    745   if (mime_type.find('/') == std::string::npos) {
    746     // Firefox rejects a mime type if it does not contain a slash
    747     counter->Add(arraysize(kUnknownMimeTypes));
    748     return true;
    749   }
    750   return false;
    751 }
    752 
    753 // Returns true and sets result if the content appears to be a crx (Chrome
    754 // extension) file.
    755 // Clears have_enough_content if more data could possibly change the result.
    756 static bool SniffCRX(const char* content,
    757                      size_t size,
    758                      const GURL& url,
    759                      const std::string& type_hint,
    760                      bool* have_enough_content,
    761                      std::string* result) {
    762   static base::HistogramBase* counter(NULL);
    763   if (!counter)
    764     counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
    765 
    766   // Technically, the crx magic number is just Cr24, but the bytes after that
    767   // are a version number which changes infrequently. Including it in the
    768   // sniffing gives us less room for error. If the version number ever changes,
    769   // we can just add an entry to this list.
    770   //
    771   // TODO(aa): If we ever have another magic number, we'll want to pass a
    772   // histogram into CheckForMagicNumbers(), below, to see which one matched.
    773   static const struct MagicNumber kCRXMagicNumbers[] = {
    774     MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")
    775   };
    776 
    777   // Only consider files that have the extension ".crx".
    778   static const char kCRXExtension[] = ".crx";
    779   // Ignore null by subtracting 1.
    780   static const int kExtensionLength = arraysize(kCRXExtension) - 1;
    781   if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
    782       url.path().size() - kExtensionLength) {
    783     counter->Add(1);
    784   } else {
    785     return false;
    786   }
    787 
    788   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
    789   if (CheckForMagicNumbers(content, size,
    790                            kCRXMagicNumbers, arraysize(kCRXMagicNumbers),
    791                            NULL, result)) {
    792     counter->Add(2);
    793   } else {
    794     return false;
    795   }
    796 
    797   return true;
    798 }
    799 
    800 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
    801   static base::HistogramBase* should_sniff_counter(NULL);
    802   if (!should_sniff_counter) {
    803     should_sniff_counter =
    804         UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3);
    805   }
    806   bool sniffable_scheme = url.is_empty() ||
    807                           url.SchemeIs("http") ||
    808                           url.SchemeIs("https") ||
    809                           url.SchemeIs("ftp") ||
    810 #if defined(OS_ANDROID)
    811                           url.SchemeIs("content") ||
    812 #endif
    813                           url.SchemeIsFile() ||
    814                           url.SchemeIsFileSystem();
    815   if (!sniffable_scheme) {
    816     should_sniff_counter->Add(1);
    817     return false;
    818   }
    819 
    820   static const char* kSniffableTypes[] = {
    821     // Many web servers are misconfigured to send text/plain for many
    822     // different types of content.
    823     "text/plain",
    824     // We want to sniff application/octet-stream for
    825     // application/x-chrome-extension, but nothing else.
    826     "application/octet-stream",
    827     // XHTML and Atom/RSS feeds are often served as plain xml instead of
    828     // their more specific mime types.
    829     "text/xml",
    830     "application/xml",
    831     // Check for false Microsoft Office MIME types.
    832     "application/msword",
    833     "application/vnd.ms-excel",
    834     "application/vnd.ms-powerpoint",
    835     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    836     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    837     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    838     "application/vnd.ms-excel.sheet.macroenabled.12",
    839     "application/vnd.ms-word.document.macroenabled.12",
    840     "application/vnd.ms-powerpoint.presentation.macroenabled.12",
    841     "application/mspowerpoint",
    842     "application/msexcel",
    843     "application/vnd.ms-word",
    844     "application/vnd.ms-word.document.12",
    845     "application/vnd.msword",
    846   };
    847   static base::HistogramBase* counter(NULL);
    848   if (!counter) {
    849     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2",
    850                                      arraysize(kSniffableTypes) + 1);
    851   }
    852   for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) {
    853     if (mime_type == kSniffableTypes[i]) {
    854       counter->Add(i);
    855       should_sniff_counter->Add(2);
    856       return true;
    857     }
    858   }
    859   if (IsUnknownMimeType(mime_type)) {
    860     // The web server didn't specify a content type or specified a mime
    861     // type that we ignore.
    862     counter->Add(arraysize(kSniffableTypes));
    863     should_sniff_counter->Add(2);
    864     return true;
    865   }
    866   should_sniff_counter->Add(1);
    867   return false;
    868 }
    869 
    870 bool SniffMimeType(const char* content,
    871                    size_t content_size,
    872                    const GURL& url,
    873                    const std::string& type_hint,
    874                    std::string* result) {
    875   DCHECK_LT(content_size, 1000000U);  // sanity check
    876   DCHECK(content);
    877   DCHECK(result);
    878 
    879   // By default, we assume we have enough content.
    880   // Each sniff routine may unset this if it wasn't provided enough content.
    881   bool have_enough_content = true;
    882 
    883   // By default, we'll return the type hint.
    884   // Each sniff routine may modify this if it has a better guess..
    885   result->assign(type_hint);
    886 
    887   // If the file has a Microsoft Office MIME type, we should only check that it
    888   // is a valid Office file.  Because this is the only reason we sniff files
    889   // with a Microsoft Office MIME type, we can return early.
    890   if (IsOfficeType(type_hint))
    891     return SniffForInvalidOfficeDocs(content, content_size, url, result);
    892 
    893   // Cache information about the type_hint
    894   const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
    895 
    896   // First check for HTML
    897   if (hint_is_unknown_mime_type) {
    898     // We're only willing to sniff HTML if the server has not supplied a mime
    899     // type, or if the type it did supply indicates that it doesn't know what
    900     // the type should be.
    901     if (SniffForHTML(content, content_size, &have_enough_content, result))
    902       return true;  // We succeeded in sniffing HTML.  No more content needed.
    903   }
    904 
    905   // We're only willing to sniff for binary in 3 cases:
    906   // 1. The server has not supplied a mime type.
    907   // 2. The type it did supply indicates that it doesn't know what the type
    908   //    should be.
    909   // 3. The type is "text/plain" which is the default on some web servers and
    910   //    could be indicative of a mis-configuration that we shield the user from.
    911   const bool hint_is_text_plain = (type_hint == "text/plain");
    912   if (hint_is_unknown_mime_type || hint_is_text_plain) {
    913     if (!SniffBinary(content, content_size, &have_enough_content, result)) {
    914       // If the server said the content was text/plain and it doesn't appear
    915       // to be binary, then we trust it.
    916       if (hint_is_text_plain) {
    917         return have_enough_content;
    918       }
    919     }
    920   }
    921 
    922   // If we have plain XML, sniff XML subtypes.
    923   if (type_hint == "text/xml" || type_hint == "application/xml") {
    924     // We're not interested in sniffing these types for images and the like.
    925     // Instead, we're looking explicitly for a feed.  If we don't find one
    926     // we're done and return early.
    927     if (SniffXML(content, content_size, &have_enough_content, result))
    928       return true;
    929     return have_enough_content;
    930   }
    931 
    932   // CRX files (Chrome extensions) have a special sniffing algorithm. It is
    933   // tighter than the others because we don't have to match legacy behavior.
    934   if (SniffCRX(content, content_size, url, type_hint,
    935                &have_enough_content, result))
    936     return true;
    937 
    938   // Check the file extension and magic numbers to see if this is an Office
    939   // document.  This needs to be checked before the general magic numbers
    940   // because zip files and Office documents (OOXML) have the same magic number.
    941   if (SniffForOfficeDocs(content, content_size, url,
    942                          &have_enough_content, result))
    943     return true;  // We've matched a magic number.  No more content needed.
    944 
    945   // We're not interested in sniffing for magic numbers when the type_hint
    946   // is application/octet-stream.  Time to bail out.
    947   if (type_hint == "application/octet-stream")
    948     return have_enough_content;
    949 
    950   // Now we look in our large table of magic numbers to see if we can find
    951   // anything that matches the content.
    952   if (SniffForMagicNumbers(content, content_size,
    953                            &have_enough_content, result))
    954     return true;  // We've matched a magic number.  No more content needed.
    955 
    956   return have_enough_content;
    957 }
    958 
    959 bool SniffMimeTypeFromLocalData(const char* content,
    960                                 size_t size,
    961                                 std::string* result) {
    962   // First check the extra table.
    963   if (CheckForMagicNumbers(content, size, kExtraMagicNumbers,
    964                            arraysize(kExtraMagicNumbers), NULL, result))
    965     return true;
    966   // Finally check the original table.
    967   return CheckForMagicNumbers(content, size, kMagicNumbers,
    968                               arraysize(kMagicNumbers), NULL, result);
    969 }
    970 
    971 }  // namespace net
    972