Home | History | Annotate | Download | only in http
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package http
      6 
      7 import (
      8 	"bytes"
      9 	"encoding/binary"
     10 )
     11 
// sniffLen is the maximum number of leading bytes of the input that the
// sniffing algorithm uses to make its decision.
const sniffLen = 512
     14 
     15 // DetectContentType implements the algorithm described
     16 // at http://mimesniff.spec.whatwg.org/ to determine the
     17 // Content-Type of the given data. It considers at most the
     18 // first 512 bytes of data. DetectContentType always returns
     19 // a valid MIME type: if it cannot determine a more specific one, it
     20 // returns "application/octet-stream".
     21 func DetectContentType(data []byte) string {
     22 	if len(data) > sniffLen {
     23 		data = data[:sniffLen]
     24 	}
     25 
     26 	// Index of the first non-whitespace byte in data.
     27 	firstNonWS := 0
     28 	for ; firstNonWS < len(data) && isWS(data[firstNonWS]); firstNonWS++ {
     29 	}
     30 
     31 	for _, sig := range sniffSignatures {
     32 		if ct := sig.match(data, firstNonWS); ct != "" {
     33 			return ct
     34 		}
     35 	}
     36 
     37 	return "application/octet-stream" // fallback
     38 }
     39 
     40 func isWS(b byte) bool {
     41 	switch b {
     42 	case '\t', '\n', '\x0c', '\r', ' ':
     43 		return true
     44 	}
     45 	return false
     46 }
     47 
// sniffSig is a single content signature that can be tested against the
// leading bytes of a payload.
type sniffSig interface {
	// match returns the MIME type of the data, or "" if unknown.
	// firstNonWS is the index of the first non-whitespace byte in data.
	match(data []byte, firstNonWS int) string
}
     52 
     53 // Data matching the table in section 6.
     54 var sniffSignatures = []sniffSig{
     55 	htmlSig("<!DOCTYPE HTML"),
     56 	htmlSig("<HTML"),
     57 	htmlSig("<HEAD"),
     58 	htmlSig("<SCRIPT"),
     59 	htmlSig("<IFRAME"),
     60 	htmlSig("<H1"),
     61 	htmlSig("<DIV"),
     62 	htmlSig("<FONT"),
     63 	htmlSig("<TABLE"),
     64 	htmlSig("<A"),
     65 	htmlSig("<STYLE"),
     66 	htmlSig("<TITLE"),
     67 	htmlSig("<B"),
     68 	htmlSig("<BODY"),
     69 	htmlSig("<BR"),
     70 	htmlSig("<P"),
     71 	htmlSig("<!--"),
     72 
     73 	&maskedSig{mask: []byte("\xFF\xFF\xFF\xFF\xFF"), pat: []byte("<?xml"), skipWS: true, ct: "text/xml; charset=utf-8"},
     74 
     75 	&exactSig{[]byte("%PDF-"), "application/pdf"},
     76 	&exactSig{[]byte("%!PS-Adobe-"), "application/postscript"},
     77 
     78 	// UTF BOMs.
     79 	&maskedSig{mask: []byte("\xFF\xFF\x00\x00"), pat: []byte("\xFE\xFF\x00\x00"), ct: "text/plain; charset=utf-16be"},
     80 	&maskedSig{mask: []byte("\xFF\xFF\x00\x00"), pat: []byte("\xFF\xFE\x00\x00"), ct: "text/plain; charset=utf-16le"},
     81 	&maskedSig{mask: []byte("\xFF\xFF\xFF\x00"), pat: []byte("\xEF\xBB\xBF\x00"), ct: "text/plain; charset=utf-8"},
     82 
     83 	&exactSig{[]byte("GIF87a"), "image/gif"},
     84 	&exactSig{[]byte("GIF89a"), "image/gif"},
     85 	&exactSig{[]byte("\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"), "image/png"},
     86 	&exactSig{[]byte("\xFF\xD8\xFF"), "image/jpeg"},
     87 	&exactSig{[]byte("BM"), "image/bmp"},
     88 	&maskedSig{
     89 		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF"),
     90 		pat:  []byte("RIFF\x00\x00\x00\x00WEBPVP"),
     91 		ct:   "image/webp",
     92 	},
     93 	&exactSig{[]byte("\x00\x00\x01\x00"), "image/vnd.microsoft.icon"},
     94 
     95 	&maskedSig{
     96 		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
     97 		pat:  []byte("RIFF\x00\x00\x00\x00WAVE"),
     98 		ct:   "audio/wave",
     99 	},
    100 	&maskedSig{
    101 		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
    102 		pat:  []byte("FORM\x00\x00\x00\x00AIFF"),
    103 		ct:   "audio/aiff",
    104 	},
    105 	&maskedSig{
    106 		mask: []byte("\xFF\xFF\xFF\xFF"),
    107 		pat:  []byte(".snd"),
    108 		ct:   "audio/basic",
    109 	},
    110 	&maskedSig{
    111 		mask: []byte("\xFF\xFF\xFF\xFF\xFF"),
    112 		pat:  []byte("OggS\x00"),
    113 		ct:   "application/ogg",
    114 	},
    115 	&maskedSig{
    116 		mask: []byte("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"),
    117 		pat:  []byte("MThd\x00\x00\x00\x06"),
    118 		ct:   "audio/midi",
    119 	},
    120 	&maskedSig{
    121 		mask: []byte("\xFF\xFF\xFF"),
    122 		pat:  []byte("ID3"),
    123 		ct:   "audio/mpeg",
    124 	},
    125 	&maskedSig{
    126 		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
    127 		pat:  []byte("RIFF\x00\x00\x00\x00AVI "),
    128 		ct:   "video/avi",
    129 	},
    130 
    131 	// Fonts
    132 	&maskedSig{
    133 		// 34 NULL bytes followed by the string "LP"
    134 		pat: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x4C\x50"),
    135 		// 34 NULL bytes followed by \xF\xF
    136 		mask: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF"),
    137 		ct:   "application/vnd.ms-fontobject",
    138 	},
    139 	&exactSig{[]byte("\x00\x01\x00\x00"), "application/font-ttf"},
    140 	&exactSig{[]byte("OTTO"), "application/font-off"},
    141 	&exactSig{[]byte("ttcf"), "application/font-cff"},
    142 	&exactSig{[]byte("wOFF"), "application/font-woff"},
    143 
    144 	&exactSig{[]byte("\x1A\x45\xDF\xA3"), "video/webm"},
    145 	&exactSig{[]byte("\x52\x61\x72\x20\x1A\x07\x00"), "application/x-rar-compressed"},
    146 	&exactSig{[]byte("\x50\x4B\x03\x04"), "application/zip"},
    147 	&exactSig{[]byte("\x1F\x8B\x08"), "application/x-gzip"},
    148 
    149 	mp4Sig{},
    150 
    151 	textSig{}, // should be last
    152 }
    153 
    154 type exactSig struct {
    155 	sig []byte
    156 	ct  string
    157 }
    158 
    159 func (e *exactSig) match(data []byte, firstNonWS int) string {
    160 	if bytes.HasPrefix(data, e.sig) {
    161 		return e.ct
    162 	}
    163 	return ""
    164 }
    165 
    166 type maskedSig struct {
    167 	mask, pat []byte
    168 	skipWS    bool
    169 	ct        string
    170 }
    171 
    172 func (m *maskedSig) match(data []byte, firstNonWS int) string {
    173 	// pattern matching algorithm section 6
    174 	// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
    175 
    176 	if m.skipWS {
    177 		data = data[firstNonWS:]
    178 	}
    179 	if len(m.pat) != len(m.mask) {
    180 		return ""
    181 	}
    182 	if len(data) < len(m.mask) {
    183 		return ""
    184 	}
    185 	for i, mask := range m.mask {
    186 		db := data[i] & mask
    187 		if db != m.pat[i] {
    188 			return ""
    189 		}
    190 	}
    191 	return m.ct
    192 }
    193 
    194 type htmlSig []byte
    195 
    196 func (h htmlSig) match(data []byte, firstNonWS int) string {
    197 	data = data[firstNonWS:]
    198 	if len(data) < len(h)+1 {
    199 		return ""
    200 	}
    201 	for i, b := range h {
    202 		db := data[i]
    203 		if 'A' <= b && b <= 'Z' {
    204 			db &= 0xDF
    205 		}
    206 		if b != db {
    207 			return ""
    208 		}
    209 	}
    210 	// Next byte must be space or right angle bracket.
    211 	if db := data[len(h)]; db != ' ' && db != '>' {
    212 		return ""
    213 	}
    214 	return "text/html; charset=utf-8"
    215 }
    216 
    217 var mp4ftype = []byte("ftyp")
    218 var mp4 = []byte("mp4")
    219 
    220 type mp4Sig struct{}
    221 
    222 func (mp4Sig) match(data []byte, firstNonWS int) string {
    223 	// https://mimesniff.spec.whatwg.org/#signature-for-mp4
    224 	// c.f. section 6.2.1
    225 	if len(data) < 12 {
    226 		return ""
    227 	}
    228 	boxSize := int(binary.BigEndian.Uint32(data[:4]))
    229 	if boxSize%4 != 0 || len(data) < boxSize {
    230 		return ""
    231 	}
    232 	if !bytes.Equal(data[4:8], mp4ftype) {
    233 		return ""
    234 	}
    235 	for st := 8; st < boxSize; st += 4 {
    236 		if st == 12 {
    237 			// minor version number
    238 			continue
    239 		}
    240 		if bytes.Equal(data[st:st+3], mp4) {
    241 			return "video/mp4"
    242 		}
    243 	}
    244 	return ""
    245 }
    246 
    247 type textSig struct{}
    248 
    249 func (textSig) match(data []byte, firstNonWS int) string {
    250 	// c.f. section 5, step 4.
    251 	for _, b := range data[firstNonWS:] {
    252 		switch {
    253 		case b <= 0x08,
    254 			b == 0x0B,
    255 			0x0E <= b && b <= 0x1A,
    256 			0x1C <= b && b <= 0x1F:
    257 			return ""
    258 		}
    259 	}
    260 	return "text/plain; charset=utf-8"
    261 }
    262