1 /* 2 Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies) 3 4 This library is free software; you can redistribute it and/or 5 modify it under the terms of the GNU Library General Public 6 License as published by the Free Software Foundation; either 7 version 2 of the License, or (at your option) any later version. 8 9 This library is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 Library General Public License for more details. 13 14 You should have received a copy of the GNU Library General Public License 15 along with this library; see the file COPYING.LIB. If not, write to 16 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 17 Boston, MA 02110-1301, USA. 18 */ 19 20 #include "config.h" 21 #include "MIMESniffing.h" 22 23 #include <cstring> 24 #include <stdint.h> 25 26 // MIME type sniffing implementation based on http://tools.ietf.org/html/draft-abarth-mime-sniff-06 27 28 namespace { 29 30 static inline bool isTextInList(const char* text, size_t size, const char** data) 31 { 32 for (size_t i = 0; i < size; ++i) { 33 if (!strcmp(text, data[i])) 34 return true; 35 } 36 return false; 37 38 } 39 40 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-6 41 const char* textTypes[] = { 42 "text/plain", 43 "text/plain; charset=ISO-8859-1", 44 "text/plain; charset=iso-8859-1", 45 "text/plain; charset=UTF-8" 46 }; 47 const size_t textTypesSize = sizeof(textTypes) / sizeof(textTypes[0]); 48 49 static inline bool isTextOrBinaryType(const char* type) 50 { 51 return isTextInList(type, textTypesSize, textTypes); 52 } 53 54 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-6 55 const char* unknownTypes[] = { 56 "", 57 "unknown/unknown", 58 "application/unknown", 59 "*/*" 60 }; 61 const size_t unknownTypesSize = sizeof(unknownTypes) / sizeof(unknownTypes[0]); 62 63 static inline bool isUnknownType(const char* type) 64 { 65 return isTextInList(type, unknownTypesSize, unknownTypes); 66 } 67 68 const char* xmlTypes[] = { 69 "text/xml", 70 "application/xml" 71 }; 72 const size_t xmlTypesSize = sizeof(xmlTypes) / sizeof(xmlTypes[0]); 73 74 const char xmlSuffix[] = "+xml"; 75 76 static inline bool isXMLType(const char* type) 77 { 78 const size_t xmlSuffixSize = sizeof(xmlSuffix) - 1; 79 size_t typeSize = strlen(type); 80 if (typeSize >= xmlSuffixSize && !memcmp(type + typeSize - xmlSuffixSize, xmlSuffix, xmlSuffixSize)) 81 return true; 82 83 return isTextInList(type, xmlTypesSize, xmlTypes); 84 } 85 86 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-8 87 const char binaryFlags[256] = { 88 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 89 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 90 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 91 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 92 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 93 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 98 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 104 }; 105 106 static inline bool isBinaryChar(unsigned char data) 107 { 108 return binaryFlags[data]; 109 } 110 111 static inline bool isBinaryData(const char* data, size_t size) 112 { 113 for (size_t i = 0; i < size; ++i) { 114 if (isBinaryChar(data[i])) 115 return true; 116 } 117 return false; 118 } 119 120 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-11 121 const char whiteSpaceChars[256] = { 122 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 124 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 126 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 131 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 132 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 134 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 135 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 136 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 137 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 138 }; 139 140 static inline bool isWhiteSpace(unsigned char data) 141 { 142 return whiteSpaceChars[data]; 143 } 144 145 static inline void skipWhiteSpace(const char* data, size_t& pos, size_t dataSize) 146 { 147 while (pos < dataSize && isWhiteSpace(data[pos])) 148 ++pos; 149 } 150 151 enum { 152 SkipWhiteSpace = 1, 153 TrailingSpaceOrBracket = 2 154 }; 155 156 struct MagicNumbers { 157 const char* pattern; 158 const char* mask; 159 const char* mimeType; 160 size_t size; 161 int flags; 162 }; 163 164 #define MAGIC_NUMBERS_MASKED(pattern, mask, mimeType, flags) {(pattern), (mask), (mimeType), sizeof(pattern) - 1, (flags)} 165 #define MAGIC_NUMBERS_SIMPLE(pattern, mimeType) {(pattern), 0, (mimeType), sizeof(pattern) - 1, 0} 166 167 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-12 168 const MagicNumbers securityConstrainedTypes[] = { 169 MAGIC_NUMBERS_MASKED("<!DOCTYPE HTML", "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 170 MAGIC_NUMBERS_MASKED("<HTML", "\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 171 MAGIC_NUMBERS_MASKED("<HEAD", "\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 172 MAGIC_NUMBERS_MASKED("<SCRIPT", "\xFF\xDF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 173 MAGIC_NUMBERS_MASKED("<IFRAME", "\xFF\xDF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 174 MAGIC_NUMBERS_MASKED("<H1", "\xFF\xDF\xFF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 175 MAGIC_NUMBERS_MASKED("<DIV", "\xFF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 176 MAGIC_NUMBERS_MASKED("<FONT", "\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 177 MAGIC_NUMBERS_MASKED("<TABLE", "\xFF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 178 MAGIC_NUMBERS_MASKED("<A", "\xFF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 179 MAGIC_NUMBERS_MASKED("<STYLE", "\xFF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 180 MAGIC_NUMBERS_MASKED("<TITLE", "\xFF\xDF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 181 MAGIC_NUMBERS_MASKED("<B", "\xFF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 182 MAGIC_NUMBERS_MASKED("<BODY", "\xFF\xDF\xDF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 183 MAGIC_NUMBERS_MASKED("<BR", "\xFF\xDF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 184 MAGIC_NUMBERS_MASKED("<P", "\xFF\xDF", "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 185 MAGIC_NUMBERS_MASKED("<!--", 0, "text/html", SkipWhiteSpace | TrailingSpaceOrBracket), 186 MAGIC_NUMBERS_MASKED("<?xml", 0, "text/xml", SkipWhiteSpace), 187 MAGIC_NUMBERS_SIMPLE("%PDF-", "application/pdf") 188 }; 189 const size_t securityConstrainedTypesSize = sizeof(securityConstrainedTypes) / sizeof(securityConstrainedTypes[0]); 190 191 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-8 192 const MagicNumbers bomTypes[] = { 193 MAGIC_NUMBERS_SIMPLE("\xFE\xFF", "text/plain"), // UTF-16BE BOM 194 MAGIC_NUMBERS_SIMPLE("\xFF\xFE", "text/plain"), // UTF-16LE BOM 195 MAGIC_NUMBERS_SIMPLE("\xEF\xBB\xBF", "text/plain") // UTF-8 BOM 196 }; 197 const size_t bomTypesSize = sizeof(bomTypes) / sizeof(bomTypes[0]); 198 199 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-13 200 const MagicNumbers safeTypes[] = { 201 MAGIC_NUMBERS_SIMPLE("%!PS-Adobe-", "application/postscript"), 202 MAGIC_NUMBERS_SIMPLE("\x4F\x67\x67\x53\x00", "application/ogg"), // An Ogg Vorbis audio or video signature. 203 MAGIC_NUMBERS_MASKED("RIFF\x00\x00\x00\x00WAVE", "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", "audio/x-wave", 0), // "RIFF" followed by four bytes, followed by "WAVE". 204 MAGIC_NUMBERS_SIMPLE("\x1A\x45\xDF\xA3", "video/webm"), // The WebM signature. 205 MAGIC_NUMBERS_SIMPLE("Rar!\x1A\x07\x00", "application/x-rar-compressed"), // A RAR archive. 206 MAGIC_NUMBERS_SIMPLE("\x50\x4B\x03\x04", "application/zip"), // A ZIP archive. 207 MAGIC_NUMBERS_SIMPLE("\x1F\x8B\x08", "application/x-gzip") // A GZIP archive. 208 }; 209 const size_t safeTypesSize = sizeof(safeTypes) / sizeof(safeTypes[0]); 210 211 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-16 212 const MagicNumbers imageTypes[] = { 213 MAGIC_NUMBERS_MASKED("RIFF\x00\x00\x00\x00WEBPVP", "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF", "image/webp", 0), // "RIFF" followed by four bytes, followed by "WEBPVP". 214 MAGIC_NUMBERS_SIMPLE("GIF87a", "image/gif"), 215 MAGIC_NUMBERS_SIMPLE("GIF89a", "image/gif"), 216 MAGIC_NUMBERS_SIMPLE("\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", "image/png"), 217 MAGIC_NUMBERS_SIMPLE("\xFF\xD8\xFF", "image/jpeg"), 218 MAGIC_NUMBERS_SIMPLE("BM", "image/bmp"), 219 MAGIC_NUMBERS_SIMPLE("\x00\x00\x01\x00", "image/vnd.microsoft.icon") // A Windows Icon signature. 220 }; 221 const size_t imageTypesSize = sizeof(imageTypes) / sizeof(imageTypes[0]); 222 223 static inline size_t dataSizeNeededForImageSniffing() 224 { 225 size_t result = 0; 226 for (int i = 0; i < imageTypesSize; ++i) { 227 if (imageTypes[i].size > result) 228 result = imageTypes[i].size; 229 } 230 return result; 231 } 232 233 static inline bool maskedCompare(const MagicNumbers& info, const char* data, size_t dataSize) 234 { 235 if (dataSize < info.size) 236 return false; 237 238 const uint32_t* pattern32 = reinterpret_cast<const uint32_t*>(info.pattern); 239 const uint32_t* mask32 = reinterpret_cast<const uint32_t*>(info.mask); 240 const uint32_t* data32 = reinterpret_cast<const uint32_t*>(data); 241 242 size_t count = info.size >> 2; 243 244 for (size_t i = 0; i < count; ++i) { 245 if ((*data32++ & *mask32++) != *pattern32++) 246 return false; 247 } 248 249 const char* p = reinterpret_cast<const char*>(pattern32); 250 const char* m = reinterpret_cast<const char*>(mask32); 251 const char* d = reinterpret_cast<const char*>(data32); 252 253 count = info.size & 3; 254 255 for (size_t i = 0; i < count; ++i) { 256 if ((*d++ & *m++) != *p++) 257 return false; 258 } 259 260 return true; 261 } 262 263 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-11 264 static inline bool checkSpaceOrBracket(const char* data) 265 { 266 return isWhiteSpace(*data) || *data == 0x3E; 267 } 268 269 static inline bool compare(const MagicNumbers& info, const char* data, size_t dataSize) 270 { 271 if (info.flags & SkipWhiteSpace) { 272 size_t pos = 0; 273 skipWhiteSpace(data, pos, dataSize); 274 data += pos; 275 dataSize -= pos; 276 } 277 278 bool result; 279 if (info.mask) 280 result = maskedCompare(info, data, info.size); 281 else 282 result = dataSize >= info.size && !memcmp(data, info.pattern, info.size); 283 284 return result && (!(info.flags & TrailingSpaceOrBracket) || checkSpaceOrBracket(data + info.size)); 285 } 286 287 static inline const char* findMIMEType(const char* data, size_t dataSize, const MagicNumbers* types, size_t typesCount) 288 { 289 for (size_t i = 0; i < typesCount; ++i) { 290 if (compare(types[i], data, dataSize)) 291 return types[i].mimeType; 292 } 293 return 0; 294 } 295 296 static inline const char* findSimpleMIMEType(const char* data, size_t dataSize, const MagicNumbers* types, size_t typesCount) 297 { 298 for (size_t i = 0; i < typesCount; ++i) { 299 ASSERT(!types[i].mask); 300 ASSERT(!types[i].flags); 301 302 if (dataSize >= types[i].size && !memcmp(data, types[i].pattern, types[i].size)) 303 return types[i].mimeType; 304 } 305 return 0; 306 } 307 308 bool isTypeInList(const char* type, const MagicNumbers* types, size_t typesCount) 309 { 310 for (size_t i = 0; i < typesCount; ++i) { 311 if (!strcmp(type, types[i].mimeType)) 312 return true; 313 } 314 return false; 315 } 316 317 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-8 318 static const char* internalTextOrBinaryTypeSniffingProcedure(const char* data, size_t dataSize) 319 { 320 const char* mimeType = 0; 321 322 mimeType = findSimpleMIMEType(data, dataSize, bomTypes, bomTypesSize); 323 if (mimeType) 324 return mimeType; 325 326 if (!isBinaryData(data, dataSize)) 327 return "text/plain"; 328 329 mimeType = findMIMEType(data, dataSize, safeTypes, safeTypesSize); 330 if (mimeType) 331 return mimeType; 332 333 mimeType = findMIMEType(data, dataSize, imageTypes, imageTypesSize); 334 if (mimeType) 335 return mimeType; 336 337 return "application/octet-stream"; 338 } 339 340 static const char* textOrBinaryTypeSniffingProcedure(const char* data, size_t dataSize) 341 { 342 const char* result = internalTextOrBinaryTypeSniffingProcedure(data, dataSize); 343 ASSERT(!isTypeInList(result, securityConstrainedTypes, securityConstrainedTypesSize)); 344 return result; 345 } 346 347 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-10 348 static const char* unknownTypeSniffingProcedure(const char* data, size_t dataSize) 349 { 350 const char* mimeType = 0; 351 352 mimeType = findMIMEType(data, dataSize, securityConstrainedTypes, securityConstrainedTypesSize); 353 if (mimeType) 354 return mimeType; 355 356 mimeType = findSimpleMIMEType(data, dataSize, bomTypes, bomTypesSize); 357 if (mimeType) 358 return mimeType; 359 360 mimeType = findMIMEType(data, dataSize, safeTypes, safeTypesSize); 361 if (mimeType) 362 return mimeType; 363 364 mimeType = findMIMEType(data, dataSize, imageTypes, imageTypesSize); 365 if (mimeType) 366 return mimeType; 367 368 if (!isBinaryData(data, dataSize)) 369 return "text/plain"; 370 371 return "application/octet-stream"; 372 } 373 374 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-16 375 static const char* imageTypeSniffingProcedure(const char* data, size_t dataSize) 376 { 377 return findMIMEType(data, dataSize, imageTypes, imageTypesSize); 378 } 379 380 static inline bool checkText(const char* data, size_t& pos, size_t dataSize, const char* text, size_t textSize) 381 { 382 if (dataSize - pos < textSize || memcmp(data + pos, text, textSize)) 383 return false; 384 385 pos += textSize; 386 return true; 387 } 388 389 const char rssUrl[] = "http://purl.org/rss/1.0"; 390 const char rdfUrl[] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; 391 392 static inline const char* checkRDF(const char* data, size_t pos, size_t dataSize) 393 { 394 bool isRDF = false; 395 bool isRSS = false; 396 397 while (pos <= dataSize) { 398 if (checkText(data, pos, dataSize, rssUrl, sizeof(rssUrl) - 1)) { 399 isRSS = true; 400 continue; 401 } 402 403 if (checkText(data, pos, dataSize, rdfUrl, sizeof(rdfUrl) - 1)) { 404 isRDF = true; 405 continue; 406 } 407 408 ++pos; 409 410 if (isRSS && isRDF) 411 return "application/rdf+xml"; 412 } 413 414 return 0; 415 } 416 417 static inline bool skipTag(const char*& data, size_t& pos, size_t dataSize, const char* tag, size_t tagSize, const char* tagEnd, size_t tagEndSize) 418 { 419 if (!checkText(data, pos, dataSize, tag, tagSize)) 420 return false; 421 422 while (pos < dataSize && !checkText(data, pos, dataSize, tagEnd, tagEndSize)) 423 ++pos; 424 425 return true; 426 } 427 428 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-17 429 static const char* feedTypeSniffingProcedure(const char* data, size_t dataSize) 430 { 431 size_t pos = 0; 432 433 if (dataSize >= 3 && !memcmp(data, "\xEF\xBB\xBF", 3)) 434 pos += 3; 435 436 while (pos < dataSize) { 437 skipWhiteSpace(data, pos, dataSize); 438 439 if (!skipTag(data, pos, dataSize, "<!--", 4, "-->", 3) && !skipTag(data, pos, dataSize, "<!", 2, "!>", 2) && !skipTag(data, pos, dataSize, "<?", 2, "?>", 2)) 440 break; 441 } 442 443 if (checkText(data, pos, dataSize, "<rss", 4)) 444 return "application/rss+xml"; 445 446 if (checkText(data, pos, dataSize, "<feed", 5)) 447 return "application/atom+xml"; 448 449 if (checkText(data, pos, dataSize, "<rdf:RDF", 8)) 450 return checkRDF(data, pos, dataSize); 451 452 return 0; 453 } 454 455 } 456 457 // http://tools.ietf.org/html/draft-abarth-mime-sniff-06#page-6 458 MIMESniffer::MIMESniffer(const char* advertisedMIMEType, bool isSupportedImageType) 459 : m_dataSize(0) 460 , m_function(0) 461 { 462 if (!advertisedMIMEType) { 463 m_dataSize = 512; 464 m_function = &unknownTypeSniffingProcedure; 465 return; 466 } 467 468 if (isTextOrBinaryType(advertisedMIMEType)) { 469 m_dataSize = 512; 470 m_function = &textOrBinaryTypeSniffingProcedure; 471 return; 472 } 473 474 if (isUnknownType(advertisedMIMEType)) { 475 m_dataSize = 512; 476 m_function = &unknownTypeSniffingProcedure; 477 return; 478 } 479 480 if (isXMLType(advertisedMIMEType)) 481 return; 482 483 if (isSupportedImageType) { 484 static const size_t dataSize = dataSizeNeededForImageSniffing(); 485 m_dataSize = dataSize; 486 m_function = &imageTypeSniffingProcedure; 487 return; 488 } 489 490 if (!strcmp(advertisedMIMEType, "text/html")) { 491 m_dataSize = 512; 492 m_function = &feedTypeSniffingProcedure; 493 return; 494 } 495 } 496