Home | History | Annotate | Download | only in Lib
      1 """Guess the MIME type of a file.
      2 
      3 This module defines two useful functions:
      4 
      5 guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.
      6 
      7 guess_extension(type, strict=True) -- guess the extension for a given MIME type.
      8 
      9 It also contains the following, for tuning the behavior:
     10 
     11 Data:
     12 
     13 knownfiles -- list of files to parse
     14 inited -- flag set when init() has been called
     15 suffix_map -- dictionary mapping suffixes to suffixes
     16 encodings_map -- dictionary mapping suffixes to encodings
     17 types_map -- dictionary mapping suffixes to types
     18 
     19 Functions:
     20 
     21 init([files]) -- parse a list of files, default knownfiles (on Windows, the
     22   default values are taken from the registry)
     23 read_mime_types(file) -- parse one file, return a dictionary or None
     24 """
     25 
     26 import os
     27 import sys
     28 import posixpath
     29 import urllib.parse
     30 try:
     31     import winreg as _winreg
     32 except ImportError:
     33     _winreg = None
     34 
# Public names of this module.
__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

# Candidate mime.types files.  init() reads every entry that exists, in
# this order; because add_type() replaces the type of an already known
# extension, a mapping read from a later file overrides an earlier one.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    # NOTE: same path as two entries above, so that file is read twice
    # (before and after the netscape file).
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to True by init(); MimeTypes.__init__() checks it so that it does not
# call init() again.
inited = False
# Module-wide MimeTypes instance assigned at the end of init(); None until
# then.  The module-level helper functions delegate to it.
_db = None
     56 
     57 
     58 class MimeTypes:
     59     """MIME-types datastore.
     60 
     61     This datastore can handle information from mime.types-style files
     62     and supports basic determination of MIME type from a filename or
     63     URL, and can guess a reasonable extension given a MIME type.
     64     """
     65 
     66     def __init__(self, filenames=(), strict=True):
     67         if not inited:
     68             init()
     69         self.encodings_map = encodings_map.copy()
     70         self.suffix_map = suffix_map.copy()
     71         self.types_map = ({}, {}) # dict for (non-strict, strict)
     72         self.types_map_inv = ({}, {})
     73         for (ext, type) in types_map.items():
     74             self.add_type(type, ext, True)
     75         for (ext, type) in common_types.items():
     76             self.add_type(type, ext, False)
     77         for name in filenames:
     78             self.read(name, strict)
     79 
     80     def add_type(self, type, ext, strict=True):
     81         """Add a mapping between a type and an extension.
     82 
     83         When the extension is already known, the new
     84         type will replace the old one. When the type
     85         is already known the extension will be added
     86         to the list of known extensions.
     87 
     88         If strict is true, information will be added to
     89         list of standard types, else to the list of non-standard
     90         types.
     91         """
     92         self.types_map[strict][ext] = type
     93         exts = self.types_map_inv[strict].setdefault(type, [])
     94         if ext not in exts:
     95             exts.append(ext)
     96 
     97     def guess_type(self, url, strict=True):
     98         """Guess the type of a file based on its URL.
     99 
    100         Return value is a tuple (type, encoding) where type is None if
    101         the type can't be guessed (no or unknown suffix) or a string
    102         of the form type/subtype, usable for a MIME Content-type
    103         header; and encoding is None for no encoding or the name of
    104         the program used to encode (e.g. compress or gzip).  The
    105         mappings are table driven.  Encoding suffixes are case
    106         sensitive; type suffixes are first tried case sensitive, then
    107         case insensitive.
    108 
    109         The suffixes .tgz, .taz and .tz (case sensitive!) are all
    110         mapped to '.tar.gz'.  (This is table-driven too, using the
    111         dictionary suffix_map.)
    112 
    113         Optional `strict' argument when False adds a bunch of commonly found,
    114         but non-standard types.
    115         """
    116         scheme, url = urllib.parse.splittype(url)
    117         if scheme == 'data':
    118             # syntax of data URLs:
    119             # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    120             # mediatype := [ type "/" subtype ] *( ";" parameter )
    121             # data      := *urlchar
    122             # parameter := attribute "=" value
    123             # type/subtype defaults to "text/plain"
    124             comma = url.find(',')
    125             if comma < 0:
    126                 # bad data URL
    127                 return None, None
    128             semi = url.find(';', 0, comma)
    129             if semi >= 0:
    130                 type = url[:semi]
    131             else:
    132                 type = url[:comma]
    133             if '=' in type or '/' not in type:
    134                 type = 'text/plain'
    135             return type, None           # never compressed, so encoding is None
    136         base, ext = posixpath.splitext(url)
    137         while ext in self.suffix_map:
    138             base, ext = posixpath.splitext(base + self.suffix_map[ext])
    139         if ext in self.encodings_map:
    140             encoding = self.encodings_map[ext]
    141             base, ext = posixpath.splitext(base)
    142         else:
    143             encoding = None
    144         types_map = self.types_map[True]
    145         if ext in types_map:
    146             return types_map[ext], encoding
    147         elif ext.lower() in types_map:
    148             return types_map[ext.lower()], encoding
    149         elif strict:
    150             return None, encoding
    151         types_map = self.types_map[False]
    152         if ext in types_map:
    153             return types_map[ext], encoding
    154         elif ext.lower() in types_map:
    155             return types_map[ext.lower()], encoding
    156         else:
    157             return None, encoding
    158 
    159     def guess_all_extensions(self, type, strict=True):
    160         """Guess the extensions for a file based on its MIME type.
    161 
    162         Return value is a list of strings giving the possible filename
    163         extensions, including the leading dot ('.').  The extension is not
    164         guaranteed to have been associated with any particular data stream,
    165         but would be mapped to the MIME type `type' by guess_type().
    166 
    167         Optional `strict' argument when false adds a bunch of commonly found,
    168         but non-standard types.
    169         """
    170         type = type.lower()
    171         extensions = self.types_map_inv[True].get(type, [])
    172         if not strict:
    173             for ext in self.types_map_inv[False].get(type, []):
    174                 if ext not in extensions:
    175                     extensions.append(ext)
    176         return extensions
    177 
    178     def guess_extension(self, type, strict=True):
    179         """Guess the extension for a file based on its MIME type.
    180 
    181         Return value is a string giving a filename extension,
    182         including the leading dot ('.').  The extension is not
    183         guaranteed to have been associated with any particular data
    184         stream, but would be mapped to the MIME type `type' by
    185         guess_type().  If no extension can be guessed for `type', None
    186         is returned.
    187 
    188         Optional `strict' argument when false adds a bunch of commonly found,
    189         but non-standard types.
    190         """
    191         extensions = self.guess_all_extensions(type, strict)
    192         if not extensions:
    193             return None
    194         return extensions[0]
    195 
    196     def read(self, filename, strict=True):
    197         """
    198         Read a single mime.types-format file, specified by pathname.
    199 
    200         If strict is true, information will be added to
    201         list of standard types, else to the list of non-standard
    202         types.
    203         """
    204         with open(filename, encoding='utf-8') as fp:
    205             self.readfp(fp, strict)
    206 
    207     def readfp(self, fp, strict=True):
    208         """
    209         Read a single mime.types-format file.
    210 
    211         If strict is true, information will be added to
    212         list of standard types, else to the list of non-standard
    213         types.
    214         """
    215         while 1:
    216             line = fp.readline()
    217             if not line:
    218                 break
    219             words = line.split()
    220             for i in range(len(words)):
    221                 if words[i][0] == '#':
    222                     del words[i:]
    223                     break
    224             if not words:
    225                 continue
    226             type, suffixes = words[0], words[1:]
    227             for suff in suffixes:
    228                 self.add_type(type, '.' + suff, strict)
    229 
    230     def read_windows_registry(self, strict=True):
    231         """
    232         Load the MIME types database from Windows registry.
    233 
    234         If strict is true, information will be added to
    235         list of standard types, else to the list of non-standard
    236         types.
    237         """
    238 
    239         # Windows only
    240         if not _winreg:
    241             return
    242 
    243         def enum_types(mimedb):
    244             i = 0
    245             while True:
    246                 try:
    247                     ctype = _winreg.EnumKey(mimedb, i)
    248                 except EnvironmentError:
    249                     break
    250                 else:
    251                     if '\0' not in ctype:
    252                         yield ctype
    253                 i += 1
    254 
    255         with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
    256             for subkeyname in enum_types(hkcr):
    257                 try:
    258                     with _winreg.OpenKey(hkcr, subkeyname) as subkey:
    259                         # Only check file extensions
    260                         if not subkeyname.startswith("."):
    261                             continue
    262                         # raises EnvironmentError if no 'Content Type' value
    263                         mimetype, datatype = _winreg.QueryValueEx(
    264                             subkey, 'Content Type')
    265                         if datatype != _winreg.REG_SZ:
    266                             continue
    267                         self.add_type(mimetype, subkeyname, strict)
    268                 except EnvironmentError:
    269                     continue
    270 
    271 def guess_type(url, strict=True):
    272     """Guess the type of a file based on its URL.
    273 
    274     Return value is a tuple (type, encoding) where type is None if the
    275     type can't be guessed (no or unknown suffix) or a string of the
    276     form type/subtype, usable for a MIME Content-type header; and
    277     encoding is None for no encoding or the name of the program used
    278     to encode (e.g. compress or gzip).  The mappings are table
    279     driven.  Encoding suffixes are case sensitive; type suffixes are
    280     first tried case sensitive, then case insensitive.
    281 
    282     The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    283     to ".tar.gz".  (This is table-driven too, using the dictionary
    284     suffix_map).
    285 
    286     Optional `strict' argument when false adds a bunch of commonly found, but
    287     non-standard types.
    288     """
    289     if _db is None:
    290         init()
    291     return _db.guess_type(url, strict)
    292 
    293 
    294 def guess_all_extensions(type, strict=True):
    295     """Guess the extensions for a file based on its MIME type.
    296 
    297     Return value is a list of strings giving the possible filename
    298     extensions, including the leading dot ('.').  The extension is not
    299     guaranteed to have been associated with any particular data
    300     stream, but would be mapped to the MIME type `type' by
    301     guess_type().  If no extension can be guessed for `type', None
    302     is returned.
    303 
    304     Optional `strict' argument when false adds a bunch of commonly found,
    305     but non-standard types.
    306     """
    307     if _db is None:
    308         init()
    309     return _db.guess_all_extensions(type, strict)
    310 
    311 def guess_extension(type, strict=True):
    312     """Guess the extension for a file based on its MIME type.
    313 
    314     Return value is a string giving a filename extension, including the
    315     leading dot ('.').  The extension is not guaranteed to have been
    316     associated with any particular data stream, but would be mapped to the
    317     MIME type `type' by guess_type().  If no extension can be guessed for
    318     `type', None is returned.
    319 
    320     Optional `strict' argument when false adds a bunch of commonly found,
    321     but non-standard types.
    322     """
    323     if _db is None:
    324         init()
    325     return _db.guess_extension(type, strict)
    326 
    327 def add_type(type, ext, strict=True):
    328     """Add a mapping between a type and an extension.
    329 
    330     When the extension is already known, the new
    331     type will replace the old one. When the type
    332     is already known the extension will be added
    333     to the list of known extensions.
    334 
    335     If strict is true, information will be added to
    336     list of standard types, else to the list of non-standard
    337     types.
    338     """
    339     if _db is None:
    340         init()
    341     return _db.add_type(type, ext, strict)
    342 
    343 
    344 def init(files=None):
    345     global suffix_map, types_map, encodings_map, common_types
    346     global inited, _db
    347     inited = True    # so that MimeTypes.__init__() doesn't call us again
    348     db = MimeTypes()
    349     if files is None:
    350         if _winreg:
    351             db.read_windows_registry()
    352         files = knownfiles
    353     for file in files:
    354         if os.path.isfile(file):
    355             db.read(file)
    356     encodings_map = db.encodings_map
    357     suffix_map = db.suffix_map
    358     types_map = db.types_map[True]
    359     common_types = db.types_map[False]
    360     # Make the DB a global variable now that it is fully initialized
    361     _db = db
    362 
    363 
    364 def read_mime_types(file):
    365     try:
    366         f = open(file)
    367     except OSError:
    368         return None
    369     with f:
    370         db = MimeTypes()
    371         db.readfp(f, True)
    372         return db.types_map[True]
    373 
    374 
    375 def _default_mime_types():
    376     global suffix_map
    377     global encodings_map
    378     global types_map
    379     global common_types
    380 
    381     suffix_map = {
    382         '.svgz': '.svg.gz',
    383         '.tgz': '.tar.gz',
    384         '.taz': '.tar.gz',
    385         '.tz': '.tar.gz',
    386         '.tbz2': '.tar.bz2',
    387         '.txz': '.tar.xz',
    388         }
    389 
    390     encodings_map = {
    391         '.gz': 'gzip',
    392         '.Z': 'compress',
    393         '.bz2': 'bzip2',
    394         '.xz': 'xz',
    395         }
    396 
    397     # Before adding new types, make sure they are either registered with IANA,
    398     # at http://www.iana.org/assignments/media-types
    399     # or extensions, i.e. using the x- prefix
    400 
    401     # If you add to these, please keep them sorted!
    402     types_map = {
    403         '.a'      : 'application/octet-stream',
    404         '.ai'     : 'application/postscript',
    405         '.aif'    : 'audio/x-aiff',
    406         '.aifc'   : 'audio/x-aiff',
    407         '.aiff'   : 'audio/x-aiff',
    408         '.au'     : 'audio/basic',
    409         '.avi'    : 'video/x-msvideo',
    410         '.bat'    : 'text/plain',
    411         '.bcpio'  : 'application/x-bcpio',
    412         '.bin'    : 'application/octet-stream',
    413         '.bmp'    : 'image/x-ms-bmp',
    414         '.c'      : 'text/plain',
    415         # Duplicates :(
    416         '.cdf'    : 'application/x-cdf',
    417         '.cdf'    : 'application/x-netcdf',
    418         '.cpio'   : 'application/x-cpio',
    419         '.csh'    : 'application/x-csh',
    420         '.css'    : 'text/css',
    421         '.csv'    : 'text/csv',
    422         '.dll'    : 'application/octet-stream',
    423         '.doc'    : 'application/msword',
    424         '.dot'    : 'application/msword',
    425         '.dvi'    : 'application/x-dvi',
    426         '.eml'    : 'message/rfc822',
    427         '.eps'    : 'application/postscript',
    428         '.etx'    : 'text/x-setext',
    429         '.exe'    : 'application/octet-stream',
    430         '.gif'    : 'image/gif',
    431         '.gtar'   : 'application/x-gtar',
    432         '.h'      : 'text/plain',
    433         '.hdf'    : 'application/x-hdf',
    434         '.htm'    : 'text/html',
    435         '.html'   : 'text/html',
    436         '.ico'    : 'image/vnd.microsoft.icon',
    437         '.ief'    : 'image/ief',
    438         '.jpe'    : 'image/jpeg',
    439         '.jpeg'   : 'image/jpeg',
    440         '.jpg'    : 'image/jpeg',
    441         '.js'     : 'application/javascript',
    442         '.ksh'    : 'text/plain',
    443         '.latex'  : 'application/x-latex',
    444         '.m1v'    : 'video/mpeg',
    445         '.m3u'    : 'application/vnd.apple.mpegurl',
    446         '.m3u8'   : 'application/vnd.apple.mpegurl',
    447         '.man'    : 'application/x-troff-man',
    448         '.me'     : 'application/x-troff-me',
    449         '.mht'    : 'message/rfc822',
    450         '.mhtml'  : 'message/rfc822',
    451         '.mif'    : 'application/x-mif',
    452         '.mov'    : 'video/quicktime',
    453         '.movie'  : 'video/x-sgi-movie',
    454         '.mp2'    : 'audio/mpeg',
    455         '.mp3'    : 'audio/mpeg',
    456         '.mp4'    : 'video/mp4',
    457         '.mpa'    : 'video/mpeg',
    458         '.mpe'    : 'video/mpeg',
    459         '.mpeg'   : 'video/mpeg',
    460         '.mpg'    : 'video/mpeg',
    461         '.ms'     : 'application/x-troff-ms',
    462         '.nc'     : 'application/x-netcdf',
    463         '.nws'    : 'message/rfc822',
    464         '.o'      : 'application/octet-stream',
    465         '.obj'    : 'application/octet-stream',
    466         '.oda'    : 'application/oda',
    467         '.p12'    : 'application/x-pkcs12',
    468         '.p7c'    : 'application/pkcs7-mime',
    469         '.pbm'    : 'image/x-portable-bitmap',
    470         '.pdf'    : 'application/pdf',
    471         '.pfx'    : 'application/x-pkcs12',
    472         '.pgm'    : 'image/x-portable-graymap',
    473         '.pl'     : 'text/plain',
    474         '.png'    : 'image/png',
    475         '.pnm'    : 'image/x-portable-anymap',
    476         '.pot'    : 'application/vnd.ms-powerpoint',
    477         '.ppa'    : 'application/vnd.ms-powerpoint',
    478         '.ppm'    : 'image/x-portable-pixmap',
    479         '.pps'    : 'application/vnd.ms-powerpoint',
    480         '.ppt'    : 'application/vnd.ms-powerpoint',
    481         '.ps'     : 'application/postscript',
    482         '.pwz'    : 'application/vnd.ms-powerpoint',
    483         '.py'     : 'text/x-python',
    484         '.pyc'    : 'application/x-python-code',
    485         '.pyo'    : 'application/x-python-code',
    486         '.qt'     : 'video/quicktime',
    487         '.ra'     : 'audio/x-pn-realaudio',
    488         '.ram'    : 'application/x-pn-realaudio',
    489         '.ras'    : 'image/x-cmu-raster',
    490         '.rdf'    : 'application/xml',
    491         '.rgb'    : 'image/x-rgb',
    492         '.roff'   : 'application/x-troff',
    493         '.rtx'    : 'text/richtext',
    494         '.sgm'    : 'text/x-sgml',
    495         '.sgml'   : 'text/x-sgml',
    496         '.sh'     : 'application/x-sh',
    497         '.shar'   : 'application/x-shar',
    498         '.snd'    : 'audio/basic',
    499         '.so'     : 'application/octet-stream',
    500         '.src'    : 'application/x-wais-source',
    501         '.sv4cpio': 'application/x-sv4cpio',
    502         '.sv4crc' : 'application/x-sv4crc',
    503         '.svg'    : 'image/svg+xml',
    504         '.swf'    : 'application/x-shockwave-flash',
    505         '.t'      : 'application/x-troff',
    506         '.tar'    : 'application/x-tar',
    507         '.tcl'    : 'application/x-tcl',
    508         '.tex'    : 'application/x-tex',
    509         '.texi'   : 'application/x-texinfo',
    510         '.texinfo': 'application/x-texinfo',
    511         '.tif'    : 'image/tiff',
    512         '.tiff'   : 'image/tiff',
    513         '.tr'     : 'application/x-troff',
    514         '.tsv'    : 'text/tab-separated-values',
    515         '.txt'    : 'text/plain',
    516         '.ustar'  : 'application/x-ustar',
    517         '.vcf'    : 'text/x-vcard',
    518         '.wav'    : 'audio/x-wav',
    519         '.webm'   : 'video/webm',
    520         '.wiz'    : 'application/msword',
    521         '.wsdl'   : 'application/xml',
    522         '.xbm'    : 'image/x-xbitmap',
    523         '.xlb'    : 'application/vnd.ms-excel',
    524         # Duplicates :(
    525         '.xls'    : 'application/excel',
    526         '.xls'    : 'application/vnd.ms-excel',
    527         '.xml'    : 'text/xml',
    528         '.xpdl'   : 'application/xml',
    529         '.xpm'    : 'image/x-xpixmap',
    530         '.xsl'    : 'application/xml',
    531         '.xwd'    : 'image/x-xwindowdump',
    532         '.zip'    : 'application/zip',
    533         }
    534 
    535     # These are non-standard types, commonly found in the wild.  They will
    536     # only match if strict=0 flag is given to the API methods.
    537 
    538     # Please sort these too
    539     common_types = {
    540         '.jpg' : 'image/jpg',
    541         '.mid' : 'audio/midi',
    542         '.midi': 'audio/midi',
    543         '.pct' : 'image/pict',
    544         '.pic' : 'image/pict',
    545         '.pict': 'image/pict',
    546         '.rtf' : 'application/rtf',
    547         '.xul' : 'text/xul'
    548         }
    549 
    550 
    551 _default_mime_types()
    552 
    553 
    554 if __name__ == '__main__':
    555     import getopt
    556 
    557     USAGE = """\
    558 Usage: mimetypes.py [options] type
    559 
    560 Options:
    561     --help / -h       -- print this message and exit
    562     --lenient / -l    -- additionally search of some common, but non-standard
    563                          types.
    564     --extension / -e  -- guess extension instead of type
    565 
    566 More than one type argument may be given.
    567 """
    568 
    569     def usage(code, msg=''):
    570         print(USAGE)
    571         if msg: print(msg)
    572         sys.exit(code)
    573 
    574     try:
    575         opts, args = getopt.getopt(sys.argv[1:], 'hle',
    576                                    ['help', 'lenient', 'extension'])
    577     except getopt.error as msg:
    578         usage(1, msg)
    579 
    580     strict = 1
    581     extension = 0
    582     for opt, arg in opts:
    583         if opt in ('-h', '--help'):
    584             usage(0)
    585         elif opt in ('-l', '--lenient'):
    586             strict = 0
    587         elif opt in ('-e', '--extension'):
    588             extension = 1
    589     for gtype in args:
    590         if extension:
    591             guess = guess_extension(gtype, strict)
    592             if not guess: print("I don't know anything about type", gtype)
    593             else: print(guess)
    594         else:
    595             guess, encoding = guess_type(gtype, strict)
    596             if not guess: print("I don't know anything about type", gtype)
    597             else: print('type:', guess, 'encoding:', encoding)
    598