# Home | History | Annotate | Download | only in Lib  (source-browser navigation residue; not part of the module)
      1 """Guess the MIME type of a file.
      2 
      3 This module defines two useful functions:
      4 
      5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
      6 
      7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
      8 
      9 It also contains the following, for tuning the behavior:
     10 
     11 Data:
     12 
     13 knownfiles -- list of files to parse
     14 inited -- flag set when init() has been called
     15 suffix_map -- dictionary mapping suffixes to suffixes
     16 encodings_map -- dictionary mapping suffixes to encodings
     17 types_map -- dictionary mapping suffixes to types
     18 
     19 Functions:
     20 
     21 init([files]) -- parse a list of files, default knownfiles (on Windows, the
     22   default values are taken from the registry)
     23 read_mime_types(file) -- parse one file, return a dictionary or None
     24 """
     25 
     26 import os
     27 import sys
     28 import posixpath
     29 import urllib
     30 try:
     31     import _winreg
     32 except ImportError:
     33     _winreg = None
     34 
# Public names exported by ``from <this module> import *``.
__all__ = [
    "guess_type","guess_extension","guess_all_extensions",
    "add_type","read_mime_types","init"
]

# Candidate mime.types files.  init() walks this list in order and reads
# every entry that exists as a regular file; missing files are skipped.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to True by init(); MimeTypes.__init__() calls init() while it is false.
inited = False
# Module-wide MimeTypes instance used by the module-level helper functions;
# stays None until init() has finished building it.
_db = None
     54 
     55 
     56 class MimeTypes:
     57     """MIME-types datastore.
     58 
     59     This datastore can handle information from mime.types-style files
     60     and supports basic determination of MIME type from a filename or
     61     URL, and can guess a reasonable extension given a MIME type.
     62     """
     63 
     64     def __init__(self, filenames=(), strict=True):
     65         if not inited:
     66             init()
     67         self.encodings_map = encodings_map.copy()
     68         self.suffix_map = suffix_map.copy()
     69         self.types_map = ({}, {}) # dict for (non-strict, strict)
     70         self.types_map_inv = ({}, {})
     71         for (ext, type) in types_map.items():
     72             self.add_type(type, ext, True)
     73         for (ext, type) in common_types.items():
     74             self.add_type(type, ext, False)
     75         for name in filenames:
     76             self.read(name, strict)
     77 
     78     def add_type(self, type, ext, strict=True):
     79         """Add a mapping between a type and an extension.
     80 
     81         When the extension is already known, the new
     82         type will replace the old one. When the type
     83         is already known the extension will be added
     84         to the list of known extensions.
     85 
     86         If strict is true, information will be added to
     87         list of standard types, else to the list of non-standard
     88         types.
     89         """
     90         self.types_map[strict][ext] = type
     91         exts = self.types_map_inv[strict].setdefault(type, [])
     92         if ext not in exts:
     93             exts.append(ext)
     94 
     95     def guess_type(self, url, strict=True):
     96         """Guess the type of a file based on its URL.
     97 
     98         Return value is a tuple (type, encoding) where type is None if
     99         the type can't be guessed (no or unknown suffix) or a string
    100         of the form type/subtype, usable for a MIME Content-type
    101         header; and encoding is None for no encoding or the name of
    102         the program used to encode (e.g. compress or gzip).  The
    103         mappings are table driven.  Encoding suffixes are case
    104         sensitive; type suffixes are first tried case sensitive, then
    105         case insensitive.
    106 
    107         The suffixes .tgz, .taz and .tz (case sensitive!) are all
    108         mapped to '.tar.gz'.  (This is table-driven too, using the
    109         dictionary suffix_map.)
    110 
    111         Optional `strict' argument when False adds a bunch of commonly found,
    112         but non-standard types.
    113         """
    114         scheme, url = urllib.splittype(url)
    115         if scheme == 'data':
    116             # syntax of data URLs:
    117             # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    118             # mediatype := [ type "/" subtype ] *( ";" parameter )
    119             # data      := *urlchar
    120             # parameter := attribute "=" value
    121             # type/subtype defaults to "text/plain"
    122             comma = url.find(',')
    123             if comma < 0:
    124                 # bad data URL
    125                 return None, None
    126             semi = url.find(';', 0, comma)
    127             if semi >= 0:
    128                 type = url[:semi]
    129             else:
    130                 type = url[:comma]
    131             if '=' in type or '/' not in type:
    132                 type = 'text/plain'
    133             return type, None           # never compressed, so encoding is None
    134         base, ext = posixpath.splitext(url)
    135         while ext in self.suffix_map:
    136             base, ext = posixpath.splitext(base + self.suffix_map[ext])
    137         if ext in self.encodings_map:
    138             encoding = self.encodings_map[ext]
    139             base, ext = posixpath.splitext(base)
    140         else:
    141             encoding = None
    142         types_map = self.types_map[True]
    143         if ext in types_map:
    144             return types_map[ext], encoding
    145         elif ext.lower() in types_map:
    146             return types_map[ext.lower()], encoding
    147         elif strict:
    148             return None, encoding
    149         types_map = self.types_map[False]
    150         if ext in types_map:
    151             return types_map[ext], encoding
    152         elif ext.lower() in types_map:
    153             return types_map[ext.lower()], encoding
    154         else:
    155             return None, encoding
    156 
    157     def guess_all_extensions(self, type, strict=True):
    158         """Guess the extensions for a file based on its MIME type.
    159 
    160         Return value is a list of strings giving the possible filename
    161         extensions, including the leading dot ('.').  The extension is not
    162         guaranteed to have been associated with any particular data stream,
    163         but would be mapped to the MIME type `type' by guess_type().
    164 
    165         Optional `strict' argument when false adds a bunch of commonly found,
    166         but non-standard types.
    167         """
    168         type = type.lower()
    169         extensions = self.types_map_inv[True].get(type, [])
    170         if not strict:
    171             for ext in self.types_map_inv[False].get(type, []):
    172                 if ext not in extensions:
    173                     extensions.append(ext)
    174         return extensions
    175 
    176     def guess_extension(self, type, strict=True):
    177         """Guess the extension for a file based on its MIME type.
    178 
    179         Return value is a string giving a filename extension,
    180         including the leading dot ('.').  The extension is not
    181         guaranteed to have been associated with any particular data
    182         stream, but would be mapped to the MIME type `type' by
    183         guess_type().  If no extension can be guessed for `type', None
    184         is returned.
    185 
    186         Optional `strict' argument when false adds a bunch of commonly found,
    187         but non-standard types.
    188         """
    189         extensions = self.guess_all_extensions(type, strict)
    190         if not extensions:
    191             return None
    192         return extensions[0]
    193 
    194     def read(self, filename, strict=True):
    195         """
    196         Read a single mime.types-format file, specified by pathname.
    197 
    198         If strict is true, information will be added to
    199         list of standard types, else to the list of non-standard
    200         types.
    201         """
    202         with open(filename) as fp:
    203             self.readfp(fp, strict)
    204 
    205     def readfp(self, fp, strict=True):
    206         """
    207         Read a single mime.types-format file.
    208 
    209         If strict is true, information will be added to
    210         list of standard types, else to the list of non-standard
    211         types.
    212         """
    213         while 1:
    214             line = fp.readline()
    215             if not line:
    216                 break
    217             words = line.split()
    218             for i in range(len(words)):
    219                 if words[i][0] == '#':
    220                     del words[i:]
    221                     break
    222             if not words:
    223                 continue
    224             type, suffixes = words[0], words[1:]
    225             for suff in suffixes:
    226                 self.add_type(type, '.' + suff, strict)
    227 
    228     def read_windows_registry(self, strict=True):
    229         """
    230         Load the MIME types database from Windows registry.
    231 
    232         If strict is true, information will be added to
    233         list of standard types, else to the list of non-standard
    234         types.
    235         """
    236 
    237         # Windows only
    238         if not _winreg:
    239             return
    240 
    241         def enum_types(mimedb):
    242             i = 0
    243             while True:
    244                 try:
    245                     ctype = _winreg.EnumKey(mimedb, i)
    246                 except EnvironmentError:
    247                     break
    248                 else:
    249                     if '\0' not in ctype:
    250                         yield ctype
    251                 i += 1
    252 
    253         default_encoding = sys.getdefaultencoding()
    254         with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
    255             for subkeyname in enum_types(hkcr):
    256                 try:
    257                     with _winreg.OpenKey(hkcr, subkeyname) as subkey:
    258                         # Only check file extensions
    259                         if not subkeyname.startswith("."):
    260                             continue
    261                         # raises EnvironmentError if no 'Content Type' value
    262                         mimetype, datatype = _winreg.QueryValueEx(
    263                             subkey, 'Content Type')
    264                         if datatype != _winreg.REG_SZ:
    265                             continue
    266                         try:
    267                             mimetype = mimetype.encode(default_encoding)
    268                         except UnicodeEncodeError:
    269                             continue
    270                         self.add_type(mimetype, subkeyname, strict)
    271                 except EnvironmentError:
    272                     continue
    273 
    274 def guess_type(url, strict=True):
    275     """Guess the type of a file based on its URL.
    276 
    277     Return value is a tuple (type, encoding) where type is None if the
    278     type can't be guessed (no or unknown suffix) or a string of the
    279     form type/subtype, usable for a MIME Content-type header; and
    280     encoding is None for no encoding or the name of the program used
    281     to encode (e.g. compress or gzip).  The mappings are table
    282     driven.  Encoding suffixes are case sensitive; type suffixes are
    283     first tried case sensitive, then case insensitive.
    284 
    285     The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    286     to ".tar.gz".  (This is table-driven too, using the dictionary
    287     suffix_map).
    288 
    289     Optional `strict' argument when false adds a bunch of commonly found, but
    290     non-standard types.
    291     """
    292     if _db is None:
    293         init()
    294     return _db.guess_type(url, strict)
    295 
    296 
    297 def guess_all_extensions(type, strict=True):
    298     """Guess the extensions for a file based on its MIME type.
    299 
    300     Return value is a list of strings giving the possible filename
    301     extensions, including the leading dot ('.').  The extension is not
    302     guaranteed to have been associated with any particular data
    303     stream, but would be mapped to the MIME type `type' by
    304     guess_type().  If no extension can be guessed for `type', None
    305     is returned.
    306 
    307     Optional `strict' argument when false adds a bunch of commonly found,
    308     but non-standard types.
    309     """
    310     if _db is None:
    311         init()
    312     return _db.guess_all_extensions(type, strict)
    313 
    314 def guess_extension(type, strict=True):
    315     """Guess the extension for a file based on its MIME type.
    316 
    317     Return value is a string giving a filename extension, including the
    318     leading dot ('.').  The extension is not guaranteed to have been
    319     associated with any particular data stream, but would be mapped to the
    320     MIME type `type' by guess_type().  If no extension can be guessed for
    321     `type', None is returned.
    322 
    323     Optional `strict' argument when false adds a bunch of commonly found,
    324     but non-standard types.
    325     """
    326     if _db is None:
    327         init()
    328     return _db.guess_extension(type, strict)
    329 
    330 def add_type(type, ext, strict=True):
    331     """Add a mapping between a type and an extension.
    332 
    333     When the extension is already known, the new
    334     type will replace the old one. When the type
    335     is already known the extension will be added
    336     to the list of known extensions.
    337 
    338     If strict is true, information will be added to
    339     list of standard types, else to the list of non-standard
    340     types.
    341     """
    342     if _db is None:
    343         init()
    344     return _db.add_type(type, ext, strict)
    345 
    346 
    347 def init(files=None):
    348     global suffix_map, types_map, encodings_map, common_types
    349     global inited, _db
    350     inited = True    # so that MimeTypes.__init__() doesn't call us again
    351     db = MimeTypes()
    352     if files is None:
    353         if _winreg:
    354             db.read_windows_registry()
    355         files = knownfiles
    356     for file in files:
    357         if os.path.isfile(file):
    358             db.read(file)
    359     encodings_map = db.encodings_map
    360     suffix_map = db.suffix_map
    361     types_map = db.types_map[True]
    362     common_types = db.types_map[False]
    363     # Make the DB a global variable now that it is fully initialized
    364     _db = db
    365 
    366 
    367 def read_mime_types(file):
    368     try:
    369         f = open(file)
    370     except IOError:
    371         return None
    372     with f:
    373         db = MimeTypes()
    374         db.readfp(f, True)
    375         return db.types_map[True]
    376 
    377 
    378 def _default_mime_types():
    379     global suffix_map
    380     global encodings_map
    381     global types_map
    382     global common_types
    383 
    384     suffix_map = {
    385         '.svgz': '.svg.gz',
    386         '.tgz': '.tar.gz',
    387         '.taz': '.tar.gz',
    388         '.tz': '.tar.gz',
    389         '.tbz2': '.tar.bz2',
    390         '.txz': '.tar.xz',
    391         }
    392 
    393     encodings_map = {
    394         '.gz': 'gzip',
    395         '.Z': 'compress',
    396         '.bz2': 'bzip2',
    397         '.xz': 'xz',
    398         }
    399 
    400     # Before adding new types, make sure they are either registered with IANA,
    401     # at http://www.isi.edu/in-notes/iana/assignments/media-types
    402     # or extensions, i.e. using the x- prefix
    403 
    404     # If you add to these, please keep them sorted!
    405     types_map = {
    406         '.a'      : 'application/octet-stream',
    407         '.ai'     : 'application/postscript',
    408         '.aif'    : 'audio/x-aiff',
    409         '.aifc'   : 'audio/x-aiff',
    410         '.aiff'   : 'audio/x-aiff',
    411         '.au'     : 'audio/basic',
    412         '.avi'    : 'video/x-msvideo',
    413         '.bat'    : 'text/plain',
    414         '.bcpio'  : 'application/x-bcpio',
    415         '.bin'    : 'application/octet-stream',
    416         '.bmp'    : 'image/x-ms-bmp',
    417         '.c'      : 'text/plain',
    418         # Duplicates :(
    419         '.cdf'    : 'application/x-cdf',
    420         '.cdf'    : 'application/x-netcdf',
    421         '.cpio'   : 'application/x-cpio',
    422         '.csh'    : 'application/x-csh',
    423         '.css'    : 'text/css',
    424         '.csv'    : 'text/csv',
    425         '.dll'    : 'application/octet-stream',
    426         '.doc'    : 'application/msword',
    427         '.dot'    : 'application/msword',
    428         '.dvi'    : 'application/x-dvi',
    429         '.eml'    : 'message/rfc822',
    430         '.eps'    : 'application/postscript',
    431         '.etx'    : 'text/x-setext',
    432         '.exe'    : 'application/octet-stream',
    433         '.gif'    : 'image/gif',
    434         '.gtar'   : 'application/x-gtar',
    435         '.h'      : 'text/plain',
    436         '.hdf'    : 'application/x-hdf',
    437         '.htm'    : 'text/html',
    438         '.html'   : 'text/html',
    439         '.ico'    : 'image/vnd.microsoft.icon',
    440         '.ief'    : 'image/ief',
    441         '.jpe'    : 'image/jpeg',
    442         '.jpeg'   : 'image/jpeg',
    443         '.jpg'    : 'image/jpeg',
    444         '.js'     : 'application/javascript',
    445         '.ksh'    : 'text/plain',
    446         '.latex'  : 'application/x-latex',
    447         '.m1v'    : 'video/mpeg',
    448         '.man'    : 'application/x-troff-man',
    449         '.me'     : 'application/x-troff-me',
    450         '.mht'    : 'message/rfc822',
    451         '.mhtml'  : 'message/rfc822',
    452         '.mif'    : 'application/x-mif',
    453         '.mov'    : 'video/quicktime',
    454         '.movie'  : 'video/x-sgi-movie',
    455         '.mp2'    : 'audio/mpeg',
    456         '.mp3'    : 'audio/mpeg',
    457         '.mp4'    : 'video/mp4',
    458         '.mpa'    : 'video/mpeg',
    459         '.mpe'    : 'video/mpeg',
    460         '.mpeg'   : 'video/mpeg',
    461         '.mpg'    : 'video/mpeg',
    462         '.ms'     : 'application/x-troff-ms',
    463         '.nc'     : 'application/x-netcdf',
    464         '.nws'    : 'message/rfc822',
    465         '.o'      : 'application/octet-stream',
    466         '.obj'    : 'application/octet-stream',
    467         '.oda'    : 'application/oda',
    468         '.p12'    : 'application/x-pkcs12',
    469         '.p7c'    : 'application/pkcs7-mime',
    470         '.pbm'    : 'image/x-portable-bitmap',
    471         '.pdf'    : 'application/pdf',
    472         '.pfx'    : 'application/x-pkcs12',
    473         '.pgm'    : 'image/x-portable-graymap',
    474         '.pl'     : 'text/plain',
    475         '.png'    : 'image/png',
    476         '.pnm'    : 'image/x-portable-anymap',
    477         '.pot'    : 'application/vnd.ms-powerpoint',
    478         '.ppa'    : 'application/vnd.ms-powerpoint',
    479         '.ppm'    : 'image/x-portable-pixmap',
    480         '.pps'    : 'application/vnd.ms-powerpoint',
    481         '.ppt'    : 'application/vnd.ms-powerpoint',
    482         '.ps'     : 'application/postscript',
    483         '.pwz'    : 'application/vnd.ms-powerpoint',
    484         '.py'     : 'text/x-python',
    485         '.pyc'    : 'application/x-python-code',
    486         '.pyo'    : 'application/x-python-code',
    487         '.qt'     : 'video/quicktime',
    488         '.ra'     : 'audio/x-pn-realaudio',
    489         '.ram'    : 'application/x-pn-realaudio',
    490         '.ras'    : 'image/x-cmu-raster',
    491         '.rdf'    : 'application/xml',
    492         '.rgb'    : 'image/x-rgb',
    493         '.roff'   : 'application/x-troff',
    494         '.rtx'    : 'text/richtext',
    495         '.sgm'    : 'text/x-sgml',
    496         '.sgml'   : 'text/x-sgml',
    497         '.sh'     : 'application/x-sh',
    498         '.shar'   : 'application/x-shar',
    499         '.snd'    : 'audio/basic',
    500         '.so'     : 'application/octet-stream',
    501         '.src'    : 'application/x-wais-source',
    502         '.sv4cpio': 'application/x-sv4cpio',
    503         '.sv4crc' : 'application/x-sv4crc',
    504         '.svg'    : 'image/svg+xml',
    505         '.swf'    : 'application/x-shockwave-flash',
    506         '.t'      : 'application/x-troff',
    507         '.tar'    : 'application/x-tar',
    508         '.tcl'    : 'application/x-tcl',
    509         '.tex'    : 'application/x-tex',
    510         '.texi'   : 'application/x-texinfo',
    511         '.texinfo': 'application/x-texinfo',
    512         '.tif'    : 'image/tiff',
    513         '.tiff'   : 'image/tiff',
    514         '.tr'     : 'application/x-troff',
    515         '.tsv'    : 'text/tab-separated-values',
    516         '.txt'    : 'text/plain',
    517         '.ustar'  : 'application/x-ustar',
    518         '.vcf'    : 'text/x-vcard',
    519         '.wav'    : 'audio/x-wav',
    520         '.webm'   : 'video/webm',
    521         '.wiz'    : 'application/msword',
    522         '.wsdl'   : 'application/xml',
    523         '.xbm'    : 'image/x-xbitmap',
    524         '.xlb'    : 'application/vnd.ms-excel',
    525         # Duplicates :(
    526         '.xls'    : 'application/excel',
    527         '.xls'    : 'application/vnd.ms-excel',
    528         '.xml'    : 'text/xml',
    529         '.xpdl'   : 'application/xml',
    530         '.xpm'    : 'image/x-xpixmap',
    531         '.xsl'    : 'application/xml',
    532         '.xwd'    : 'image/x-xwindowdump',
    533         '.zip'    : 'application/zip',
    534         }
    535 
    536     # These are non-standard types, commonly found in the wild.  They will
    537     # only match if strict=0 flag is given to the API methods.
    538 
    539     # Please sort these too
    540     common_types = {
    541         '.jpg' : 'image/jpg',
    542         '.mid' : 'audio/midi',
    543         '.midi': 'audio/midi',
    544         '.pct' : 'image/pict',
    545         '.pic' : 'image/pict',
    546         '.pict': 'image/pict',
    547         '.rtf' : 'application/rtf',
    548         '.xul' : 'text/xul'
    549         }
    550 
    551 
    552 _default_mime_types()
    553 
    554 
    555 if __name__ == '__main__':
    556     import getopt
    557 
    558     USAGE = """\
    559 Usage: mimetypes.py [options] type
    560 
    561 Options:
    562     --help / -h       -- print this message and exit
    563     --lenient / -l    -- additionally search of some common, but non-standard
    564                          types.
    565     --extension / -e  -- guess extension instead of type
    566 
    567 More than one type argument may be given.
    568 """
    569 
    570     def usage(code, msg=''):
    571         print USAGE
    572         if msg: print msg
    573         sys.exit(code)
    574 
    575     try:
    576         opts, args = getopt.getopt(sys.argv[1:], 'hle',
    577                                    ['help', 'lenient', 'extension'])
    578     except getopt.error, msg:
    579         usage(1, msg)
    580 
    581     strict = 1
    582     extension = 0
    583     for opt, arg in opts:
    584         if opt in ('-h', '--help'):
    585             usage(0)
    586         elif opt in ('-l', '--lenient'):
    587             strict = 0
    588         elif opt in ('-e', '--extension'):
    589             extension = 1
    590     for gtype in args:
    591         if extension:
    592             guess = guess_extension(gtype, strict)
    593             if not guess: print "I don't know anything about type", gtype
    594             else: print guess
    595         else:
    596             guess, encoding = guess_type(gtype, strict)
    597             if not guess: print "I don't know anything about type", gtype
    598             else: print 'type:', guess, 'encoding:', encoding
    599