Home | History | Annotate | Download | only in python2.7
      1 """Guess the MIME type of a file.
      2 
      3 This module defines two useful functions:
      4 
      5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
      6 
      7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
      8 
      9 It also contains the following, for tuning the behavior:
     10 
     11 Data:
     12 
     13 knownfiles -- list of files to parse
     14 inited -- flag set when init() has been called
     15 suffix_map -- dictionary mapping suffixes to suffixes
     16 encodings_map -- dictionary mapping suffixes to encodings
     17 types_map -- dictionary mapping suffixes to types
     18 
     19 Functions:
     20 
     21 init([files]) -- parse a list of files, default knownfiles (on Windows, the
     22   default values are taken from the registry)
     23 read_mime_types(file) -- parse one file, return a dictionary or None
     24 """
     25 
     26 import os
     27 import sys
     28 import posixpath
     29 import urllib
     30 try:
     31     import _winreg
     32 except ImportError:
     33     _winreg = None
     34 
# Public names exported by "from mimetypes import *".
__all__ = [
    "guess_type","guess_extension","guess_all_extensions",
    "add_type","read_mime_types","init"
]

# Candidate mime.types files.  init() reads, in this order, every entry
# that exists as a regular file when it is called without an explicit
# file list.
# NOTE(review): "/usr/local/etc/httpd/conf/mime.types" is listed twice
# below, so that file is parsed twice when it exists.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to True by init(); MimeTypes.__init__() calls init() while it is False.
inited = False
# Shared MimeTypes instance used by the module-level helper functions;
# assigned by init() once the database is fully populated.
_db = None
     54 
     55 
     56 class MimeTypes:
     57     """MIME-types datastore.
     58 
     59     This datastore can handle information from mime.types-style files
     60     and supports basic determination of MIME type from a filename or
     61     URL, and can guess a reasonable extension given a MIME type.
     62     """
     63 
     64     def __init__(self, filenames=(), strict=True):
     65         if not inited:
     66             init()
     67         self.encodings_map = encodings_map.copy()
     68         self.suffix_map = suffix_map.copy()
     69         self.types_map = ({}, {}) # dict for (non-strict, strict)
     70         self.types_map_inv = ({}, {})
     71         for (ext, type) in types_map.items():
     72             self.add_type(type, ext, True)
     73         for (ext, type) in common_types.items():
     74             self.add_type(type, ext, False)
     75         for name in filenames:
     76             self.read(name, strict)
     77 
     78     def add_type(self, type, ext, strict=True):
     79         """Add a mapping between a type and an extension.
     80 
     81         When the extension is already known, the new
     82         type will replace the old one. When the type
     83         is already known the extension will be added
     84         to the list of known extensions.
     85 
     86         If strict is true, information will be added to
     87         list of standard types, else to the list of non-standard
     88         types.
     89         """
     90         self.types_map[strict][ext] = type
     91         exts = self.types_map_inv[strict].setdefault(type, [])
     92         if ext not in exts:
     93             exts.append(ext)
     94 
     95     def guess_type(self, url, strict=True):
     96         """Guess the type of a file based on its URL.
     97 
     98         Return value is a tuple (type, encoding) where type is None if
     99         the type can't be guessed (no or unknown suffix) or a string
    100         of the form type/subtype, usable for a MIME Content-type
    101         header; and encoding is None for no encoding or the name of
    102         the program used to encode (e.g. compress or gzip).  The
    103         mappings are table driven.  Encoding suffixes are case
    104         sensitive; type suffixes are first tried case sensitive, then
    105         case insensitive.
    106 
    107         The suffixes .tgz, .taz and .tz (case sensitive!) are all
    108         mapped to '.tar.gz'.  (This is table-driven too, using the
    109         dictionary suffix_map.)
    110 
    111         Optional `strict' argument when False adds a bunch of commonly found,
    112         but non-standard types.
    113         """
    114         scheme, url = urllib.splittype(url)
    115         if scheme == 'data':
    116             # syntax of data URLs:
    117             # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    118             # mediatype := [ type "/" subtype ] *( ";" parameter )
    119             # data      := *urlchar
    120             # parameter := attribute "=" value
    121             # type/subtype defaults to "text/plain"
    122             comma = url.find(',')
    123             if comma < 0:
    124                 # bad data URL
    125                 return None, None
    126             semi = url.find(';', 0, comma)
    127             if semi >= 0:
    128                 type = url[:semi]
    129             else:
    130                 type = url[:comma]
    131             if '=' in type or '/' not in type:
    132                 type = 'text/plain'
    133             return type, None           # never compressed, so encoding is None
    134         base, ext = posixpath.splitext(url)
    135         while ext in self.suffix_map:
    136             base, ext = posixpath.splitext(base + self.suffix_map[ext])
    137         if ext in self.encodings_map:
    138             encoding = self.encodings_map[ext]
    139             base, ext = posixpath.splitext(base)
    140         else:
    141             encoding = None
    142         types_map = self.types_map[True]
    143         if ext in types_map:
    144             return types_map[ext], encoding
    145         elif ext.lower() in types_map:
    146             return types_map[ext.lower()], encoding
    147         elif strict:
    148             return None, encoding
    149         types_map = self.types_map[False]
    150         if ext in types_map:
    151             return types_map[ext], encoding
    152         elif ext.lower() in types_map:
    153             return types_map[ext.lower()], encoding
    154         else:
    155             return None, encoding
    156 
    157     def guess_all_extensions(self, type, strict=True):
    158         """Guess the extensions for a file based on its MIME type.
    159 
    160         Return value is a list of strings giving the possible filename
    161         extensions, including the leading dot ('.').  The extension is not
    162         guaranteed to have been associated with any particular data stream,
    163         but would be mapped to the MIME type `type' by guess_type().
    164 
    165         Optional `strict' argument when false adds a bunch of commonly found,
    166         but non-standard types.
    167         """
    168         type = type.lower()
    169         extensions = self.types_map_inv[True].get(type, [])
    170         if not strict:
    171             for ext in self.types_map_inv[False].get(type, []):
    172                 if ext not in extensions:
    173                     extensions.append(ext)
    174         return extensions
    175 
    176     def guess_extension(self, type, strict=True):
    177         """Guess the extension for a file based on its MIME type.
    178 
    179         Return value is a string giving a filename extension,
    180         including the leading dot ('.').  The extension is not
    181         guaranteed to have been associated with any particular data
    182         stream, but would be mapped to the MIME type `type' by
    183         guess_type().  If no extension can be guessed for `type', None
    184         is returned.
    185 
    186         Optional `strict' argument when false adds a bunch of commonly found,
    187         but non-standard types.
    188         """
    189         extensions = self.guess_all_extensions(type, strict)
    190         if not extensions:
    191             return None
    192         return extensions[0]
    193 
    194     def read(self, filename, strict=True):
    195         """
    196         Read a single mime.types-format file, specified by pathname.
    197 
    198         If strict is true, information will be added to
    199         list of standard types, else to the list of non-standard
    200         types.
    201         """
    202         with open(filename) as fp:
    203             self.readfp(fp, strict)
    204 
    205     def readfp(self, fp, strict=True):
    206         """
    207         Read a single mime.types-format file.
    208 
    209         If strict is true, information will be added to
    210         list of standard types, else to the list of non-standard
    211         types.
    212         """
    213         while 1:
    214             line = fp.readline()
    215             if not line:
    216                 break
    217             words = line.split()
    218             for i in range(len(words)):
    219                 if words[i][0] == '#':
    220                     del words[i:]
    221                     break
    222             if not words:
    223                 continue
    224             type, suffixes = words[0], words[1:]
    225             for suff in suffixes:
    226                 self.add_type(type, '.' + suff, strict)
    227 
    228     def read_windows_registry(self, strict=True):
    229         """
    230         Load the MIME types database from Windows registry.
    231 
    232         If strict is true, information will be added to
    233         list of standard types, else to the list of non-standard
    234         types.
    235         """
    236 
    237         # Windows only
    238         if not _winreg:
    239             return
    240 
    241         def enum_types(mimedb):
    242             i = 0
    243             while True:
    244                 try:
    245                     ctype = _winreg.EnumKey(mimedb, i)
    246                 except EnvironmentError:
    247                     break
    248                 try:
    249                     ctype = ctype.encode(default_encoding) # omit in 3.x!
    250                 except UnicodeEncodeError:
    251                     pass
    252                 else:
    253                     yield ctype
    254                 i += 1
    255 
    256         default_encoding = sys.getdefaultencoding()
    257         with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT,
    258                              r'MIME\Database\Content Type') as mimedb:
    259             for ctype in enum_types(mimedb):
    260                 try:
    261                     with _winreg.OpenKey(mimedb, ctype) as key:
    262                         suffix, datatype = _winreg.QueryValueEx(key,
    263                                                                 'Extension')
    264                 except EnvironmentError:
    265                     continue
    266                 if datatype != _winreg.REG_SZ:
    267                     continue
    268                 try:
    269                     suffix = suffix.encode(default_encoding) # omit in 3.x!
    270                 except UnicodeEncodeError:
    271                     continue
    272                 self.add_type(ctype, suffix, strict)
    273 
    274 
    275 def guess_type(url, strict=True):
    276     """Guess the type of a file based on its URL.
    277 
    278     Return value is a tuple (type, encoding) where type is None if the
    279     type can't be guessed (no or unknown suffix) or a string of the
    280     form type/subtype, usable for a MIME Content-type header; and
    281     encoding is None for no encoding or the name of the program used
    282     to encode (e.g. compress or gzip).  The mappings are table
    283     driven.  Encoding suffixes are case sensitive; type suffixes are
    284     first tried case sensitive, then case insensitive.
    285 
    286     The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    287     to ".tar.gz".  (This is table-driven too, using the dictionary
    288     suffix_map).
    289 
    290     Optional `strict' argument when false adds a bunch of commonly found, but
    291     non-standard types.
    292     """
    293     if _db is None:
    294         init()
    295     return _db.guess_type(url, strict)
    296 
    297 
    298 def guess_all_extensions(type, strict=True):
    299     """Guess the extensions for a file based on its MIME type.
    300 
    301     Return value is a list of strings giving the possible filename
    302     extensions, including the leading dot ('.').  The extension is not
    303     guaranteed to have been associated with any particular data
    304     stream, but would be mapped to the MIME type `type' by
    305     guess_type().  If no extension can be guessed for `type', None
    306     is returned.
    307 
    308     Optional `strict' argument when false adds a bunch of commonly found,
    309     but non-standard types.
    310     """
    311     if _db is None:
    312         init()
    313     return _db.guess_all_extensions(type, strict)
    314 
    315 def guess_extension(type, strict=True):
    316     """Guess the extension for a file based on its MIME type.
    317 
    318     Return value is a string giving a filename extension, including the
    319     leading dot ('.').  The extension is not guaranteed to have been
    320     associated with any particular data stream, but would be mapped to the
    321     MIME type `type' by guess_type().  If no extension can be guessed for
    322     `type', None is returned.
    323 
    324     Optional `strict' argument when false adds a bunch of commonly found,
    325     but non-standard types.
    326     """
    327     if _db is None:
    328         init()
    329     return _db.guess_extension(type, strict)
    330 
    331 def add_type(type, ext, strict=True):
    332     """Add a mapping between a type and an extension.
    333 
    334     When the extension is already known, the new
    335     type will replace the old one. When the type
    336     is already known the extension will be added
    337     to the list of known extensions.
    338 
    339     If strict is true, information will be added to
    340     list of standard types, else to the list of non-standard
    341     types.
    342     """
    343     if _db is None:
    344         init()
    345     return _db.add_type(type, ext, strict)
    346 
    347 
    348 def init(files=None):
    349     global suffix_map, types_map, encodings_map, common_types
    350     global inited, _db
    351     inited = True    # so that MimeTypes.__init__() doesn't call us again
    352     db = MimeTypes()
    353     if files is None:
    354         if _winreg:
    355             db.read_windows_registry()
    356         files = knownfiles
    357     for file in files:
    358         if os.path.isfile(file):
    359             db.read(file)
    360     encodings_map = db.encodings_map
    361     suffix_map = db.suffix_map
    362     types_map = db.types_map[True]
    363     common_types = db.types_map[False]
    364     # Make the DB a global variable now that it is fully initialized
    365     _db = db
    366 
    367 
    368 def read_mime_types(file):
    369     try:
    370         f = open(file)
    371     except IOError:
    372         return None
    373     db = MimeTypes()
    374     db.readfp(f, True)
    375     return db.types_map[True]
    376 
    377 
    378 def _default_mime_types():
    379     global suffix_map
    380     global encodings_map
    381     global types_map
    382     global common_types
    383 
    384     suffix_map = {
    385         '.tgz': '.tar.gz',
    386         '.taz': '.tar.gz',
    387         '.tz': '.tar.gz',
    388         '.tbz2': '.tar.bz2',
    389         '.txz': '.tar.xz',
    390         }
    391 
    392     encodings_map = {
    393         '.gz': 'gzip',
    394         '.Z': 'compress',
    395         '.bz2': 'bzip2',
    396         '.xz': 'xz',
    397         }
    398 
    399     # Before adding new types, make sure they are either registered with IANA,
    400     # at http://www.isi.edu/in-notes/iana/assignments/media-types
    401     # or extensions, i.e. using the x- prefix
    402 
    403     # If you add to these, please keep them sorted!
    404     types_map = {
    405         '.a'      : 'application/octet-stream',
    406         '.ai'     : 'application/postscript',
    407         '.aif'    : 'audio/x-aiff',
    408         '.aifc'   : 'audio/x-aiff',
    409         '.aiff'   : 'audio/x-aiff',
    410         '.au'     : 'audio/basic',
    411         '.avi'    : 'video/x-msvideo',
    412         '.bat'    : 'text/plain',
    413         '.bcpio'  : 'application/x-bcpio',
    414         '.bin'    : 'application/octet-stream',
    415         '.bmp'    : 'image/x-ms-bmp',
    416         '.c'      : 'text/plain',
    417         # Duplicates :(
    418         '.cdf'    : 'application/x-cdf',
    419         '.cdf'    : 'application/x-netcdf',
    420         '.cpio'   : 'application/x-cpio',
    421         '.csh'    : 'application/x-csh',
    422         '.css'    : 'text/css',
    423         '.dll'    : 'application/octet-stream',
    424         '.doc'    : 'application/msword',
    425         '.dot'    : 'application/msword',
    426         '.dvi'    : 'application/x-dvi',
    427         '.eml'    : 'message/rfc822',
    428         '.eps'    : 'application/postscript',
    429         '.etx'    : 'text/x-setext',
    430         '.exe'    : 'application/octet-stream',
    431         '.gif'    : 'image/gif',
    432         '.gtar'   : 'application/x-gtar',
    433         '.h'      : 'text/plain',
    434         '.hdf'    : 'application/x-hdf',
    435         '.htm'    : 'text/html',
    436         '.html'   : 'text/html',
    437         '.ico'    : 'image/vnd.microsoft.icon',
    438         '.ief'    : 'image/ief',
    439         '.jpe'    : 'image/jpeg',
    440         '.jpeg'   : 'image/jpeg',
    441         '.jpg'    : 'image/jpeg',
    442         '.js'     : 'application/javascript',
    443         '.ksh'    : 'text/plain',
    444         '.latex'  : 'application/x-latex',
    445         '.m1v'    : 'video/mpeg',
    446         '.man'    : 'application/x-troff-man',
    447         '.me'     : 'application/x-troff-me',
    448         '.mht'    : 'message/rfc822',
    449         '.mhtml'  : 'message/rfc822',
    450         '.mif'    : 'application/x-mif',
    451         '.mov'    : 'video/quicktime',
    452         '.movie'  : 'video/x-sgi-movie',
    453         '.mp2'    : 'audio/mpeg',
    454         '.mp3'    : 'audio/mpeg',
    455         '.mp4'    : 'video/mp4',
    456         '.mpa'    : 'video/mpeg',
    457         '.mpe'    : 'video/mpeg',
    458         '.mpeg'   : 'video/mpeg',
    459         '.mpg'    : 'video/mpeg',
    460         '.ms'     : 'application/x-troff-ms',
    461         '.nc'     : 'application/x-netcdf',
    462         '.nws'    : 'message/rfc822',
    463         '.o'      : 'application/octet-stream',
    464         '.obj'    : 'application/octet-stream',
    465         '.oda'    : 'application/oda',
    466         '.p12'    : 'application/x-pkcs12',
    467         '.p7c'    : 'application/pkcs7-mime',
    468         '.pbm'    : 'image/x-portable-bitmap',
    469         '.pdf'    : 'application/pdf',
    470         '.pfx'    : 'application/x-pkcs12',
    471         '.pgm'    : 'image/x-portable-graymap',
    472         '.pl'     : 'text/plain',
    473         '.png'    : 'image/png',
    474         '.pnm'    : 'image/x-portable-anymap',
    475         '.pot'    : 'application/vnd.ms-powerpoint',
    476         '.ppa'    : 'application/vnd.ms-powerpoint',
    477         '.ppm'    : 'image/x-portable-pixmap',
    478         '.pps'    : 'application/vnd.ms-powerpoint',
    479         '.ppt'    : 'application/vnd.ms-powerpoint',
    480         '.ps'     : 'application/postscript',
    481         '.pwz'    : 'application/vnd.ms-powerpoint',
    482         '.py'     : 'text/x-python',
    483         '.pyc'    : 'application/x-python-code',
    484         '.pyo'    : 'application/x-python-code',
    485         '.qt'     : 'video/quicktime',
    486         '.ra'     : 'audio/x-pn-realaudio',
    487         '.ram'    : 'application/x-pn-realaudio',
    488         '.ras'    : 'image/x-cmu-raster',
    489         '.rdf'    : 'application/xml',
    490         '.rgb'    : 'image/x-rgb',
    491         '.roff'   : 'application/x-troff',
    492         '.rtx'    : 'text/richtext',
    493         '.sgm'    : 'text/x-sgml',
    494         '.sgml'   : 'text/x-sgml',
    495         '.sh'     : 'application/x-sh',
    496         '.shar'   : 'application/x-shar',
    497         '.snd'    : 'audio/basic',
    498         '.so'     : 'application/octet-stream',
    499         '.src'    : 'application/x-wais-source',
    500         '.sv4cpio': 'application/x-sv4cpio',
    501         '.sv4crc' : 'application/x-sv4crc',
    502         '.swf'    : 'application/x-shockwave-flash',
    503         '.t'      : 'application/x-troff',
    504         '.tar'    : 'application/x-tar',
    505         '.tcl'    : 'application/x-tcl',
    506         '.tex'    : 'application/x-tex',
    507         '.texi'   : 'application/x-texinfo',
    508         '.texinfo': 'application/x-texinfo',
    509         '.tif'    : 'image/tiff',
    510         '.tiff'   : 'image/tiff',
    511         '.tr'     : 'application/x-troff',
    512         '.tsv'    : 'text/tab-separated-values',
    513         '.txt'    : 'text/plain',
    514         '.ustar'  : 'application/x-ustar',
    515         '.vcf'    : 'text/x-vcard',
    516         '.wav'    : 'audio/x-wav',
    517         '.wiz'    : 'application/msword',
    518         '.wsdl'   : 'application/xml',
    519         '.xbm'    : 'image/x-xbitmap',
    520         '.xlb'    : 'application/vnd.ms-excel',
    521         # Duplicates :(
    522         '.xls'    : 'application/excel',
    523         '.xls'    : 'application/vnd.ms-excel',
    524         '.xml'    : 'text/xml',
    525         '.xpdl'   : 'application/xml',
    526         '.xpm'    : 'image/x-xpixmap',
    527         '.xsl'    : 'application/xml',
    528         '.xwd'    : 'image/x-xwindowdump',
    529         '.zip'    : 'application/zip',
    530         }
    531 
    532     # These are non-standard types, commonly found in the wild.  They will
    533     # only match if strict=0 flag is given to the API methods.
    534 
    535     # Please sort these too
    536     common_types = {
    537         '.jpg' : 'image/jpg',
    538         '.mid' : 'audio/midi',
    539         '.midi': 'audio/midi',
    540         '.pct' : 'image/pict',
    541         '.pic' : 'image/pict',
    542         '.pict': 'image/pict',
    543         '.rtf' : 'application/rtf',
    544         '.xul' : 'text/xul'
    545         }
    546 
    547 
    548 _default_mime_types()
    549 
    550 
    551 if __name__ == '__main__':
    552     import getopt
    553 
    554     USAGE = """\
    555 Usage: mimetypes.py [options] type
    556 
    557 Options:
    558     --help / -h       -- print this message and exit
    559     --lenient / -l    -- additionally search of some common, but non-standard
    560                          types.
    561     --extension / -e  -- guess extension instead of type
    562 
    563 More than one type argument may be given.
    564 """
    565 
    566     def usage(code, msg=''):
    567         print USAGE
    568         if msg: print msg
    569         sys.exit(code)
    570 
    571     try:
    572         opts, args = getopt.getopt(sys.argv[1:], 'hle',
    573                                    ['help', 'lenient', 'extension'])
    574     except getopt.error, msg:
    575         usage(1, msg)
    576 
    577     strict = 1
    578     extension = 0
    579     for opt, arg in opts:
    580         if opt in ('-h', '--help'):
    581             usage(0)
    582         elif opt in ('-l', '--lenient'):
    583             strict = 0
    584         elif opt in ('-e', '--extension'):
    585             extension = 1
    586     for gtype in args:
    587         if extension:
    588             guess = guess_extension(gtype, strict)
    589             if not guess: print "I don't know anything about type", gtype
    590             else: print guess
    591         else:
    592             guess, encoding = guess_type(gtype, strict)
    593             if not guess: print "I don't know anything about type", gtype
    594             else: print 'type:', guess, 'encoding:', encoding
    595