Home | History | Annotate | Download | only in python2.7
      1 """Functions that read and write gzipped files.
      2 
      3 The user of the file doesn't have to worry about the compression,
      4 but random access is not allowed."""
      5 
      6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
      7 
      8 import struct, sys, time, os
      9 import zlib
     10 import io
     11 import __builtin__
     12 
     13 __all__ = ["GzipFile","open"]
     14 
     15 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
     16 
     17 READ, WRITE = 1, 2
     18 
     19 def write32u(output, value):
     20     # The L format writes the bit pattern correctly whether signed
     21     # or unsigned.
     22     output.write(struct.pack("<L", value))
     23 
     24 def read32(input):
     25     return struct.unpack("<I", input.read(4))[0]
     26 
     27 def open(filename, mode="rb", compresslevel=9):
     28     """Shorthand for GzipFile(filename, mode, compresslevel).
     29 
     30     The filename argument is required; mode defaults to 'rb'
     31     and compresslevel defaults to 9.
     32 
     33     """
     34     return GzipFile(filename, mode, compresslevel)
     35 
     36 class GzipFile(io.BufferedIOBase):
     37     """The GzipFile class simulates most of the methods of a file object with
     38     the exception of the readinto() and truncate() methods.
     39 
     40     """
     41 
     42     myfileobj = None
     43     max_read_chunk = 10 * 1024 * 1024   # 10Mb
     44 
     45     def __init__(self, filename=None, mode=None,
     46                  compresslevel=9, fileobj=None, mtime=None):
     47         """Constructor for the GzipFile class.
     48 
     49         At least one of fileobj and filename must be given a
     50         non-trivial value.
     51 
     52         The new class instance is based on fileobj, which can be a regular
     53         file, a StringIO object, or any other object which simulates a file.
     54         It defaults to None, in which case filename is opened to provide
     55         a file object.
     56 
     57         When fileobj is not None, the filename argument is only used to be
     58         included in the gzip file header, which may includes the original
     59         filename of the uncompressed file.  It defaults to the filename of
     60         fileobj, if discernible; otherwise, it defaults to the empty string,
     61         and in this case the original filename is not included in the header.
     62 
     63         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
     64         depending on whether the file will be read or written.  The default
     65         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
     66         Be aware that only the 'rb', 'ab', and 'wb' values should be used
     67         for cross-platform portability.
     68 
     69         The compresslevel argument is an integer from 0 to 9 controlling the
     70         level of compression; 1 is fastest and produces the least compression,
     71         and 9 is slowest and produces the most compression. 0 is no compression
     72         at all. The default is 9.
     73 
     74         The mtime argument is an optional numeric timestamp to be written
     75         to the stream when compressing.  All gzip compressed streams
     76         are required to contain a timestamp.  If omitted or None, the
     77         current time is used.  This module ignores the timestamp when
     78         decompressing; however, some programs, such as gunzip, make use
     79         of it.  The format of the timestamp is the same as that of the
     80         return value of time.time() and of the st_mtime member of the
     81         object returned by os.stat().
     82 
     83         """
     84 
     85         # Make sure we don't inadvertently enable universal newlines on the
     86         # underlying file object - in read mode, this causes data corruption.
     87         if mode:
     88             mode = mode.replace('U', '')
     89         # guarantee the file is opened in binary mode on platforms
     90         # that care about that sort of thing
     91         if mode and 'b' not in mode:
     92             mode += 'b'
     93         if fileobj is None:
     94             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
     95         if filename is None:
     96             # Issue #13781: os.fdopen() creates a fileobj with a bogus name
     97             # attribute. Avoid saving this in the gzip header's filename field.
     98             if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
     99                 filename = fileobj.name
    100             else:
    101                 filename = ''
    102         if mode is None:
    103             if hasattr(fileobj, 'mode'): mode = fileobj.mode
    104             else: mode = 'rb'
    105 
    106         if mode[0:1] == 'r':
    107             self.mode = READ
    108             # Set flag indicating start of a new member
    109             self._new_member = True
    110             # Buffer data read from gzip file. extrastart is offset in
    111             # stream where buffer starts. extrasize is number of
    112             # bytes remaining in buffer from current stream position.
    113             self.extrabuf = ""
    114             self.extrasize = 0
    115             self.extrastart = 0
    116             self.name = filename
    117             # Starts small, scales exponentially
    118             self.min_readsize = 100
    119 
    120         elif mode[0:1] == 'w' or mode[0:1] == 'a':
    121             self.mode = WRITE
    122             self._init_write(filename)
    123             self.compress = zlib.compressobj(compresslevel,
    124                                              zlib.DEFLATED,
    125                                              -zlib.MAX_WBITS,
    126                                              zlib.DEF_MEM_LEVEL,
    127                                              0)
    128         else:
    129             raise IOError, "Mode " + mode + " not supported"
    130 
    131         self.fileobj = fileobj
    132         self.offset = 0
    133         self.mtime = mtime
    134 
    135         if self.mode == WRITE:
    136             self._write_gzip_header()
    137 
    138     @property
    139     def filename(self):
    140         import warnings
    141         warnings.warn("use the name attribute", DeprecationWarning, 2)
    142         if self.mode == WRITE and self.name[-3:] != ".gz":
    143             return self.name + ".gz"
    144         return self.name
    145 
    146     def __repr__(self):
    147         s = repr(self.fileobj)
    148         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
    149 
    150     def _check_closed(self):
    151         """Raises a ValueError if the underlying file object has been closed.
    152 
    153         """
    154         if self.closed:
    155             raise ValueError('I/O operation on closed file.')
    156 
    157     def _init_write(self, filename):
    158         self.name = filename
    159         self.crc = zlib.crc32("") & 0xffffffffL
    160         self.size = 0
    161         self.writebuf = []
    162         self.bufsize = 0
    163 
    164     def _write_gzip_header(self):
    165         self.fileobj.write('\037\213')             # magic header
    166         self.fileobj.write('\010')                 # compression method
    167         fname = os.path.basename(self.name)
    168         if fname.endswith(".gz"):
    169             fname = fname[:-3]
    170         flags = 0
    171         if fname:
    172             flags = FNAME
    173         self.fileobj.write(chr(flags))
    174         mtime = self.mtime
    175         if mtime is None:
    176             mtime = time.time()
    177         write32u(self.fileobj, long(mtime))
    178         self.fileobj.write('\002')
    179         self.fileobj.write('\377')
    180         if fname:
    181             self.fileobj.write(fname + '\000')
    182 
    183     def _init_read(self):
    184         self.crc = zlib.crc32("") & 0xffffffffL
    185         self.size = 0
    186 
    187     def _read_gzip_header(self):
    188         magic = self.fileobj.read(2)
    189         if magic != '\037\213':
    190             raise IOError, 'Not a gzipped file'
    191         method = ord( self.fileobj.read(1) )
    192         if method != 8:
    193             raise IOError, 'Unknown compression method'
    194         flag = ord( self.fileobj.read(1) )
    195         self.mtime = read32(self.fileobj)
    196         # extraflag = self.fileobj.read(1)
    197         # os = self.fileobj.read(1)
    198         self.fileobj.read(2)
    199 
    200         if flag & FEXTRA:
    201             # Read & discard the extra field, if present
    202             xlen = ord(self.fileobj.read(1))
    203             xlen = xlen + 256*ord(self.fileobj.read(1))
    204             self.fileobj.read(xlen)
    205         if flag & FNAME:
    206             # Read and discard a null-terminated string containing the filename
    207             while True:
    208                 s = self.fileobj.read(1)
    209                 if not s or s=='\000':
    210                     break
    211         if flag & FCOMMENT:
    212             # Read and discard a null-terminated string containing a comment
    213             while True:
    214                 s = self.fileobj.read(1)
    215                 if not s or s=='\000':
    216                     break
    217         if flag & FHCRC:
    218             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
    219 
    220     def write(self,data):
    221         self._check_closed()
    222         if self.mode != WRITE:
    223             import errno
    224             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
    225 
    226         if self.fileobj is None:
    227             raise ValueError, "write() on closed GzipFile object"
    228 
    229         # Convert data type if called by io.BufferedWriter.
    230         if isinstance(data, memoryview):
    231             data = data.tobytes()
    232 
    233         if len(data) > 0:
    234             self.size = self.size + len(data)
    235             self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
    236             self.fileobj.write( self.compress.compress(data) )
    237             self.offset += len(data)
    238 
    239         return len(data)
    240 
    241     def read(self, size=-1):
    242         self._check_closed()
    243         if self.mode != READ:
    244             import errno
    245             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
    246 
    247         if self.extrasize <= 0 and self.fileobj is None:
    248             return ''
    249 
    250         readsize = 1024
    251         if size < 0:        # get the whole thing
    252             try:
    253                 while True:
    254                     self._read(readsize)
    255                     readsize = min(self.max_read_chunk, readsize * 2)
    256             except EOFError:
    257                 size = self.extrasize
    258         else:               # just get some more of it
    259             try:
    260                 while size > self.extrasize:
    261                     self._read(readsize)
    262                     readsize = min(self.max_read_chunk, readsize * 2)
    263             except EOFError:
    264                 if size > self.extrasize:
    265                     size = self.extrasize
    266 
    267         offset = self.offset - self.extrastart
    268         chunk = self.extrabuf[offset: offset + size]
    269         self.extrasize = self.extrasize - size
    270 
    271         self.offset += size
    272         return chunk
    273 
    274     def _unread(self, buf):
    275         self.extrasize = len(buf) + self.extrasize
    276         self.offset -= len(buf)
    277 
    278     def _read(self, size=1024):
    279         if self.fileobj is None:
    280             raise EOFError, "Reached EOF"
    281 
    282         if self._new_member:
    283             # If the _new_member flag is set, we have to
    284             # jump to the next member, if there is one.
    285             #
    286             # First, check if we're at the end of the file;
    287             # if so, it's time to stop; no more members to read.
    288             pos = self.fileobj.tell()   # Save current position
    289             self.fileobj.seek(0, 2)     # Seek to end of file
    290             if pos == self.fileobj.tell():
    291                 raise EOFError, "Reached EOF"
    292             else:
    293                 self.fileobj.seek( pos ) # Return to original position
    294 
    295             self._init_read()
    296             self._read_gzip_header()
    297             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
    298             self._new_member = False
    299 
    300         # Read a chunk of data from the file
    301         buf = self.fileobj.read(size)
    302 
    303         # If the EOF has been reached, flush the decompression object
    304         # and mark this object as finished.
    305 
    306         if buf == "":
    307             uncompress = self.decompress.flush()
    308             self._read_eof()
    309             self._add_read_data( uncompress )
    310             raise EOFError, 'Reached EOF'
    311 
    312         uncompress = self.decompress.decompress(buf)
    313         self._add_read_data( uncompress )
    314 
    315         if self.decompress.unused_data != "":
    316             # Ending case: we've come to the end of a member in the file,
    317             # so seek back to the start of the unused data, finish up
    318             # this member, and read a new gzip header.
    319             # (The number of bytes to seek back is the length of the unused
    320             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
    321             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
    322 
    323             # Check the CRC and file size, and set the flag so we read
    324             # a new member on the next call
    325             self._read_eof()
    326             self._new_member = True
    327 
    328     def _add_read_data(self, data):
    329         self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
    330         offset = self.offset - self.extrastart
    331         self.extrabuf = self.extrabuf[offset:] + data
    332         self.extrasize = self.extrasize + len(data)
    333         self.extrastart = self.offset
    334         self.size = self.size + len(data)
    335 
    336     def _read_eof(self):
    337         # We've read to the end of the file, so we have to rewind in order
    338         # to reread the 8 bytes containing the CRC and the file size.
    339         # We check the that the computed CRC and size of the
    340         # uncompressed data matches the stored values.  Note that the size
    341         # stored is the true file size mod 2**32.
    342         self.fileobj.seek(-8, 1)
    343         crc32 = read32(self.fileobj)
    344         isize = read32(self.fileobj)  # may exceed 2GB
    345         if crc32 != self.crc:
    346             raise IOError("CRC check failed %s != %s" % (hex(crc32),
    347                                                          hex(self.crc)))
    348         elif isize != (self.size & 0xffffffffL):
    349             raise IOError, "Incorrect length of data produced"
    350 
    351         # Gzip files can be padded with zeroes and still have archives.
    352         # Consume all zero bytes and set the file position to the first
    353         # non-zero byte. See http://www.gzip.org/#faq8
    354         c = "\x00"
    355         while c == "\x00":
    356             c = self.fileobj.read(1)
    357         if c:
    358             self.fileobj.seek(-1, 1)
    359 
    360     @property
    361     def closed(self):
    362         return self.fileobj is None
    363 
    364     def close(self):
    365         if self.fileobj is None:
    366             return
    367         if self.mode == WRITE:
    368             self.fileobj.write(self.compress.flush())
    369             write32u(self.fileobj, self.crc)
    370             # self.size may exceed 2GB, or even 4GB
    371             write32u(self.fileobj, self.size & 0xffffffffL)
    372             self.fileobj = None
    373         elif self.mode == READ:
    374             self.fileobj = None
    375         if self.myfileobj:
    376             self.myfileobj.close()
    377             self.myfileobj = None
    378 
    379     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
    380         self._check_closed()
    381         if self.mode == WRITE:
    382             # Ensure the compressor's buffer is flushed
    383             self.fileobj.write(self.compress.flush(zlib_mode))
    384             self.fileobj.flush()
    385 
    386     def fileno(self):
    387         """Invoke the underlying file object's fileno() method.
    388 
    389         This will raise AttributeError if the underlying file object
    390         doesn't support fileno().
    391         """
    392         return self.fileobj.fileno()
    393 
    394     def rewind(self):
    395         '''Return the uncompressed stream file position indicator to the
    396         beginning of the file'''
    397         if self.mode != READ:
    398             raise IOError("Can't rewind in write mode")
    399         self.fileobj.seek(0)
    400         self._new_member = True
    401         self.extrabuf = ""
    402         self.extrasize = 0
    403         self.extrastart = 0
    404         self.offset = 0
    405 
    406     def readable(self):
    407         return self.mode == READ
    408 
    409     def writable(self):
    410         return self.mode == WRITE
    411 
    412     def seekable(self):
    413         return True
    414 
    415     def seek(self, offset, whence=0):
    416         if whence:
    417             if whence == 1:
    418                 offset = self.offset + offset
    419             else:
    420                 raise ValueError('Seek from end not supported')
    421         if self.mode == WRITE:
    422             if offset < self.offset:
    423                 raise IOError('Negative seek in write mode')
    424             count = offset - self.offset
    425             for i in xrange(count // 1024):
    426                 self.write(1024 * '\0')
    427             self.write((count % 1024) * '\0')
    428         elif self.mode == READ:
    429             if offset < self.offset:
    430                 # for negative seek, rewind and do positive seek
    431                 self.rewind()
    432             count = offset - self.offset
    433             for i in xrange(count // 1024):
    434                 self.read(1024)
    435             self.read(count % 1024)
    436 
    437         return self.offset
    438 
    439     def readline(self, size=-1):
    440         if size < 0:
    441             # Shortcut common case - newline found in buffer.
    442             offset = self.offset - self.extrastart
    443             i = self.extrabuf.find('\n', offset) + 1
    444             if i > 0:
    445                 self.extrasize -= i - offset
    446                 self.offset += i - offset
    447                 return self.extrabuf[offset: i]
    448 
    449             size = sys.maxint
    450             readsize = self.min_readsize
    451         else:
    452             readsize = size
    453         bufs = []
    454         while size != 0:
    455             c = self.read(readsize)
    456             i = c.find('\n')
    457 
    458             # We set i=size to break out of the loop under two
    459             # conditions: 1) there's no newline, and the chunk is
    460             # larger than size, or 2) there is a newline, but the
    461             # resulting line would be longer than 'size'.
    462             if (size <= i) or (i == -1 and len(c) > size):
    463                 i = size - 1
    464 
    465             if i >= 0 or c == '':
    466                 bufs.append(c[:i + 1])    # Add portion of last chunk
    467                 self._unread(c[i + 1:])   # Push back rest of chunk
    468                 break
    469 
    470             # Append chunk to list, decrease 'size',
    471             bufs.append(c)
    472             size = size - len(c)
    473             readsize = min(size, readsize * 2)
    474         if readsize > self.min_readsize:
    475             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
    476         return ''.join(bufs) # Return resulting line
    477 
    478 
    479 def _test():
    480     # Act like gzip; with -d, act like gunzip.
    481     # The input file is not deleted, however, nor are any other gzip
    482     # options or features supported.
    483     args = sys.argv[1:]
    484     decompress = args and args[0] == "-d"
    485     if decompress:
    486         args = args[1:]
    487     if not args:
    488         args = ["-"]
    489     for arg in args:
    490         if decompress:
    491             if arg == "-":
    492                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
    493                 g = sys.stdout
    494             else:
    495                 if arg[-3:] != ".gz":
    496                     print "filename doesn't end in .gz:", repr(arg)
    497                     continue
    498                 f = open(arg, "rb")
    499                 g = __builtin__.open(arg[:-3], "wb")
    500         else:
    501             if arg == "-":
    502                 f = sys.stdin
    503                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
    504             else:
    505                 f = __builtin__.open(arg, "rb")
    506                 g = open(arg + ".gz", "wb")
    507         while True:
    508             chunk = f.read(1024)
    509             if not chunk:
    510                 break
    511             g.write(chunk)
    512         if g is not sys.stdout:
    513             g.close()
    514         if f is not sys.stdin:
    515             f.close()
    516 
# Run the command-line driver only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    _test()
    519