# (Source-browser navigation bar "Home | History | Annotate | Download | only in Lib" -- page residue, not part of the module.)
      1 """Functions that read and write gzipped files.
      2 
      3 The user of the file doesn't have to worry about the compression,
      4 but random access is not allowed."""
      5 
      6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module

      7 
      8 import struct, sys, time, os
      9 import zlib
     10 import io
     11 import __builtin__
     12 
     13 __all__ = ["GzipFile","open"]
     14 
     15 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
     16 
     17 READ, WRITE = 1, 2
     18 
     19 def write32u(output, value):
     20     # The L format writes the bit pattern correctly whether signed

     21     # or unsigned.

     22     output.write(struct.pack("<L", value))
     23 
     24 def read32(input):
     25     return struct.unpack("<I", input.read(4))[0]
     26 
     27 def open(filename, mode="rb", compresslevel=9):
     28     """Shorthand for GzipFile(filename, mode, compresslevel).
     29 
     30     The filename argument is required; mode defaults to 'rb'
     31     and compresslevel defaults to 9.
     32 
     33     """
     34     return GzipFile(filename, mode, compresslevel)
     35 
     36 class GzipFile(io.BufferedIOBase):
     37     """The GzipFile class simulates most of the methods of a file object with
     38     the exception of the readinto() and truncate() methods.
     39 
     40     """
     41 
     42     myfileobj = None
     43     max_read_chunk = 10 * 1024 * 1024   # 10Mb

     44 
     45     def __init__(self, filename=None, mode=None,
     46                  compresslevel=9, fileobj=None, mtime=None):
     47         """Constructor for the GzipFile class.
     48 
     49         At least one of fileobj and filename must be given a
     50         non-trivial value.
     51 
     52         The new class instance is based on fileobj, which can be a regular
     53         file, a StringIO object, or any other object which simulates a file.
     54         It defaults to None, in which case filename is opened to provide
     55         a file object.
     56 
     57         When fileobj is not None, the filename argument is only used to be
     58         included in the gzip file header, which may includes the original
     59         filename of the uncompressed file.  It defaults to the filename of
     60         fileobj, if discernible; otherwise, it defaults to the empty string,
     61         and in this case the original filename is not included in the header.
     62 
     63         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
     64         depending on whether the file will be read or written.  The default
     65         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
     66         Be aware that only the 'rb', 'ab', and 'wb' values should be used
     67         for cross-platform portability.
     68 
     69         The compresslevel argument is an integer from 1 to 9 controlling the
     70         level of compression; 1 is fastest and produces the least compression,
     71         and 9 is slowest and produces the most compression.  The default is 9.
     72 
     73         The mtime argument is an optional numeric timestamp to be written
     74         to the stream when compressing.  All gzip compressed streams
     75         are required to contain a timestamp.  If omitted or None, the
     76         current time is used.  This module ignores the timestamp when
     77         decompressing; however, some programs, such as gunzip, make use
     78         of it.  The format of the timestamp is the same as that of the
     79         return value of time.time() and of the st_mtime member of the
     80         object returned by os.stat().
     81 
     82         """
     83 
     84         # guarantee the file is opened in binary mode on platforms

     85         # that care about that sort of thing

     86         if mode and 'b' not in mode:
     87             mode += 'b'
     88         if fileobj is None:
     89             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
     90         if filename is None:
     91             if hasattr(fileobj, 'name'): filename = fileobj.name
     92             else: filename = ''
     93         if mode is None:
     94             if hasattr(fileobj, 'mode'): mode = fileobj.mode
     95             else: mode = 'rb'
     96 
     97         if mode[0:1] == 'r':
     98             self.mode = READ
     99             # Set flag indicating start of a new member

    100             self._new_member = True
    101             # Buffer data read from gzip file. extrastart is offset in

    102             # stream where buffer starts. extrasize is number of

    103             # bytes remaining in buffer from current stream position.

    104             self.extrabuf = ""
    105             self.extrasize = 0
    106             self.extrastart = 0
    107             self.name = filename
    108             # Starts small, scales exponentially

    109             self.min_readsize = 100
    110 
    111         elif mode[0:1] == 'w' or mode[0:1] == 'a':
    112             self.mode = WRITE
    113             self._init_write(filename)
    114             self.compress = zlib.compressobj(compresslevel,
    115                                              zlib.DEFLATED,
    116                                              -zlib.MAX_WBITS,
    117                                              zlib.DEF_MEM_LEVEL,
    118                                              0)
    119         else:
    120             raise IOError, "Mode " + mode + " not supported"
    121 
    122         self.fileobj = fileobj
    123         self.offset = 0
    124         self.mtime = mtime
    125 
    126         if self.mode == WRITE:
    127             self._write_gzip_header()
    128 
    129     @property
    130     def filename(self):
    131         import warnings
    132         warnings.warn("use the name attribute", DeprecationWarning, 2)
    133         if self.mode == WRITE and self.name[-3:] != ".gz":
    134             return self.name + ".gz"
    135         return self.name
    136 
    137     def __repr__(self):
    138         s = repr(self.fileobj)
    139         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
    140 
    141     def _check_closed(self):
    142         """Raises a ValueError if the underlying file object has been closed.
    143 
    144         """
    145         if self.closed:
    146             raise ValueError('I/O operation on closed file.')
    147 
    148     def _init_write(self, filename):
    149         self.name = filename
    150         self.crc = zlib.crc32("") & 0xffffffffL
    151         self.size = 0
    152         self.writebuf = []
    153         self.bufsize = 0
    154 
    155     def _write_gzip_header(self):
    156         self.fileobj.write('\037\213')             # magic header

    157         self.fileobj.write('\010')                 # compression method

    158         fname = os.path.basename(self.name)
    159         if fname.endswith(".gz"):
    160             fname = fname[:-3]
    161         flags = 0
    162         if fname:
    163             flags = FNAME
    164         self.fileobj.write(chr(flags))
    165         mtime = self.mtime
    166         if mtime is None:
    167             mtime = time.time()
    168         write32u(self.fileobj, long(mtime))
    169         self.fileobj.write('\002')
    170         self.fileobj.write('\377')
    171         if fname:
    172             self.fileobj.write(fname + '\000')
    173 
    174     def _init_read(self):
    175         self.crc = zlib.crc32("") & 0xffffffffL
    176         self.size = 0
    177 
    178     def _read_gzip_header(self):
    179         magic = self.fileobj.read(2)
    180         if magic != '\037\213':
    181             raise IOError, 'Not a gzipped file'
    182         method = ord( self.fileobj.read(1) )
    183         if method != 8:
    184             raise IOError, 'Unknown compression method'
    185         flag = ord( self.fileobj.read(1) )
    186         self.mtime = read32(self.fileobj)
    187         # extraflag = self.fileobj.read(1)

    188         # os = self.fileobj.read(1)

    189         self.fileobj.read(2)
    190 
    191         if flag & FEXTRA:
    192             # Read & discard the extra field, if present

    193             xlen = ord(self.fileobj.read(1))
    194             xlen = xlen + 256*ord(self.fileobj.read(1))
    195             self.fileobj.read(xlen)
    196         if flag & FNAME:
    197             # Read and discard a null-terminated string containing the filename

    198             while True:
    199                 s = self.fileobj.read(1)
    200                 if not s or s=='\000':
    201                     break
    202         if flag & FCOMMENT:
    203             # Read and discard a null-terminated string containing a comment

    204             while True:
    205                 s = self.fileobj.read(1)
    206                 if not s or s=='\000':
    207                     break
    208         if flag & FHCRC:
    209             self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    210 
    211     def write(self,data):
    212         self._check_closed()
    213         if self.mode != WRITE:
    214             import errno
    215             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
    216 
    217         if self.fileobj is None:
    218             raise ValueError, "write() on closed GzipFile object"
    219 
    220         # Convert data type if called by io.BufferedWriter.

    221         if isinstance(data, memoryview):
    222             data = data.tobytes()
    223 
    224         if len(data) > 0:
    225             self.size = self.size + len(data)
    226             self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
    227             self.fileobj.write( self.compress.compress(data) )
    228             self.offset += len(data)
    229 
    230         return len(data)
    231 
    232     def read(self, size=-1):
    233         self._check_closed()
    234         if self.mode != READ:
    235             import errno
    236             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
    237 
    238         if self.extrasize <= 0 and self.fileobj is None:
    239             return ''
    240 
    241         readsize = 1024
    242         if size < 0:        # get the whole thing

    243             try:
    244                 while True:
    245                     self._read(readsize)
    246                     readsize = min(self.max_read_chunk, readsize * 2)
    247             except EOFError:
    248                 size = self.extrasize
    249         else:               # just get some more of it

    250             try:
    251                 while size > self.extrasize:
    252                     self._read(readsize)
    253                     readsize = min(self.max_read_chunk, readsize * 2)
    254             except EOFError:
    255                 if size > self.extrasize:
    256                     size = self.extrasize
    257 
    258         offset = self.offset - self.extrastart
    259         chunk = self.extrabuf[offset: offset + size]
    260         self.extrasize = self.extrasize - size
    261 
    262         self.offset += size
    263         return chunk
    264 
    265     def _unread(self, buf):
    266         self.extrasize = len(buf) + self.extrasize
    267         self.offset -= len(buf)
    268 
    269     def _read(self, size=1024):
    270         if self.fileobj is None:
    271             raise EOFError, "Reached EOF"
    272 
    273         if self._new_member:
    274             # If the _new_member flag is set, we have to

    275             # jump to the next member, if there is one.

    276             #

    277             # First, check if we're at the end of the file;

    278             # if so, it's time to stop; no more members to read.

    279             pos = self.fileobj.tell()   # Save current position

    280             self.fileobj.seek(0, 2)     # Seek to end of file

    281             if pos == self.fileobj.tell():
    282                 raise EOFError, "Reached EOF"
    283             else:
    284                 self.fileobj.seek( pos ) # Return to original position

    285 
    286             self._init_read()
    287             self._read_gzip_header()
    288             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
    289             self._new_member = False
    290 
    291         # Read a chunk of data from the file

    292         buf = self.fileobj.read(size)
    293 
    294         # If the EOF has been reached, flush the decompression object

    295         # and mark this object as finished.

    296 
    297         if buf == "":
    298             uncompress = self.decompress.flush()
    299             self._read_eof()
    300             self._add_read_data( uncompress )
    301             raise EOFError, 'Reached EOF'
    302 
    303         uncompress = self.decompress.decompress(buf)
    304         self._add_read_data( uncompress )
    305 
    306         if self.decompress.unused_data != "":
    307             # Ending case: we've come to the end of a member in the file,

    308             # so seek back to the start of the unused data, finish up

    309             # this member, and read a new gzip header.

    310             # (The number of bytes to seek back is the length of the unused

    311             # data, minus 8 because _read_eof() will rewind a further 8 bytes)

    312             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
    313 
    314             # Check the CRC and file size, and set the flag so we read

    315             # a new member on the next call

    316             self._read_eof()
    317             self._new_member = True
    318 
    319     def _add_read_data(self, data):
    320         self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
    321         offset = self.offset - self.extrastart
    322         self.extrabuf = self.extrabuf[offset:] + data
    323         self.extrasize = self.extrasize + len(data)
    324         self.extrastart = self.offset
    325         self.size = self.size + len(data)
    326 
    327     def _read_eof(self):
    328         # We've read to the end of the file, so we have to rewind in order

    329         # to reread the 8 bytes containing the CRC and the file size.

    330         # We check the that the computed CRC and size of the

    331         # uncompressed data matches the stored values.  Note that the size

    332         # stored is the true file size mod 2**32.

    333         self.fileobj.seek(-8, 1)
    334         crc32 = read32(self.fileobj)
    335         isize = read32(self.fileobj)  # may exceed 2GB

    336         if crc32 != self.crc:
    337             raise IOError("CRC check failed %s != %s" % (hex(crc32),
    338                                                          hex(self.crc)))
    339         elif isize != (self.size & 0xffffffffL):
    340             raise IOError, "Incorrect length of data produced"
    341 
    342         # Gzip files can be padded with zeroes and still have archives.

    343         # Consume all zero bytes and set the file position to the first

    344         # non-zero byte. See http://www.gzip.org/#faq8

    345         c = "\x00"
    346         while c == "\x00":
    347             c = self.fileobj.read(1)
    348         if c:
    349             self.fileobj.seek(-1, 1)
    350 
    351     @property
    352     def closed(self):
    353         return self.fileobj is None
    354 
    355     def close(self):
    356         if self.fileobj is None:
    357             return
    358         if self.mode == WRITE:
    359             self.fileobj.write(self.compress.flush())
    360             write32u(self.fileobj, self.crc)
    361             # self.size may exceed 2GB, or even 4GB

    362             write32u(self.fileobj, self.size & 0xffffffffL)
    363             self.fileobj = None
    364         elif self.mode == READ:
    365             self.fileobj = None
    366         if self.myfileobj:
    367             self.myfileobj.close()
    368             self.myfileobj = None
    369 
    370     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
    371         self._check_closed()
    372         if self.mode == WRITE:
    373             # Ensure the compressor's buffer is flushed

    374             self.fileobj.write(self.compress.flush(zlib_mode))
    375             self.fileobj.flush()
    376 
    377     def fileno(self):
    378         """Invoke the underlying file object's fileno() method.
    379 
    380         This will raise AttributeError if the underlying file object
    381         doesn't support fileno().
    382         """
    383         return self.fileobj.fileno()
    384 
    385     def rewind(self):
    386         '''Return the uncompressed stream file position indicator to the
    387         beginning of the file'''
    388         if self.mode != READ:
    389             raise IOError("Can't rewind in write mode")
    390         self.fileobj.seek(0)
    391         self._new_member = True
    392         self.extrabuf = ""
    393         self.extrasize = 0
    394         self.extrastart = 0
    395         self.offset = 0
    396 
    397     def readable(self):
    398         return self.mode == READ
    399 
    400     def writable(self):
    401         return self.mode == WRITE
    402 
    403     def seekable(self):
    404         return True
    405 
    406     def seek(self, offset, whence=0):
    407         if whence:
    408             if whence == 1:
    409                 offset = self.offset + offset
    410             else:
    411                 raise ValueError('Seek from end not supported')
    412         if self.mode == WRITE:
    413             if offset < self.offset:
    414                 raise IOError('Negative seek in write mode')
    415             count = offset - self.offset
    416             for i in range(count // 1024):
    417                 self.write(1024 * '\0')
    418             self.write((count % 1024) * '\0')
    419         elif self.mode == READ:
    420             if offset < self.offset:
    421                 # for negative seek, rewind and do positive seek

    422                 self.rewind()
    423             count = offset - self.offset
    424             for i in range(count // 1024):
    425                 self.read(1024)
    426             self.read(count % 1024)
    427 
    428         return self.offset
    429 
    430     def readline(self, size=-1):
    431         if size < 0:
    432             # Shortcut common case - newline found in buffer.

    433             offset = self.offset - self.extrastart
    434             i = self.extrabuf.find('\n', offset) + 1
    435             if i > 0:
    436                 self.extrasize -= i - offset
    437                 self.offset += i - offset
    438                 return self.extrabuf[offset: i]
    439 
    440             size = sys.maxint
    441             readsize = self.min_readsize
    442         else:
    443             readsize = size
    444         bufs = []
    445         while size != 0:
    446             c = self.read(readsize)
    447             i = c.find('\n')
    448 
    449             # We set i=size to break out of the loop under two

    450             # conditions: 1) there's no newline, and the chunk is

    451             # larger than size, or 2) there is a newline, but the

    452             # resulting line would be longer than 'size'.

    453             if (size <= i) or (i == -1 and len(c) > size):
    454                 i = size - 1
    455 
    456             if i >= 0 or c == '':
    457                 bufs.append(c[:i + 1])    # Add portion of last chunk

    458                 self._unread(c[i + 1:])   # Push back rest of chunk

    459                 break
    460 
    461             # Append chunk to list, decrease 'size',

    462             bufs.append(c)
    463             size = size - len(c)
    464             readsize = min(size, readsize * 2)
    465         if readsize > self.min_readsize:
    466             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
    467         return ''.join(bufs) # Return resulting line

    468 
    469 
    470 def _test():
    471     # Act like gzip; with -d, act like gunzip.

    472     # The input file is not deleted, however, nor are any other gzip

    473     # options or features supported.

    474     args = sys.argv[1:]
    475     decompress = args and args[0] == "-d"
    476     if decompress:
    477         args = args[1:]
    478     if not args:
    479         args = ["-"]
    480     for arg in args:
    481         if decompress:
    482             if arg == "-":
    483                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
    484                 g = sys.stdout
    485             else:
    486                 if arg[-3:] != ".gz":
    487                     print "filename doesn't end in .gz:", repr(arg)
    488                     continue
    489                 f = open(arg, "rb")
    490                 g = __builtin__.open(arg[:-3], "wb")
    491         else:
    492             if arg == "-":
    493                 f = sys.stdin
    494                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
    495             else:
    496                 f = __builtin__.open(arg, "rb")
    497                 g = open(arg + ".gz", "wb")
    498         while True:
    499             chunk = f.read(1024)
    500             if not chunk:
    501                 break
    502             g.write(chunk)
    503         if g is not sys.stdout:
    504             g.close()
    505         if f is not sys.stdin:
    506             f.close()
    507 
    508 if __name__ == '__main__':
    509     _test()
    510