"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but true random access is not supported: seeking is emulated by
re-reading the stream (read mode) or writing zero bytes (write mode)."""
      5 
# Based on Andrew Kuchling's minigzip.py, distributed with the zlib module.
      7 
      8 import struct, sys, time, os
      9 import zlib
     10 import io
     11 import __builtin__
     12 
     13 __all__ = ["GzipFile","open"]
     14 
     15 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
     16 
     17 READ, WRITE = 1, 2
     18 
     19 def write32u(output, value):
     20     # The L format writes the bit pattern correctly whether signed
     21     # or unsigned.
     22     output.write(struct.pack("<L", value))
     23 
     24 def read32(input):
     25     return struct.unpack("<I", input.read(4))[0]
     26 
     27 def open(filename, mode="rb", compresslevel=9):
     28     """Shorthand for GzipFile(filename, mode, compresslevel).
     29 
     30     The filename argument is required; mode defaults to 'rb'
     31     and compresslevel defaults to 9.
     32 
     33     """
     34     return GzipFile(filename, mode, compresslevel)
     35 
     36 class GzipFile(io.BufferedIOBase):
     37     """The GzipFile class simulates most of the methods of a file object with
     38     the exception of the readinto() and truncate() methods.
     39 
     40     """
     41 
     42     myfileobj = None
     43     max_read_chunk = 10 * 1024 * 1024   # 10Mb
     44 
     45     def __init__(self, filename=None, mode=None,
     46                  compresslevel=9, fileobj=None, mtime=None):
     47         """Constructor for the GzipFile class.
     48 
     49         At least one of fileobj and filename must be given a
     50         non-trivial value.
     51 
     52         The new class instance is based on fileobj, which can be a regular
     53         file, a StringIO object, or any other object which simulates a file.
     54         It defaults to None, in which case filename is opened to provide
     55         a file object.
     56 
     57         When fileobj is not None, the filename argument is only used to be
     58         included in the gzip file header, which may include the original
     59         filename of the uncompressed file.  It defaults to the filename of
     60         fileobj, if discernible; otherwise, it defaults to the empty string,
     61         and in this case the original filename is not included in the header.
     62 
     63         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
     64         depending on whether the file will be read or written.  The default
     65         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
     66         Be aware that only the 'rb', 'ab', and 'wb' values should be used
     67         for cross-platform portability.
     68 
     69         The compresslevel argument is an integer from 0 to 9 controlling the
     70         level of compression; 1 is fastest and produces the least compression,
     71         and 9 is slowest and produces the most compression. 0 is no compression
     72         at all. The default is 9.
     73 
     74         The mtime argument is an optional numeric timestamp to be written
     75         to the stream when compressing.  All gzip compressed streams
     76         are required to contain a timestamp.  If omitted or None, the
     77         current time is used.  This module ignores the timestamp when
     78         decompressing; however, some programs, such as gunzip, make use
     79         of it.  The format of the timestamp is the same as that of the
     80         return value of time.time() and of the st_mtime member of the
     81         object returned by os.stat().
     82 
     83         """
     84 
     85         # Make sure we don't inadvertently enable universal newlines on the
     86         # underlying file object - in read mode, this causes data corruption.
     87         if mode:
     88             mode = mode.replace('U', '')
     89         # guarantee the file is opened in binary mode on platforms
     90         # that care about that sort of thing
     91         if mode and 'b' not in mode:
     92             mode += 'b'
     93         if fileobj is None:
     94             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
     95         if filename is None:
     96             # Issue #13781: os.fdopen() creates a fileobj with a bogus name
     97             # attribute. Avoid saving this in the gzip header's filename field.
     98             if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
     99                 filename = fileobj.name
    100             else:
    101                 filename = ''
    102         if mode is None:
    103             if hasattr(fileobj, 'mode'): mode = fileobj.mode
    104             else: mode = 'rb'
    105 
    106         if mode[0:1] == 'r':
    107             self.mode = READ
    108             # Set flag indicating start of a new member
    109             self._new_member = True
    110             # Buffer data read from gzip file. extrastart is offset in
    111             # stream where buffer starts. extrasize is number of
    112             # bytes remaining in buffer from current stream position.
    113             self.extrabuf = ""
    114             self.extrasize = 0
    115             self.extrastart = 0
    116             self.name = filename
    117             # Starts small, scales exponentially
    118             self.min_readsize = 100
    119 
    120         elif mode[0:1] == 'w' or mode[0:1] == 'a':
    121             self.mode = WRITE
    122             self._init_write(filename)
    123             self.compress = zlib.compressobj(compresslevel,
    124                                              zlib.DEFLATED,
    125                                              -zlib.MAX_WBITS,
    126                                              zlib.DEF_MEM_LEVEL,
    127                                              0)
    128         else:
    129             raise IOError, "Mode " + mode + " not supported"
    130 
    131         self.fileobj = fileobj
    132         self.offset = 0
    133         self.mtime = mtime
    134 
    135         if self.mode == WRITE:
    136             self._write_gzip_header()
    137 
    138     @property
    139     def filename(self):
    140         import warnings
    141         warnings.warn("use the name attribute", DeprecationWarning, 2)
    142         if self.mode == WRITE and self.name[-3:] != ".gz":
    143             return self.name + ".gz"
    144         return self.name
    145 
    146     def __repr__(self):
    147         s = repr(self.fileobj)
    148         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
    149 
    150     def _check_closed(self):
    151         """Raises a ValueError if the underlying file object has been closed.
    152 
    153         """
    154         if self.closed:
    155             raise ValueError('I/O operation on closed file.')
    156 
    157     def _init_write(self, filename):
    158         self.name = filename
    159         self.crc = zlib.crc32("") & 0xffffffffL
    160         self.size = 0
    161         self.writebuf = []
    162         self.bufsize = 0
    163 
    164     def _write_gzip_header(self):
    165         self.fileobj.write('\037\213')             # magic header
    166         self.fileobj.write('\010')                 # compression method
    167         try:
    168             # RFC 1952 requires the FNAME field to be Latin-1. Do not
    169             # include filenames that cannot be represented that way.
    170             fname = os.path.basename(self.name)
    171             if not isinstance(fname, str):
    172                 fname = fname.encode('latin-1')
    173             if fname.endswith('.gz'):
    174                 fname = fname[:-3]
    175         except UnicodeEncodeError:
    176             fname = ''
    177         flags = 0
    178         if fname:
    179             flags = FNAME
    180         self.fileobj.write(chr(flags))
    181         mtime = self.mtime
    182         if mtime is None:
    183             mtime = time.time()
    184         write32u(self.fileobj, long(mtime))
    185         self.fileobj.write('\002')
    186         self.fileobj.write('\377')
    187         if fname:
    188             self.fileobj.write(fname + '\000')
    189 
    190     def _init_read(self):
    191         self.crc = zlib.crc32("") & 0xffffffffL
    192         self.size = 0
    193 
    194     def _read_gzip_header(self):
    195         magic = self.fileobj.read(2)
    196         if magic != '\037\213':
    197             raise IOError, 'Not a gzipped file'
    198         method = ord( self.fileobj.read(1) )
    199         if method != 8:
    200             raise IOError, 'Unknown compression method'
    201         flag = ord( self.fileobj.read(1) )
    202         self.mtime = read32(self.fileobj)
    203         # extraflag = self.fileobj.read(1)
    204         # os = self.fileobj.read(1)
    205         self.fileobj.read(2)
    206 
    207         if flag & FEXTRA:
    208             # Read & discard the extra field, if present
    209             xlen = ord(self.fileobj.read(1))
    210             xlen = xlen + 256*ord(self.fileobj.read(1))
    211             self.fileobj.read(xlen)
    212         if flag & FNAME:
    213             # Read and discard a null-terminated string containing the filename
    214             while True:
    215                 s = self.fileobj.read(1)
    216                 if not s or s=='\000':
    217                     break
    218         if flag & FCOMMENT:
    219             # Read and discard a null-terminated string containing a comment
    220             while True:
    221                 s = self.fileobj.read(1)
    222                 if not s or s=='\000':
    223                     break
    224         if flag & FHCRC:
    225             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
    226 
    227     def write(self,data):
    228         self._check_closed()
    229         if self.mode != WRITE:
    230             import errno
    231             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
    232 
    233         if self.fileobj is None:
    234             raise ValueError, "write() on closed GzipFile object"
    235 
    236         # Convert data type if called by io.BufferedWriter.
    237         if isinstance(data, memoryview):
    238             data = data.tobytes()
    239 
    240         if len(data) > 0:
    241             self.fileobj.write(self.compress.compress(data))
    242             self.size += len(data)
    243             self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
    244             self.offset += len(data)
    245 
    246         return len(data)
    247 
    248     def read(self, size=-1):
    249         self._check_closed()
    250         if self.mode != READ:
    251             import errno
    252             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
    253 
    254         if self.extrasize <= 0 and self.fileobj is None:
    255             return ''
    256 
    257         readsize = 1024
    258         if size < 0:        # get the whole thing
    259             try:
    260                 while True:
    261                     self._read(readsize)
    262                     readsize = min(self.max_read_chunk, readsize * 2)
    263             except EOFError:
    264                 size = self.extrasize
    265         else:               # just get some more of it
    266             try:
    267                 while size > self.extrasize:
    268                     self._read(readsize)
    269                     readsize = min(self.max_read_chunk, readsize * 2)
    270             except EOFError:
    271                 if size > self.extrasize:
    272                     size = self.extrasize
    273 
    274         offset = self.offset - self.extrastart
    275         chunk = self.extrabuf[offset: offset + size]
    276         self.extrasize = self.extrasize - size
    277 
    278         self.offset += size
    279         return chunk
    280 
    281     def _unread(self, buf):
    282         self.extrasize = len(buf) + self.extrasize
    283         self.offset -= len(buf)
    284 
    285     def _read(self, size=1024):
    286         if self.fileobj is None:
    287             raise EOFError, "Reached EOF"
    288 
    289         if self._new_member:
    290             # If the _new_member flag is set, we have to
    291             # jump to the next member, if there is one.
    292             #
    293             # First, check if we're at the end of the file;
    294             # if so, it's time to stop; no more members to read.
    295             pos = self.fileobj.tell()   # Save current position
    296             self.fileobj.seek(0, 2)     # Seek to end of file
    297             if pos == self.fileobj.tell():
    298                 raise EOFError, "Reached EOF"
    299             else:
    300                 self.fileobj.seek( pos ) # Return to original position
    301 
    302             self._init_read()
    303             self._read_gzip_header()
    304             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
    305             self._new_member = False
    306 
    307         # Read a chunk of data from the file
    308         buf = self.fileobj.read(size)
    309 
    310         # If the EOF has been reached, flush the decompression object
    311         # and mark this object as finished.
    312 
    313         if buf == "":
    314             uncompress = self.decompress.flush()
    315             self._read_eof()
    316             self._add_read_data( uncompress )
    317             raise EOFError, 'Reached EOF'
    318 
    319         uncompress = self.decompress.decompress(buf)
    320         self._add_read_data( uncompress )
    321 
    322         if self.decompress.unused_data != "":
    323             # Ending case: we've come to the end of a member in the file,
    324             # so seek back to the start of the unused data, finish up
    325             # this member, and read a new gzip header.
    326             # (The number of bytes to seek back is the length of the unused
    327             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
    328             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
    329 
    330             # Check the CRC and file size, and set the flag so we read
    331             # a new member on the next call
    332             self._read_eof()
    333             self._new_member = True
    334 
    335     def _add_read_data(self, data):
    336         self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
    337         offset = self.offset - self.extrastart
    338         self.extrabuf = self.extrabuf[offset:] + data
    339         self.extrasize = self.extrasize + len(data)
    340         self.extrastart = self.offset
    341         self.size = self.size + len(data)
    342 
    343     def _read_eof(self):
    344         # We've read to the end of the file, so we have to rewind in order
    345         # to reread the 8 bytes containing the CRC and the file size.
    346         # We check the that the computed CRC and size of the
    347         # uncompressed data matches the stored values.  Note that the size
    348         # stored is the true file size mod 2**32.
    349         self.fileobj.seek(-8, 1)
    350         crc32 = read32(self.fileobj)
    351         isize = read32(self.fileobj)  # may exceed 2GB
    352         if crc32 != self.crc:
    353             raise IOError("CRC check failed %s != %s" % (hex(crc32),
    354                                                          hex(self.crc)))
    355         elif isize != (self.size & 0xffffffffL):
    356             raise IOError, "Incorrect length of data produced"
    357 
    358         # Gzip files can be padded with zeroes and still have archives.
    359         # Consume all zero bytes and set the file position to the first
    360         # non-zero byte. See http://www.gzip.org/#faq8
    361         c = "\x00"
    362         while c == "\x00":
    363             c = self.fileobj.read(1)
    364         if c:
    365             self.fileobj.seek(-1, 1)
    366 
    367     @property
    368     def closed(self):
    369         return self.fileobj is None
    370 
    371     def close(self):
    372         fileobj = self.fileobj
    373         if fileobj is None:
    374             return
    375         self.fileobj = None
    376         try:
    377             if self.mode == WRITE:
    378                 fileobj.write(self.compress.flush())
    379                 write32u(fileobj, self.crc)
    380                 # self.size may exceed 2GB, or even 4GB
    381                 write32u(fileobj, self.size & 0xffffffffL)
    382         finally:
    383             myfileobj = self.myfileobj
    384             if myfileobj:
    385                 self.myfileobj = None
    386                 myfileobj.close()
    387 
    388     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
    389         self._check_closed()
    390         if self.mode == WRITE:
    391             # Ensure the compressor's buffer is flushed
    392             self.fileobj.write(self.compress.flush(zlib_mode))
    393             self.fileobj.flush()
    394 
    395     def fileno(self):
    396         """Invoke the underlying file object's fileno() method.
    397 
    398         This will raise AttributeError if the underlying file object
    399         doesn't support fileno().
    400         """
    401         return self.fileobj.fileno()
    402 
    403     def rewind(self):
    404         '''Return the uncompressed stream file position indicator to the
    405         beginning of the file'''
    406         if self.mode != READ:
    407             raise IOError("Can't rewind in write mode")
    408         self.fileobj.seek(0)
    409         self._new_member = True
    410         self.extrabuf = ""
    411         self.extrasize = 0
    412         self.extrastart = 0
    413         self.offset = 0
    414 
    415     def readable(self):
    416         return self.mode == READ
    417 
    418     def writable(self):
    419         return self.mode == WRITE
    420 
    421     def seekable(self):
    422         return True
    423 
    424     def seek(self, offset, whence=0):
    425         if whence:
    426             if whence == 1:
    427                 offset = self.offset + offset
    428             else:
    429                 raise ValueError('Seek from end not supported')
    430         if self.mode == WRITE:
    431             if offset < self.offset:
    432                 raise IOError('Negative seek in write mode')
    433             count = offset - self.offset
    434             for i in xrange(count // 1024):
    435                 self.write(1024 * '\0')
    436             self.write((count % 1024) * '\0')
    437         elif self.mode == READ:
    438             if offset < self.offset:
    439                 # for negative seek, rewind and do positive seek
    440                 self.rewind()
    441             count = offset - self.offset
    442             for i in xrange(count // 1024):
    443                 self.read(1024)
    444             self.read(count % 1024)
    445 
    446         return self.offset
    447 
    448     def readline(self, size=-1):
    449         if size < 0:
    450             # Shortcut common case - newline found in buffer.
    451             offset = self.offset - self.extrastart
    452             i = self.extrabuf.find('\n', offset) + 1
    453             if i > 0:
    454                 self.extrasize -= i - offset
    455                 self.offset += i - offset
    456                 return self.extrabuf[offset: i]
    457 
    458             size = sys.maxint
    459             readsize = self.min_readsize
    460         else:
    461             readsize = size
    462         bufs = []
    463         while size != 0:
    464             c = self.read(readsize)
    465             i = c.find('\n')
    466 
    467             # We set i=size to break out of the loop under two
    468             # conditions: 1) there's no newline, and the chunk is
    469             # larger than size, or 2) there is a newline, but the
    470             # resulting line would be longer than 'size'.
    471             if (size <= i) or (i == -1 and len(c) > size):
    472                 i = size - 1
    473 
    474             if i >= 0 or c == '':
    475                 bufs.append(c[:i + 1])    # Add portion of last chunk
    476                 self._unread(c[i + 1:])   # Push back rest of chunk
    477                 break
    478 
    479             # Append chunk to list, decrease 'size',
    480             bufs.append(c)
    481             size = size - len(c)
    482             readsize = min(size, readsize * 2)
    483         if readsize > self.min_readsize:
    484             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
    485         return ''.join(bufs) # Return resulting line
    486 
    487 
    488 def _test():
    489     # Act like gzip; with -d, act like gunzip.
    490     # The input file is not deleted, however, nor are any other gzip
    491     # options or features supported.
    492     args = sys.argv[1:]
    493     decompress = args and args[0] == "-d"
    494     if decompress:
    495         args = args[1:]
    496     if not args:
    497         args = ["-"]
    498     for arg in args:
    499         if decompress:
    500             if arg == "-":
    501                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
    502                 g = sys.stdout
    503             else:
    504                 if arg[-3:] != ".gz":
    505                     print "filename doesn't end in .gz:", repr(arg)
    506                     continue
    507                 f = open(arg, "rb")
    508                 g = __builtin__.open(arg[:-3], "wb")
    509         else:
    510             if arg == "-":
    511                 f = sys.stdin
    512                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
    513             else:
    514                 f = __builtin__.open(arg, "rb")
    515                 g = open(arg + ".gz", "wb")
    516         while True:
    517             chunk = f.read(1024)
    518             if not chunk:
    519                 break
    520             g.write(chunk)
    521         if g is not sys.stdout:
    522             g.close()
    523         if f is not sys.stdin:
    524             f.close()
    525 
    526 if __name__ == '__main__':
    527     _test()
    528