Home | History | Annotate | Download | only in Lib
      1 """Functions that read and write gzipped files.
      2 
      3 The user of the file doesn't have to worry about the compression,
      4 but random access is not allowed."""
      5 
      6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
      7 
      8 import struct, sys, time, os
      9 import zlib
     10 import builtins
     11 import io
     12 import _compression
     13 
# Names exported by a star-import of this module.
__all__ = ["GzipFile", "open", "compress", "decompress"]

# Bit masks for the flag byte of a gzip member header.  The reader tests the
# header's flag byte against them; the writer only ever sets FNAME.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the object reads or writes.
READ, WRITE = 1, 2
     19 
     20 def open(filename, mode="rb", compresslevel=9,
     21          encoding=None, errors=None, newline=None):
     22     """Open a gzip-compressed file in binary or text mode.
     23 
     24     The filename argument can be an actual filename (a str or bytes object), or
     25     an existing file object to read from or write to.
     26 
     27     The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
     28     binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
     29     "rb", and the default compresslevel is 9.
     30 
     31     For binary mode, this function is equivalent to the GzipFile constructor:
     32     GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
     33     and newline arguments must not be provided.
     34 
     35     For text mode, a GzipFile object is created, and wrapped in an
     36     io.TextIOWrapper instance with the specified encoding, error handling
     37     behavior, and line ending(s).
     38 
     39     """
     40     if "t" in mode:
     41         if "b" in mode:
     42             raise ValueError("Invalid mode: %r" % (mode,))
     43     else:
     44         if encoding is not None:
     45             raise ValueError("Argument 'encoding' not supported in binary mode")
     46         if errors is not None:
     47             raise ValueError("Argument 'errors' not supported in binary mode")
     48         if newline is not None:
     49             raise ValueError("Argument 'newline' not supported in binary mode")
     50 
     51     gz_mode = mode.replace("t", "")
     52     if isinstance(filename, (str, bytes, os.PathLike)):
     53         binary_file = GzipFile(filename, gz_mode, compresslevel)
     54     elif hasattr(filename, "read") or hasattr(filename, "write"):
     55         binary_file = GzipFile(None, gz_mode, compresslevel, filename)
     56     else:
     57         raise TypeError("filename must be a str or bytes object, or a file")
     58 
     59     if "t" in mode:
     60         return io.TextIOWrapper(binary_file, encoding, errors, newline)
     61     else:
     62         return binary_file
     63 
     64 def write32u(output, value):
     65     # The L format writes the bit pattern correctly whether signed
     66     # or unsigned.
     67     output.write(struct.pack("<L", value))
     68 
     69 class _PaddedFile:
     70     """Minimal read-only file object that prepends a string to the contents
     71     of an actual file. Shouldn't be used outside of gzip.py, as it lacks
     72     essential functionality."""
     73 
     74     def __init__(self, f, prepend=b''):
     75         self._buffer = prepend
     76         self._length = len(prepend)
     77         self.file = f
     78         self._read = 0
     79 
     80     def read(self, size):
     81         if self._read is None:
     82             return self.file.read(size)
     83         if self._read + size <= self._length:
     84             read = self._read
     85             self._read += size
     86             return self._buffer[read:self._read]
     87         else:
     88             read = self._read
     89             self._read = None
     90             return self._buffer[read:] + \
     91                    self.file.read(size-self._length+read)
     92 
     93     def prepend(self, prepend=b''):
     94         if self._read is None:
     95             self._buffer = prepend
     96         else:  # Assume data was read since the last prepend() call
     97             self._read -= len(prepend)
     98             return
     99         self._length = len(self._buffer)
    100         self._read = 0
    101 
    102     def seek(self, off):
    103         self._read = None
    104         self._buffer = None
    105         return self.file.seek(off)
    106 
    107     def seekable(self):
    108         return True  # Allows fast-forwarding even in unseekable streams
    109 
    110 class GzipFile(_compression.BaseStream):
    111     """The GzipFile class simulates most of the methods of a file object with
    112     the exception of the truncate() method.
    113 
    114     This class only supports opening files in binary mode. If you need to open a
    115     compressed file in text mode, use the gzip.open() function.
    116 
    117     """
    118 
    119     # Overridden with internal file object to be closed, if only a filename
    120     # is passed in
    121     myfileobj = None
    122 
    123     def __init__(self, filename=None, mode=None,
    124                  compresslevel=9, fileobj=None, mtime=None):
    125         """Constructor for the GzipFile class.
    126 
    127         At least one of fileobj and filename must be given a
    128         non-trivial value.
    129 
    130         The new class instance is based on fileobj, which can be a regular
    131         file, an io.BytesIO object, or any other object which simulates a file.
    132         It defaults to None, in which case filename is opened to provide
    133         a file object.
    134 
    135         When fileobj is not None, the filename argument is only used to be
    136         included in the gzip file header, which may include the original
    137         filename of the uncompressed file.  It defaults to the filename of
    138         fileobj, if discernible; otherwise, it defaults to the empty string,
    139         and in this case the original filename is not included in the header.
    140 
    141         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
    142         'xb' depending on whether the file will be read or written.  The default
    143         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
    144         A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
    145         'wb', 'a' and 'ab', and 'x' and 'xb'.
    146 
    147         The compresslevel argument is an integer from 0 to 9 controlling the
    148         level of compression; 1 is fastest and produces the least compression,
    149         and 9 is slowest and produces the most compression. 0 is no compression
    150         at all. The default is 9.
    151 
    152         The mtime argument is an optional numeric timestamp to be written
    153         to the last modification time field in the stream when compressing.
    154         If omitted or None, the current time is used.
    155 
    156         """
    157 
    158         if mode and ('t' in mode or 'U' in mode):
    159             raise ValueError("Invalid mode: {!r}".format(mode))
    160         if mode and 'b' not in mode:
    161             mode += 'b'
    162         if fileobj is None:
    163             fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
    164         if filename is None:
    165             filename = getattr(fileobj, 'name', '')
    166             if not isinstance(filename, (str, bytes)):
    167                 filename = ''
    168         else:
    169             filename = os.fspath(filename)
    170         if mode is None:
    171             mode = getattr(fileobj, 'mode', 'rb')
    172 
    173         if mode.startswith('r'):
    174             self.mode = READ
    175             raw = _GzipReader(fileobj)
    176             self._buffer = io.BufferedReader(raw)
    177             self.name = filename
    178 
    179         elif mode.startswith(('w', 'a', 'x')):
    180             self.mode = WRITE
    181             self._init_write(filename)
    182             self.compress = zlib.compressobj(compresslevel,
    183                                              zlib.DEFLATED,
    184                                              -zlib.MAX_WBITS,
    185                                              zlib.DEF_MEM_LEVEL,
    186                                              0)
    187             self._write_mtime = mtime
    188         else:
    189             raise ValueError("Invalid mode: {!r}".format(mode))
    190 
    191         self.fileobj = fileobj
    192 
    193         if self.mode == WRITE:
    194             self._write_gzip_header()
    195 
    196     @property
    197     def filename(self):
    198         import warnings
    199         warnings.warn("use the name attribute", DeprecationWarning, 2)
    200         if self.mode == WRITE and self.name[-3:] != ".gz":
    201             return self.name + ".gz"
    202         return self.name
    203 
    204     @property
    205     def mtime(self):
    206         """Last modification time read from stream, or None"""
    207         return self._buffer.raw._last_mtime
    208 
    209     def __repr__(self):
    210         s = repr(self.fileobj)
    211         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
    212 
    213     def _init_write(self, filename):
    214         self.name = filename
    215         self.crc = zlib.crc32(b"")
    216         self.size = 0
    217         self.writebuf = []
    218         self.bufsize = 0
    219         self.offset = 0  # Current file offset for seek(), tell(), etc
    220 
    221     def _write_gzip_header(self):
    222         self.fileobj.write(b'\037\213')             # magic header
    223         self.fileobj.write(b'\010')                 # compression method
    224         try:
    225             # RFC 1952 requires the FNAME field to be Latin-1. Do not
    226             # include filenames that cannot be represented that way.
    227             fname = os.path.basename(self.name)
    228             if not isinstance(fname, bytes):
    229                 fname = fname.encode('latin-1')
    230             if fname.endswith(b'.gz'):
    231                 fname = fname[:-3]
    232         except UnicodeEncodeError:
    233             fname = b''
    234         flags = 0
    235         if fname:
    236             flags = FNAME
    237         self.fileobj.write(chr(flags).encode('latin-1'))
    238         mtime = self._write_mtime
    239         if mtime is None:
    240             mtime = time.time()
    241         write32u(self.fileobj, int(mtime))
    242         self.fileobj.write(b'\002')
    243         self.fileobj.write(b'\377')
    244         if fname:
    245             self.fileobj.write(fname + b'\000')
    246 
    247     def write(self,data):
    248         self._check_not_closed()
    249         if self.mode != WRITE:
    250             import errno
    251             raise OSError(errno.EBADF, "write() on read-only GzipFile object")
    252 
    253         if self.fileobj is None:
    254             raise ValueError("write() on closed GzipFile object")
    255 
    256         if isinstance(data, bytes):
    257             length = len(data)
    258         else:
    259             # accept any data that supports the buffer protocol
    260             data = memoryview(data)
    261             length = data.nbytes
    262 
    263         if length > 0:
    264             self.fileobj.write(self.compress.compress(data))
    265             self.size += length
    266             self.crc = zlib.crc32(data, self.crc)
    267             self.offset += length
    268 
    269         return length
    270 
    271     def read(self, size=-1):
    272         self._check_not_closed()
    273         if self.mode != READ:
    274             import errno
    275             raise OSError(errno.EBADF, "read() on write-only GzipFile object")
    276         return self._buffer.read(size)
    277 
    278     def read1(self, size=-1):
    279         """Implements BufferedIOBase.read1()
    280 
    281         Reads up to a buffer's worth of data is size is negative."""
    282         self._check_not_closed()
    283         if self.mode != READ:
    284             import errno
    285             raise OSError(errno.EBADF, "read1() on write-only GzipFile object")
    286 
    287         if size < 0:
    288             size = io.DEFAULT_BUFFER_SIZE
    289         return self._buffer.read1(size)
    290 
    291     def peek(self, n):
    292         self._check_not_closed()
    293         if self.mode != READ:
    294             import errno
    295             raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
    296         return self._buffer.peek(n)
    297 
    298     @property
    299     def closed(self):
    300         return self.fileobj is None
    301 
    302     def close(self):
    303         fileobj = self.fileobj
    304         if fileobj is None:
    305             return
    306         self.fileobj = None
    307         try:
    308             if self.mode == WRITE:
    309                 fileobj.write(self.compress.flush())
    310                 write32u(fileobj, self.crc)
    311                 # self.size may exceed 2GB, or even 4GB
    312                 write32u(fileobj, self.size & 0xffffffff)
    313             elif self.mode == READ:
    314                 self._buffer.close()
    315         finally:
    316             myfileobj = self.myfileobj
    317             if myfileobj:
    318                 self.myfileobj = None
    319                 myfileobj.close()
    320 
    321     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
    322         self._check_not_closed()
    323         if self.mode == WRITE:
    324             # Ensure the compressor's buffer is flushed
    325             self.fileobj.write(self.compress.flush(zlib_mode))
    326             self.fileobj.flush()
    327 
    328     def fileno(self):
    329         """Invoke the underlying file object's fileno() method.
    330 
    331         This will raise AttributeError if the underlying file object
    332         doesn't support fileno().
    333         """
    334         return self.fileobj.fileno()
    335 
    336     def rewind(self):
    337         '''Return the uncompressed stream file position indicator to the
    338         beginning of the file'''
    339         if self.mode != READ:
    340             raise OSError("Can't rewind in write mode")
    341         self._buffer.seek(0)
    342 
    343     def readable(self):
    344         return self.mode == READ
    345 
    346     def writable(self):
    347         return self.mode == WRITE
    348 
    349     def seekable(self):
    350         return True
    351 
    352     def seek(self, offset, whence=io.SEEK_SET):
    353         if self.mode == WRITE:
    354             if whence != io.SEEK_SET:
    355                 if whence == io.SEEK_CUR:
    356                     offset = self.offset + offset
    357                 else:
    358                     raise ValueError('Seek from end not supported')
    359             if offset < self.offset:
    360                 raise OSError('Negative seek in write mode')
    361             count = offset - self.offset
    362             chunk = b'\0' * 1024
    363             for i in range(count // 1024):
    364                 self.write(chunk)
    365             self.write(b'\0' * (count % 1024))
    366         elif self.mode == READ:
    367             self._check_not_closed()
    368             return self._buffer.seek(offset, whence)
    369 
    370         return self.offset
    371 
    372     def readline(self, size=-1):
    373         self._check_not_closed()
    374         return self._buffer.readline(size)
    375 
    376 
    377 class _GzipReader(_compression.DecompressReader):
    378     def __init__(self, fp):
    379         super().__init__(_PaddedFile(fp), zlib.decompressobj,
    380                          wbits=-zlib.MAX_WBITS)
    381         # Set flag indicating start of a new member
    382         self._new_member = True
    383         self._last_mtime = None
    384 
    385     def _init_read(self):
    386         self._crc = zlib.crc32(b"")
    387         self._stream_size = 0  # Decompressed size of unconcatenated stream
    388 
    389     def _read_exact(self, n):
    390         '''Read exactly *n* bytes from `self._fp`
    391 
    392         This method is required because self._fp may be unbuffered,
    393         i.e. return short reads.
    394         '''
    395 
    396         data = self._fp.read(n)
    397         while len(data) < n:
    398             b = self._fp.read(n - len(data))
    399             if not b:
    400                 raise EOFError("Compressed file ended before the "
    401                                "end-of-stream marker was reached")
    402             data += b
    403         return data
    404 
    405     def _read_gzip_header(self):
    406         magic = self._fp.read(2)
    407         if magic == b'':
    408             return False
    409 
    410         if magic != b'\037\213':
    411             raise OSError('Not a gzipped file (%r)' % magic)
    412 
    413         (method, flag,
    414          self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
    415         if method != 8:
    416             raise OSError('Unknown compression method')
    417 
    418         if flag & FEXTRA:
    419             # Read & discard the extra field, if present
    420             extra_len, = struct.unpack("<H", self._read_exact(2))
    421             self._read_exact(extra_len)
    422         if flag & FNAME:
    423             # Read and discard a null-terminated string containing the filename
    424             while True:
    425                 s = self._fp.read(1)
    426                 if not s or s==b'\000':
    427                     break
    428         if flag & FCOMMENT:
    429             # Read and discard a null-terminated string containing a comment
    430             while True:
    431                 s = self._fp.read(1)
    432                 if not s or s==b'\000':
    433                     break
    434         if flag & FHCRC:
    435             self._read_exact(2)     # Read & discard the 16-bit header CRC
    436         return True
    437 
    438     def read(self, size=-1):
    439         if size < 0:
    440             return self.readall()
    441         # size=0 is special because decompress(max_length=0) is not supported
    442         if not size:
    443             return b""
    444 
    445         # For certain input data, a single
    446         # call to decompress() may not return
    447         # any data. In this case, retry until we get some data or reach EOF.
    448         while True:
    449             if self._decompressor.eof:
    450                 # Ending case: we've come to the end of a member in the file,
    451                 # so finish up this member, and read a new gzip header.
    452                 # Check the CRC and file size, and set the flag so we read
    453                 # a new member
    454                 self._read_eof()
    455                 self._new_member = True
    456                 self._decompressor = self._decomp_factory(
    457                     **self._decomp_args)
    458 
    459             if self._new_member:
    460                 # If the _new_member flag is set, we have to
    461                 # jump to the next member, if there is one.
    462                 self._init_read()
    463                 if not self._read_gzip_header():
    464                     self._size = self._pos
    465                     return b""
    466                 self._new_member = False
    467 
    468             # Read a chunk of data from the file
    469             buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
    470 
    471             uncompress = self._decompressor.decompress(buf, size)
    472             if self._decompressor.unconsumed_tail != b"":
    473                 self._fp.prepend(self._decompressor.unconsumed_tail)
    474             elif self._decompressor.unused_data != b"":
    475                 # Prepend the already read bytes to the fileobj so they can
    476                 # be seen by _read_eof() and _read_gzip_header()
    477                 self._fp.prepend(self._decompressor.unused_data)
    478 
    479             if uncompress != b"":
    480                 break
    481             if buf == b"":
    482                 raise EOFError("Compressed file ended before the "
    483                                "end-of-stream marker was reached")
    484 
    485         self._add_read_data( uncompress )
    486         self._pos += len(uncompress)
    487         return uncompress
    488 
    489     def _add_read_data(self, data):
    490         self._crc = zlib.crc32(data, self._crc)
    491         self._stream_size = self._stream_size + len(data)
    492 
    493     def _read_eof(self):
    494         # We've read to the end of the file
    495         # We check the that the computed CRC and size of the
    496         # uncompressed data matches the stored values.  Note that the size
    497         # stored is the true file size mod 2**32.
    498         crc32, isize = struct.unpack("<II", self._read_exact(8))
    499         if crc32 != self._crc:
    500             raise OSError("CRC check failed %s != %s" % (hex(crc32),
    501                                                          hex(self._crc)))
    502         elif isize != (self._stream_size & 0xffffffff):
    503             raise OSError("Incorrect length of data produced")
    504 
    505         # Gzip files can be padded with zeroes and still have archives.
    506         # Consume all zero bytes and set the file position to the first
    507         # non-zero byte. See http://www.gzip.org/#faq8
    508         c = b"\x00"
    509         while c == b"\x00":
    510             c = self._fp.read(1)
    511         if c:
    512             self._fp.prepend(c)
    513 
    514     def _rewind(self):
    515         super()._rewind()
    516         self._new_member = True
    517 
    518 def compress(data, compresslevel=9):
    519     """Compress data in one shot and return the compressed string.
    520     Optional argument is the compression level, in range of 0-9.
    521     """
    522     buf = io.BytesIO()
    523     with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f:
    524         f.write(data)
    525     return buf.getvalue()
    526 
    527 def decompress(data):
    528     """Decompress a gzip compressed string in one shot.
    529     Return the decompressed string.
    530     """
    531     with GzipFile(fileobj=io.BytesIO(data)) as f:
    532         return f.read()
    533 
    534 
    535 def _test():
    536     # Act like gzip; with -d, act like gunzip.
    537     # The input file is not deleted, however, nor are any other gzip
    538     # options or features supported.
    539     args = sys.argv[1:]
    540     decompress = args and args[0] == "-d"
    541     if decompress:
    542         args = args[1:]
    543     if not args:
    544         args = ["-"]
    545     for arg in args:
    546         if decompress:
    547             if arg == "-":
    548                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
    549                 g = sys.stdout.buffer
    550             else:
    551                 if arg[-3:] != ".gz":
    552                     print("filename doesn't end in .gz:", repr(arg))
    553                     continue
    554                 f = open(arg, "rb")
    555                 g = builtins.open(arg[:-3], "wb")
    556         else:
    557             if arg == "-":
    558                 f = sys.stdin.buffer
    559                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
    560             else:
    561                 f = builtins.open(arg, "rb")
    562                 g = open(arg + ".gz", "wb")
    563         while True:
    564             chunk = f.read(1024)
    565             if not chunk:
    566                 break
    567             g.write(chunk)
    568         if g is not sys.stdout.buffer:
    569             g.close()
    570         if f is not sys.stdin.buffer:
    571             f.close()
    572 
# Run the command-line helper only when executed as a script, not on import.
if __name__ == '__main__':
    _test()
    575