"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""
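# Illustrative usage sketch (comments only, not executed on import): the
# one-shot helpers and the file interface defined below can be used roughly
# like this; "example.bz2" is a hypothetical path.
#
#   import bz2
#   payload = bz2.compress(b"hello " * 100)            # one-shot compression
#   assert bz2.decompress(payload) == b"hello " * 100
#   with bz2.open("example.bz2", "wt", encoding="utf-8") as f:
#       f.write("hello\n")                             # transparent text-mode I/O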

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

from builtins import open as _builtin_open
import io
import os
import warnings
import _compression

try:
    from threading import RLock
except ImportError:
    from dummy_threading import RLock

from _bz2 import BZ2Compressor, BZ2Decompressor


_MODE_CLOSED   = 0
_MODE_READ     = 1
# Value 2 no longer used
_MODE_WRITE    = 3


class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """
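    # A minimal usage sketch (comments only): BZ2File accepts either a path or
    # an already-open binary file object; "data.bz2" is a hypothetical path.
    #
    #   with BZ2File("data.bz2", "rb") as f:             # named file on disk
    #       content = f.read()
    #   with _builtin_open("data.bz2", "rb") as raw_fp:  # wrap an existing file object
    #       content = BZ2File(raw_fp).read()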

    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.
        """
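        # Illustrative sketch (comments only): because 'r' mode accepts
        # concatenated streams, appending and then re-reading works like this;
        # "log.bz2" is a hypothetical path.
        #
        #   with BZ2File("log.bz2", "w") as f:
        #       f.write(b"first\n")
        #   with BZ2File("log.bz2", "a") as f:
        #       f.write(b"second\n")
        #   with BZ2File("log.bz2") as f:
        #       assert f.read() == b"first\nsecond\n"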
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)
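    # Illustrative sketch (comments only): BZ2File behaves like any buffered
    # binary file, so the usual line-oriented reads work on the decompressed
    # data; "notes.bz2" is a hypothetical path.
    #
    #   with BZ2File("notes.bz2") as f:
    #       first = f.readline()       # one decompressed line, newline retained
    #       rest = f.readlines()       # remaining lines as a list of bytes
    #   with BZ2File("notes.bz2") as f:
    #       for line in f:             # iteration also yields lines
    #           pass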

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)
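    # Illustrative sketch (comments only): because written data is buffered in
    # the compressor, use the file as a context manager (or call close()) so
    # the final block is flushed; "out.bz2" is a hypothetical path.
    #
    #   with BZ2File("out.bz2", "w", compresslevel=5) as f:
    #       f.write(b"some bytes")
    #       f.writelines([b"a\n", b"b\n"])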

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)
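    # Illustrative sketch (comments only): seeking decompresses data to reach
    # the target offset, so e.g. finding the uncompressed size this way can be
    # slow on large files; "big.bz2" is a hypothetical path.
    #
    #   with BZ2File("big.bz2") as f:
    #       size = f.seek(0, io.SEEK_END)   # uncompressed size, via emulated seek
    #       f.seek(0)                       # rewind to the start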

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            return self._pos


def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes, or
    PathLike object), or an existing file object to read from or write
    to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel). In this case,
    the encoding, errors and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    """
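    # Illustrative sketch (comments only): text mode simply layers a
    # TextIOWrapper over the binary BZ2File; "readme.txt.bz2" is a
    # hypothetical path.
    #
    #   with open("readme.txt.bz2", "rt", encoding="utf-8") as f:
    #       text = f.read()            # str, decoded from the decompressed bytes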
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file


def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()
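# Illustrative sketch (comments only): compress() is the one-shot counterpart
# of the BZ2Compressor class; higher levels trade speed for compression ratio.
#
#   blob = compress(b"x" * 10000)               # default level 9
#   faster_blob = compress(b"x" * 10000, 1)     # fastest, least compression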


def decompress(data):
    """Decompress a block of data.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    results = []
    while data:
        decomp = BZ2Decompressor()
        try:
            res = decomp.decompress(data)
        except OSError:
            if results:
                break  # Leftover data is not a valid bzip2 stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        data = decomp.unused_data
    return b"".join(results)
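# Illustrative sketch (comments only): decompress() transparently handles the
# concatenation of multiple bzip2 streams, as produced e.g. by appending.
#
#   payload = compress(b"spam") + compress(b"eggs")
#   assert decompress(payload) == b"spameggs"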