Home | History | Annotate | Download | only in python2.7
      1 #!/usr/bin/env python
      2 # -*- coding: iso-8859-1 -*-
      3 #-------------------------------------------------------------------
      4 # tarfile.py
      5 #-------------------------------------------------------------------
# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
      7 # All rights reserved.
      8 #
      9 # Permission  is  hereby granted,  free  of charge,  to  any person
     10 # obtaining a  copy of  this software  and associated documentation
     11 # files  (the  "Software"),  to   deal  in  the  Software   without
     12 # restriction,  including  without limitation  the  rights to  use,
     13 # copy, modify, merge, publish, distribute, sublicense, and/or sell
     14 # copies  of  the  Software,  and to  permit  persons  to  whom the
     15 # Software  is  furnished  to  do  so,  subject  to  the  following
     16 # conditions:
     17 #
     18 # The above copyright  notice and this  permission notice shall  be
     19 # included in all copies or substantial portions of the Software.
     20 #
     21 # THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
     22 # EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
     23 # OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
     24 # NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
     25 # HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
     26 # WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
     27 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     28 # OTHER DEALINGS IN THE SOFTWARE.
     29 #
     30 """Read from and write to tar format archives.
     31 """
     32 
     33 __version__ = "$Revision: 85213 $"
     34 # $Source$
     35 
     36 version     = "0.9.0"
     37 __author__  = "Lars Gustbel (lars (at] gustaebel.de)"
     38 __date__    = "$Date$"
     39 __cvsid__   = "$Id$"
     40 __credits__ = "Gustavo Niemeyer, Niels Gustbel, Richard Townsend."
     41 
     42 #---------
     43 # Imports
     44 #---------
     45 import sys
     46 import os
     47 import shutil
     48 import stat
     49 import errno
     50 import time
     51 import struct
     52 import copy
     53 import re
     54 import operator
     55 
     56 try:
     57     import grp, pwd
     58 except ImportError:
     59     grp = pwd = None
     60 
# Names made available by "from tarfile import *".
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
     63 
     64 #---------------------------------------------------------
     65 # tar constants
     66 #---------------------------------------------------------
     67 NUL = "\0"                      # the null character
     68 BLOCKSIZE = 512                 # length of processing blocks
     69 RECORDSIZE = BLOCKSIZE * 20     # length of records
     70 GNU_MAGIC = "ustar  \0"         # magic gnu tar string
     71 POSIX_MAGIC = "ustar\x0000"     # magic posix tar string
     72 
     73 LENGTH_NAME = 100               # maximum length of a filename
     74 LENGTH_LINK = 100               # maximum length of a linkname
     75 LENGTH_PREFIX = 155             # maximum length of the prefix field
     76 
     77 REGTYPE = "0"                   # regular file
     78 AREGTYPE = "\0"                 # regular file
     79 LNKTYPE = "1"                   # link (inside tarfile)
     80 SYMTYPE = "2"                   # symbolic link
     81 CHRTYPE = "3"                   # character special device
     82 BLKTYPE = "4"                   # block special device
     83 DIRTYPE = "5"                   # directory
     84 FIFOTYPE = "6"                  # fifo special device
     85 CONTTYPE = "7"                  # contiguous file
     86 
     87 GNUTYPE_LONGNAME = "L"          # GNU tar longname
     88 GNUTYPE_LONGLINK = "K"          # GNU tar longlink
     89 GNUTYPE_SPARSE = "S"            # GNU tar sparse file
     90 
     91 XHDTYPE = "x"                   # POSIX.1-2001 extended header
     92 XGLTYPE = "g"                   # POSIX.1-2001 global header
     93 SOLARIS_XHDTYPE = "X"           # Solaris extended header
     94 
     95 USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
     96 GNU_FORMAT = 1                  # GNU tar format
     97 PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
     98 DEFAULT_FORMAT = GNU_FORMAT
     99 
    100 #---------------------------------------------------------
    101 # tarfile constants
    102 #---------------------------------------------------------
    103 # File types that tarfile supports:
    104 SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
    105                    SYMTYPE, DIRTYPE, FIFOTYPE,
    106                    CONTTYPE, CHRTYPE, BLKTYPE,
    107                    GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
    108                    GNUTYPE_SPARSE)
    109 
    110 # File types that will be treated as a regular file.
    111 REGULAR_TYPES = (REGTYPE, AREGTYPE,
    112                  CONTTYPE, GNUTYPE_SPARSE)
    113 
    114 # File types that are part of the GNU tar format.
    115 GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
    116              GNUTYPE_SPARSE)
    117 
    118 # Fields from a pax header that override a TarInfo attribute.
    119 PAX_FIELDS = ("path", "linkpath", "size", "mtime",
    120               "uid", "gid", "uname", "gname")
    121 
    122 # Fields in a pax header that are numbers, all other fields
    123 # are treated as strings.
    124 PAX_NUMBER_FIELDS = {
    125     "atime": float,
    126     "ctime": float,
    127     "mtime": float,
    128     "uid": int,
    129     "gid": int,
    130     "size": int
    131 }
    132 
    133 #---------------------------------------------------------
    134 # Bits used in the mode field, values in octal.
    135 #---------------------------------------------------------
    136 S_IFLNK = 0120000        # symbolic link
    137 S_IFREG = 0100000        # regular file
    138 S_IFBLK = 0060000        # block device
    139 S_IFDIR = 0040000        # directory
    140 S_IFCHR = 0020000        # character device
    141 S_IFIFO = 0010000        # fifo
    142 
    143 TSUID   = 04000          # set UID on execution
    144 TSGID   = 02000          # set GID on execution
    145 TSVTX   = 01000          # reserved
    146 
    147 TUREAD  = 0400           # read by owner
    148 TUWRITE = 0200           # write by owner
    149 TUEXEC  = 0100           # execute/search by owner
    150 TGREAD  = 0040           # read by group
    151 TGWRITE = 0020           # write by group
    152 TGEXEC  = 0010           # execute/search by group
    153 TOREAD  = 0004           # read by other
    154 TOWRITE = 0002           # write by other
    155 TOEXEC  = 0001           # execute/search by other
    156 
    157 #---------------------------------------------------------
    158 # initialization
    159 #---------------------------------------------------------
    160 ENCODING = sys.getfilesystemencoding()
    161 if ENCODING is None:
    162     ENCODING = sys.getdefaultencoding()
    163 
    164 #---------------------------------------------------------
    165 # Some useful functions
    166 #---------------------------------------------------------
    167 
    168 def stn(s, length):
    169     """Convert a python string to a null-terminated string buffer.
    170     """
    171     return s[:length] + (length - len(s)) * NUL
    172 
    173 def nts(s):
    174     """Convert a null-terminated string field to a python string.
    175     """
    176     # Use the string up to the first null char.
    177     p = s.find("\0")
    178     if p == -1:
    179         return s
    180     return s[:p]
    181 
    182 def nti(s):
    183     """Convert a number field to a python number.
    184     """
    185     # There are two possible encodings for a number field, see
    186     # itn() below.
    187     if s[0] != chr(0200):
    188         try:
    189             n = int(nts(s) or "0", 8)
    190         except ValueError:
    191             raise InvalidHeaderError("invalid header")
    192     else:
    193         n = 0L
    194         for i in xrange(len(s) - 1):
    195             n <<= 8
    196             n += ord(s[i + 1])
    197     return n
    198 
    199 def itn(n, digits=8, format=DEFAULT_FORMAT):
    200     """Convert a python number to a number field.
    201     """
    202     # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    203     # octal digits followed by a null-byte, this allows values up to
    204     # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    205     # that if necessary. A leading 0200 byte indicates this particular
    206     # encoding, the following digits-1 bytes are a big-endian
    207     # representation. This allows values up to (256**(digits-1))-1.
    208     if 0 <= n < 8 ** (digits - 1):
    209         s = "%0*o" % (digits - 1, n) + NUL
    210     else:
    211         if format != GNU_FORMAT or n >= 256 ** (digits - 1):
    212             raise ValueError("overflow in number field")
    213 
    214         if n < 0:
    215             # XXX We mimic GNU tar's behaviour with negative numbers,
    216             # this could raise OverflowError.
    217             n = struct.unpack("L", struct.pack("l", n))[0]
    218 
    219         s = ""
    220         for i in xrange(digits - 1):
    221             s = chr(n & 0377) + s
    222             n >>= 8
    223         s = chr(0200) + s
    224     return s
    225 
    226 def uts(s, encoding, errors):
    227     """Convert a unicode object to a string.
    228     """
    229     if errors == "utf-8":
    230         # An extra error handler similar to the -o invalid=UTF-8 option
    231         # in POSIX.1-2001. Replace untranslatable characters with their
    232         # UTF-8 representation.
    233         try:
    234             return s.encode(encoding, "strict")
    235         except UnicodeEncodeError:
    236             x = []
    237             for c in s:
    238                 try:
    239                     x.append(c.encode(encoding, "strict"))
    240                 except UnicodeEncodeError:
    241                     x.append(c.encode("utf8"))
    242             return "".join(x)
    243     else:
    244         return s.encode(encoding, errors)
    245 
    246 def calc_chksums(buf):
    247     """Calculate the checksum for a member's header by summing up all
    248        characters except for the chksum field which is treated as if
    249        it was filled with spaces. According to the GNU tar sources,
    250        some tars (Sun and NeXT) calculate chksum with signed char,
    251        which will be different if there are chars in the buffer with
    252        the high bit set. So we calculate two checksums, unsigned and
    253        signed.
    254     """
    255     unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512]))
    256     signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512]))
    257     return unsigned_chksum, signed_chksum
    258 
    259 def copyfileobj(src, dst, length=None):
    260     """Copy length bytes from fileobj src to fileobj dst.
    261        If length is None, copy the entire content.
    262     """
    263     if length == 0:
    264         return
    265     if length is None:
    266         shutil.copyfileobj(src, dst)
    267         return
    268 
    269     BUFSIZE = 16 * 1024
    270     blocks, remainder = divmod(length, BUFSIZE)
    271     for b in xrange(blocks):
    272         buf = src.read(BUFSIZE)
    273         if len(buf) < BUFSIZE:
    274             raise IOError("end of file reached")
    275         dst.write(buf)
    276 
    277     if remainder != 0:
    278         buf = src.read(remainder)
    279         if len(buf) < remainder:
    280             raise IOError("end of file reached")
    281         dst.write(buf)
    282     return
    283 
    284 filemode_table = (
    285     ((S_IFLNK,      "l"),
    286      (S_IFREG,      "-"),
    287      (S_IFBLK,      "b"),
    288      (S_IFDIR,      "d"),
    289      (S_IFCHR,      "c"),
    290      (S_IFIFO,      "p")),
    291 
    292     ((TUREAD,       "r"),),
    293     ((TUWRITE,      "w"),),
    294     ((TUEXEC|TSUID, "s"),
    295      (TSUID,        "S"),
    296      (TUEXEC,       "x")),
    297 
    298     ((TGREAD,       "r"),),
    299     ((TGWRITE,      "w"),),
    300     ((TGEXEC|TSGID, "s"),
    301      (TSGID,        "S"),
    302      (TGEXEC,       "x")),
    303 
    304     ((TOREAD,       "r"),),
    305     ((TOWRITE,      "w"),),
    306     ((TOEXEC|TSVTX, "t"),
    307      (TSVTX,        "T"),
    308      (TOEXEC,       "x"))
    309 )
    310 
    311 def filemode(mode):
    312     """Convert a file's mode to a string of the form
    313        -rwxrwxrwx.
    314        Used by TarFile.list()
    315     """
    316     perm = []
    317     for table in filemode_table:
    318         for bit, char in table:
    319             if mode & bit == bit:
    320                 perm.append(char)
    321                 break
    322         else:
    323             perm.append("-")
    324     return "".join(perm)
    325 
    326 class TarError(Exception):
    327     """Base exception."""
    328     pass
    329 class ExtractError(TarError):
    330     """General exception for extract errors."""
    331     pass
    332 class ReadError(TarError):
    333     """Exception for unreadble tar archives."""
    334     pass
    335 class CompressionError(TarError):
    336     """Exception for unavailable compression methods."""
    337     pass
    338 class StreamError(TarError):
    339     """Exception for unsupported operations on stream-like TarFiles."""
    340     pass
    341 class HeaderError(TarError):
    342     """Base exception for header errors."""
    343     pass
    344 class EmptyHeaderError(HeaderError):
    345     """Exception for empty headers."""
    346     pass
    347 class TruncatedHeaderError(HeaderError):
    348     """Exception for truncated headers."""
    349     pass
    350 class EOFHeaderError(HeaderError):
    351     """Exception for end of file headers."""
    352     pass
    353 class InvalidHeaderError(HeaderError):
    354     """Exception for invalid headers."""
    355     pass
    356 class SubsequentHeaderError(HeaderError):
    357     """Exception for missing and invalid extended headers."""
    358     pass
    359 
    360 #---------------------------
    361 # internal stream interface
    362 #---------------------------
    363 class _LowLevelFile:
    364     """Low-level file object. Supports reading and writing.
    365        It is used instead of a regular file object for streaming
    366        access.
    367     """
    368 
    369     def __init__(self, name, mode):
    370         mode = {
    371             "r": os.O_RDONLY,
    372             "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
    373         }[mode]
    374         if hasattr(os, "O_BINARY"):
    375             mode |= os.O_BINARY
    376         self.fd = os.open(name, mode, 0666)
    377 
    378     def close(self):
    379         os.close(self.fd)
    380 
    381     def read(self, size):
    382         return os.read(self.fd, size)
    383 
    384     def write(self, s):
    385         os.write(self.fd, s)
    386 
    387 class _Stream:
    388     """Class that serves as an adapter between TarFile and
    389        a stream-like object.  The stream-like object only
    390        needs to have a read() or write() method and is accessed
    391        blockwise.  Use of gzip or bzip2 compression is possible.
    392        A stream-like object could be for example: sys.stdin,
    393        sys.stdout, a socket, a tape device etc.
    394 
    395        _Stream is intended to be used only internally.
    396     """
    397 
    398     def __init__(self, name, mode, comptype, fileobj, bufsize):
    399         """Construct a _Stream object.
    400         """
    401         self._extfileobj = True
    402         if fileobj is None:
    403             fileobj = _LowLevelFile(name, mode)
    404             self._extfileobj = False
    405 
    406         if comptype == '*':
    407             # Enable transparent compression detection for the
    408             # stream interface
    409             fileobj = _StreamProxy(fileobj)
    410             comptype = fileobj.getcomptype()
    411 
    412         self.name     = name or ""
    413         self.mode     = mode
    414         self.comptype = comptype
    415         self.fileobj  = fileobj
    416         self.bufsize  = bufsize
    417         self.buf      = ""
    418         self.pos      = 0L
    419         self.closed   = False
    420 
    421         if comptype == "gz":
    422             try:
    423                 import zlib
    424             except ImportError:
    425                 raise CompressionError("zlib module is not available")
    426             self.zlib = zlib
    427             self.crc = zlib.crc32("") & 0xffffffffL
    428             if mode == "r":
    429                 self._init_read_gz()
    430             else:
    431                 self._init_write_gz()
    432 
    433         if comptype == "bz2":
    434             try:
    435                 import bz2
    436             except ImportError:
    437                 raise CompressionError("bz2 module is not available")
    438             if mode == "r":
    439                 self.dbuf = ""
    440                 self.cmp = bz2.BZ2Decompressor()
    441             else:
    442                 self.cmp = bz2.BZ2Compressor()
    443 
    444     def __del__(self):
    445         if hasattr(self, "closed") and not self.closed:
    446             self.close()
    447 
    448     def _init_write_gz(self):
    449         """Initialize for writing with gzip compression.
    450         """
    451         self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
    452                                             -self.zlib.MAX_WBITS,
    453                                             self.zlib.DEF_MEM_LEVEL,
    454                                             0)
    455         timestamp = struct.pack("<L", long(time.time()))
    456         self.__write("\037\213\010\010%s\002\377" % timestamp)
    457         if type(self.name) is unicode:
    458             self.name = self.name.encode("iso-8859-1", "replace")
    459         if self.name.endswith(".gz"):
    460             self.name = self.name[:-3]
    461         self.__write(self.name + NUL)
    462 
    463     def write(self, s):
    464         """Write string s to the stream.
    465         """
    466         if self.comptype == "gz":
    467             self.crc = self.zlib.crc32(s, self.crc) & 0xffffffffL
    468         self.pos += len(s)
    469         if self.comptype != "tar":
    470             s = self.cmp.compress(s)
    471         self.__write(s)
    472 
    473     def __write(self, s):
    474         """Write string s to the stream if a whole new block
    475            is ready to be written.
    476         """
    477         self.buf += s
    478         while len(self.buf) > self.bufsize:
    479             self.fileobj.write(self.buf[:self.bufsize])
    480             self.buf = self.buf[self.bufsize:]
    481 
    482     def close(self):
    483         """Close the _Stream object. No operation should be
    484            done on it afterwards.
    485         """
    486         if self.closed:
    487             return
    488 
    489         if self.mode == "w" and self.comptype != "tar":
    490             self.buf += self.cmp.flush()
    491 
    492         if self.mode == "w" and self.buf:
    493             self.fileobj.write(self.buf)
    494             self.buf = ""
    495             if self.comptype == "gz":
    496                 # The native zlib crc is an unsigned 32-bit integer, but
    497                 # the Python wrapper implicitly casts that to a signed C
    498                 # long.  So, on a 32-bit box self.crc may "look negative",
    499                 # while the same crc on a 64-bit box may "look positive".
    500                 # To avoid irksome warnings from the `struct` module, force
    501                 # it to look positive on all boxes.
    502                 self.fileobj.write(struct.pack("<L", self.crc & 0xffffffffL))
    503                 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFFL))
    504 
    505         if not self._extfileobj:
    506             self.fileobj.close()
    507 
    508         self.closed = True
    509 
    510     def _init_read_gz(self):
    511         """Initialize for reading a gzip compressed fileobj.
    512         """
    513         self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
    514         self.dbuf = ""
    515 
    516         # taken from gzip.GzipFile with some alterations
    517         if self.__read(2) != "\037\213":
    518             raise ReadError("not a gzip file")
    519         if self.__read(1) != "\010":
    520             raise CompressionError("unsupported compression method")
    521 
    522         flag = ord(self.__read(1))
    523         self.__read(6)
    524 
    525         if flag & 4:
    526             xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
    527             self.read(xlen)
    528         if flag & 8:
    529             while True:
    530                 s = self.__read(1)
    531                 if not s or s == NUL:
    532                     break
    533         if flag & 16:
    534             while True:
    535                 s = self.__read(1)
    536                 if not s or s == NUL:
    537                     break
    538         if flag & 2:
    539             self.__read(2)
    540 
    541     def tell(self):
    542         """Return the stream's file pointer position.
    543         """
    544         return self.pos
    545 
    546     def seek(self, pos=0):
    547         """Set the stream's file pointer to pos. Negative seeking
    548            is forbidden.
    549         """
    550         if pos - self.pos >= 0:
    551             blocks, remainder = divmod(pos - self.pos, self.bufsize)
    552             for i in xrange(blocks):
    553                 self.read(self.bufsize)
    554             self.read(remainder)
    555         else:
    556             raise StreamError("seeking backwards is not allowed")
    557         return self.pos
    558 
    559     def read(self, size=None):
    560         """Return the next size number of bytes from the stream.
    561            If size is not defined, return all bytes of the stream
    562            up to EOF.
    563         """
    564         if size is None:
    565             t = []
    566             while True:
    567                 buf = self._read(self.bufsize)
    568                 if not buf:
    569                     break
    570                 t.append(buf)
    571             buf = "".join(t)
    572         else:
    573             buf = self._read(size)
    574         self.pos += len(buf)
    575         return buf
    576 
    577     def _read(self, size):
    578         """Return size bytes from the stream.
    579         """
    580         if self.comptype == "tar":
    581             return self.__read(size)
    582 
    583         c = len(self.dbuf)
    584         t = [self.dbuf]
    585         while c < size:
    586             buf = self.__read(self.bufsize)
    587             if not buf:
    588                 break
    589             try:
    590                 buf = self.cmp.decompress(buf)
    591             except IOError:
    592                 raise ReadError("invalid compressed data")
    593             t.append(buf)
    594             c += len(buf)
    595         t = "".join(t)
    596         self.dbuf = t[size:]
    597         return t[:size]
    598 
    599     def __read(self, size):
    600         """Return size bytes from stream. If internal buffer is empty,
    601            read another block from the stream.
    602         """
    603         c = len(self.buf)
    604         t = [self.buf]
    605         while c < size:
    606             buf = self.fileobj.read(self.bufsize)
    607             if not buf:
    608                 break
    609             t.append(buf)
    610             c += len(buf)
    611         t = "".join(t)
    612         self.buf = t[size:]
    613         return t[:size]
    614 # class _Stream
    615 
    616 class _StreamProxy(object):
    617     """Small proxy class that enables transparent compression
    618        detection for the Stream interface (mode 'r|*').
    619     """
    620 
    621     def __init__(self, fileobj):
    622         self.fileobj = fileobj
    623         self.buf = self.fileobj.read(BLOCKSIZE)
    624 
    625     def read(self, size):
    626         self.read = self.fileobj.read
    627         return self.buf
    628 
    629     def getcomptype(self):
    630         if self.buf.startswith("\037\213\010"):
    631             return "gz"
    632         if self.buf[0:3] == "BZh" and self.buf[4:10] == "1AY&SY":
    633             return "bz2"
    634         return "tar"
    635 
    636     def close(self):
    637         self.fileobj.close()
    638 # class StreamProxy
    639 
    640 class _BZ2Proxy(object):
    641     """Small proxy class that enables external file object
    642        support for "r:bz2" and "w:bz2" modes. This is actually
    643        a workaround for a limitation in bz2 module's BZ2File
    644        class which (unlike gzip.GzipFile) has no support for
    645        a file object argument.
    646     """
    647 
    648     blocksize = 16 * 1024
    649 
    650     def __init__(self, fileobj, mode):
    651         self.fileobj = fileobj
    652         self.mode = mode
    653         self.name = getattr(self.fileobj, "name", None)
    654         self.init()
    655 
    656     def init(self):
    657         import bz2
    658         self.pos = 0
    659         if self.mode == "r":
    660             self.bz2obj = bz2.BZ2Decompressor()
    661             self.fileobj.seek(0)
    662             self.buf = ""
    663         else:
    664             self.bz2obj = bz2.BZ2Compressor()
    665 
    666     def read(self, size):
    667         b = [self.buf]
    668         x = len(self.buf)
    669         while x < size:
    670             raw = self.fileobj.read(self.blocksize)
    671             if not raw:
    672                 break
    673             data = self.bz2obj.decompress(raw)
    674             b.append(data)
    675             x += len(data)
    676         self.buf = "".join(b)
    677 
    678         buf = self.buf[:size]
    679         self.buf = self.buf[size:]
    680         self.pos += len(buf)
    681         return buf
    682 
    683     def seek(self, pos):
    684         if pos < self.pos:
    685             self.init()
    686         self.read(pos - self.pos)
    687 
    688     def tell(self):
    689         return self.pos
    690 
    691     def write(self, data):
    692         self.pos += len(data)
    693         raw = self.bz2obj.compress(data)
    694         self.fileobj.write(raw)
    695 
    696     def close(self):
    697         if self.mode == "w":
    698             raw = self.bz2obj.flush()
    699             self.fileobj.write(raw)
    700 # class _BZ2Proxy
    701 
    702 #------------------------
    703 # Extraction file object
    704 #------------------------
    705 class _FileInFile(object):
    706     """A thin wrapper around an existing file object that
    707        provides a part of its data as an individual file
    708        object.
    709     """
    710 
    711     def __init__(self, fileobj, offset, size, sparse=None):
    712         self.fileobj = fileobj
    713         self.offset = offset
    714         self.size = size
    715         self.sparse = sparse
    716         self.position = 0
    717 
    718     def tell(self):
    719         """Return the current file position.
    720         """
    721         return self.position
    722 
    723     def seek(self, position):
    724         """Seek to a position in the file.
    725         """
    726         self.position = position
    727 
    728     def read(self, size=None):
    729         """Read data from the file.
    730         """
    731         if size is None:
    732             size = self.size - self.position
    733         else:
    734             size = min(size, self.size - self.position)
    735 
    736         if self.sparse is None:
    737             return self.readnormal(size)
    738         else:
    739             return self.readsparse(size)
    740 
    741     def readnormal(self, size):
    742         """Read operation for regular files.
    743         """
    744         self.fileobj.seek(self.offset + self.position)
    745         self.position += size
    746         return self.fileobj.read(size)
    747 
    748     def readsparse(self, size):
    749         """Read operation for sparse files.
    750         """
    751         data = []
    752         while size > 0:
    753             buf = self.readsparsesection(size)
    754             if not buf:
    755                 break
    756             size -= len(buf)
    757             data.append(buf)
    758         return "".join(data)
    759 
    760     def readsparsesection(self, size):
    761         """Read a single section of a sparse file.
    762         """
    763         section = self.sparse.find(self.position)
    764 
    765         if section is None:
    766             return ""
    767 
    768         size = min(size, section.offset + section.size - self.position)
    769 
    770         if isinstance(section, _data):
    771             realpos = section.realpos + self.position - section.offset
    772             self.fileobj.seek(self.offset + realpos)
    773             self.position += size
    774             return self.fileobj.read(size)
    775         else:
    776             self.position += size
    777             return NUL * size
    778 #class _FileInFile
    779 
    780 
    781 class ExFileObject(object):
    782     """File-like object for reading an archive member.
    783        Is returned by TarFile.extractfile().
    784     """
    785     blocksize = 1024
    786 
    def __init__(self, tarfile, tarinfo):
        """Set up reading of the member tarinfo from the file object
           of tarfile.
        """
        # View of the member's data inside the archive.
        self.fileobj = _FileInFile(
            tarfile.fileobj,
            tarinfo.offset_data,
            tarinfo.size,
            getattr(tarinfo, "sparse", None),
        )

        # Attributes that mimic those of a regular file object.
        self.name = tarinfo.name
        self.mode = "r"
        self.closed = False
        self.size = tarinfo.size

        # Read state: position in the member, and data that was read
        # from fileobj but not handed out yet.
        self.position = 0
        self.buffer = ""
    799 
    800     def read(self, size=None):
    801         """Read at most size bytes from the file. If size is not
    802            present or None, read all data until EOF is reached.
    803         """
    804         if self.closed:
    805             raise ValueError("I/O operation on closed file")
    806 
    807         buf = ""
    808         if self.buffer:
    809             if size is None:
    810                 buf = self.buffer
    811                 self.buffer = ""
    812             else:
    813                 buf = self.buffer[:size]
    814                 self.buffer = self.buffer[size:]
    815 
    816         if size is None:
    817             buf += self.fileobj.read()
    818         else:
    819             buf += self.fileobj.read(size - len(buf))
    820 
    821         self.position += len(buf)
    822         return buf
    823 
    824     def readline(self, size=-1):
    825         """Read one entire line from the file. If size is present
    826            and non-negative, return a string with at most that
    827            size, which may be an incomplete line.
    828         """
    829         if self.closed:
    830             raise ValueError("I/O operation on closed file")
    831 
    832         if "\n" in self.buffer:
    833             pos = self.buffer.find("\n") + 1
    834         else:
    835             buffers = [self.buffer]
    836             while True:
    837                 buf = self.fileobj.read(self.blocksize)
    838                 buffers.append(buf)
    839                 if not buf or "\n" in buf:
    840                     self.buffer = "".join(buffers)
    841                     pos = self.buffer.find("\n") + 1
    842                     if pos == 0:
    843                         # no newline found.
    844                         pos = len(self.buffer)
    845                     break
    846 
    847         if size != -1:
    848             pos = min(size, pos)
    849 
    850         buf = self.buffer[:pos]
    851         self.buffer = self.buffer[pos:]
    852         self.position += len(buf)
    853         return buf
    854 
    855     def readlines(self):
    856         """Return a list with all remaining lines.
    857         """
    858         result = []
    859         while True:
    860             line = self.readline()
    861             if not line: break
    862             result.append(line)
    863         return result
    864 
    865     def tell(self):
    866         """Return the current file position.
    867         """
    868         if self.closed:
    869             raise ValueError("I/O operation on closed file")
    870 
    871         return self.position
    872 
    873     def seek(self, pos, whence=os.SEEK_SET):
    874         """Seek to a position in the file.
    875         """
    876         if self.closed:
    877             raise ValueError("I/O operation on closed file")
    878 
    879         if whence == os.SEEK_SET:
    880             self.position = min(max(pos, 0), self.size)
    881         elif whence == os.SEEK_CUR:
    882             if pos < 0:
    883                 self.position = max(self.position + pos, 0)
    884             else:
    885                 self.position = min(self.position + pos, self.size)
    886         elif whence == os.SEEK_END:
    887             self.position = max(min(self.size + pos, self.size), 0)
    888         else:
    889             raise ValueError("Invalid argument")
    890 
    891         self.buffer = ""
    892         self.fileobj.seek(self.position)
    893 
    894     def close(self):
    895         """Close the file object.
    896         """
    897         self.closed = True
    898 
    899     def __iter__(self):
    900         """Get an iterator over the file's lines.
    901         """
    902         while True:
    903             line = self.readline()
    904             if not line:
    905                 break
    906             yield line
    907 #class ExFileObject
    908 
    909 #------------------
    910 # Exported Classes
    911 #------------------
    912 class TarInfo(object):
    913     """Informational class which holds the details about an
    914        archive member given by a tar header block.
    915        TarInfo objects are returned by TarFile.getmember(),
    916        TarFile.getmembers() and TarFile.gettarinfo() and are
    917        usually created internally.
    918     """
    919 
    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        # Fields that are stored in the header block.
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        # Positions inside the archive file.
        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here

        # pax header information
        self.pax_headers = {}
    942 
    # pax headers call the "name" field "path"; expose it under that
    # name as an alias for self.name.
    def _getpath(self):
        """Return the member name."""
        return self.name
    def _setpath(self, name):
        """Set the member name."""
        self.name = name
    path = property(_getpath, _setpath)
    950 
    # pax headers call the "linkname" field "linkpath"; expose it under
    # that name as an alias for self.linkname.
    def _getlinkpath(self):
        """Return the link name."""
        return self.linkname
    def _setlinkpath(self, linkname):
        """Set the link name."""
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)
    956 
    def __repr__(self):
        """Return "<ClassName 'member name' at 0xADDRESS>".
        """
        cls_name = self.__class__.__name__
        return "<%s %r at %#x>" % (cls_name, self.name, id(self))
    959 
    def get_info(self, encoding, errors):
        """Return the TarInfo's attributes as a dictionary.

           Directory names get a trailing slash, the mode is masked to
           its lowest 12 bits, and unicode text fields are encoded with
           `encoding' and the `errors' handler.
        """
        name = self.name
        if self.type == DIRTYPE and not name.endswith("/"):
            name += "/"

        info = dict(
            name=name,
            mode=self.mode & 0o7777,
            uid=self.uid,
            gid=self.gid,
            size=self.size,
            mtime=self.mtime,
            chksum=self.chksum,
            type=self.type,
            linkname=self.linkname,
            uname=self.uname,
            gname=self.gname,
            devmajor=self.devmajor,
            devminor=self.devminor,
        )

        # Header blocks hold byte strings, so unicode values are encoded.
        for key in ("name", "linkname", "uname", "gname"):
            text = info[key]
            if type(text) is unicode:
                info[key] = text.encode(encoding, errors)

        return info
    987 
    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"):
        """Return a tar header as a string of 512 byte blocks.

           `format' selects the header flavour (USTAR_FORMAT, GNU_FORMAT
           or PAX_FORMAT); any other value raises ValueError.
        """
        info = self.get_info(encoding, errors)

        if format == USTAR_FORMAT:
            return self.create_ustar_header(info)
        if format == GNU_FORMAT:
            return self.create_gnu_header(info)
        if format == PAX_FORMAT:
            return self.create_pax_header(info, encoding, errors)
        raise ValueError("invalid format")
   1001 
   1002     def create_ustar_header(self, info):
   1003         """Return the object as a ustar header block.
   1004         """
   1005         info["magic"] = POSIX_MAGIC
   1006 
   1007         if len(info["linkname"]) > LENGTH_LINK:
   1008             raise ValueError("linkname is too long")
   1009 
   1010         if len(info["name"]) > LENGTH_NAME:
   1011             info["prefix"], info["name"] = self._posix_split_name(info["name"])
   1012 
   1013         return self._create_header(info, USTAR_FORMAT)
   1014 
   1015     def create_gnu_header(self, info):
   1016         """Return the object as a GNU header block sequence.
   1017         """
   1018         info["magic"] = GNU_MAGIC
   1019 
   1020         buf = ""
   1021         if len(info["linkname"]) > LENGTH_LINK:
   1022             buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK)
   1023 
   1024         if len(info["name"]) > LENGTH_NAME:
   1025             buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME)
   1026 
   1027         return buf + self._create_header(info, GNU_FORMAT)
   1028 
   1029     def create_pax_header(self, info, encoding, errors):
   1030         """Return the object as a ustar header block. If it cannot be
   1031            represented this way, prepend a pax extended header sequence
   1032            with supplement information.
   1033         """
   1034         info["magic"] = POSIX_MAGIC
   1035         pax_headers = self.pax_headers.copy()
   1036 
   1037         # Test string fields for values that exceed the field length or cannot
   1038         # be represented in ASCII encoding.
   1039         for name, hname, length in (
   1040                 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
   1041                 ("uname", "uname", 32), ("gname", "gname", 32)):
   1042 
   1043             if hname in pax_headers:
   1044                 # The pax header has priority.
   1045                 continue
   1046 
   1047             val = info[name].decode(encoding, errors)
   1048 
   1049             # Try to encode the string as ASCII.
   1050             try:
   1051                 val.encode("ascii")
   1052             except UnicodeEncodeError:
   1053                 pax_headers[hname] = val
   1054                 continue
   1055 
   1056             if len(info[name]) > length:
   1057                 pax_headers[hname] = val
   1058 
   1059         # Test number fields for values that exceed the field limit or values
   1060         # that like to be stored as float.
   1061         for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
   1062             if name in pax_headers:
   1063                 # The pax header has priority. Avoid overflow.
   1064                 info[name] = 0
   1065                 continue
   1066 
   1067             val = info[name]
   1068             if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
   1069                 pax_headers[name] = unicode(val)
   1070                 info[name] = 0
   1071 
   1072         # Create a pax extended header if necessary.
   1073         if pax_headers:
   1074             buf = self._create_pax_generic_header(pax_headers)
   1075         else:
   1076             buf = ""
   1077 
   1078         return buf + self._create_header(info, USTAR_FORMAT)
   1079 
   1080     @classmethod
   1081     def create_pax_global_header(cls, pax_headers):
   1082         """Return the object as a pax global header block sequence.
   1083         """
   1084         return cls._create_pax_generic_header(pax_headers, type=XGLTYPE)
   1085 
   1086     def _posix_split_name(self, name):
   1087         """Split a name longer than 100 chars into a prefix
   1088            and a name part.
   1089         """
   1090         prefix = name[:LENGTH_PREFIX + 1]
   1091         while prefix and prefix[-1] != "/":
   1092             prefix = prefix[:-1]
   1093 
   1094         name = name[len(prefix):]
   1095         prefix = prefix[:-1]
   1096 
   1097         if not prefix or len(name) > LENGTH_NAME:
   1098             raise ValueError("name is too long")
   1099         return prefix, name
   1100 
   1101     @staticmethod
   1102     def _create_header(info, format):
   1103         """Return a header block. info is a dictionary with file
   1104            information, format must be one of the *_FORMAT constants.
   1105         """
   1106         parts = [
   1107             stn(info.get("name", ""), 100),
   1108             itn(info.get("mode", 0) & 07777, 8, format),
   1109             itn(info.get("uid", 0), 8, format),
   1110             itn(info.get("gid", 0), 8, format),
   1111             itn(info.get("size", 0), 12, format),
   1112             itn(info.get("mtime", 0), 12, format),
   1113             "        ", # checksum field
   1114             info.get("type", REGTYPE),
   1115             stn(info.get("linkname", ""), 100),
   1116             stn(info.get("magic", POSIX_MAGIC), 8),
   1117             stn(info.get("uname", ""), 32),
   1118             stn(info.get("gname", ""), 32),
   1119             itn(info.get("devmajor", 0), 8, format),
   1120             itn(info.get("devminor", 0), 8, format),
   1121             stn(info.get("prefix", ""), 155)
   1122         ]
   1123 
   1124         buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts))
   1125         chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
   1126         buf = buf[:-364] + "%06o\0" % chksum + buf[-357:]
   1127         return buf
   1128 
   1129     @staticmethod
   1130     def _create_payload(payload):
   1131         """Return the string payload filled with zero bytes
   1132            up to the next 512 byte border.
   1133         """
   1134         blocks, remainder = divmod(len(payload), BLOCKSIZE)
   1135         if remainder > 0:
   1136             payload += (BLOCKSIZE - remainder) * NUL
   1137         return payload
   1138 
   1139     @classmethod
   1140     def _create_gnu_long_header(cls, name, type):
   1141         """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
   1142            for name.
   1143         """
   1144         name += NUL
   1145 
   1146         info = {}
   1147         info["name"] = "././@LongLink"
   1148         info["type"] = type
   1149         info["size"] = len(name)
   1150         info["magic"] = GNU_MAGIC
   1151 
   1152         # create extended header + name blocks.
   1153         return cls._create_header(info, USTAR_FORMAT) + \
   1154                 cls._create_payload(name)
   1155 
   1156     @classmethod
   1157     def _create_pax_generic_header(cls, pax_headers, type=XHDTYPE):
   1158         """Return a POSIX.1-2001 extended or global header sequence
   1159            that contains a list of keyword, value pairs. The values
   1160            must be unicode objects.
   1161         """
   1162         records = []
   1163         for keyword, value in pax_headers.iteritems():
   1164             keyword = keyword.encode("utf8")
   1165             value = value.encode("utf8")
   1166             l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
   1167             n = p = 0
   1168             while True:
   1169                 n = l + len(str(p))
   1170                 if n == p:
   1171                     break
   1172                 p = n
   1173             records.append("%d %s=%s\n" % (p, keyword, value))
   1174         records = "".join(records)
   1175 
   1176         # We use a hardcoded "././@PaxHeader" name like star does
   1177         # instead of the one that POSIX recommends.
   1178         info = {}
   1179         info["name"] = "././@PaxHeader"
   1180         info["type"] = type
   1181         info["size"] = len(records)
   1182         info["magic"] = POSIX_MAGIC
   1183 
   1184         # Create pax header + record blocks.
   1185         return cls._create_header(info, USTAR_FORMAT) + \
   1186                 cls._create_payload(records)
   1187 
   1188     @classmethod
   1189     def frombuf(cls, buf):
   1190         """Construct a TarInfo object from a 512 byte string buffer.
   1191         """
   1192         if len(buf) == 0:
   1193             raise EmptyHeaderError("empty header")
   1194         if len(buf) != BLOCKSIZE:
   1195             raise TruncatedHeaderError("truncated header")
   1196         if buf.count(NUL) == BLOCKSIZE:
   1197             raise EOFHeaderError("end of file header")
   1198 
   1199         chksum = nti(buf[148:156])
   1200         if chksum not in calc_chksums(buf):
   1201             raise InvalidHeaderError("bad checksum")
   1202 
   1203         obj = cls()
   1204         obj.buf = buf
   1205         obj.name = nts(buf[0:100])
   1206         obj.mode = nti(buf[100:108])
   1207         obj.uid = nti(buf[108:116])
   1208         obj.gid = nti(buf[116:124])
   1209         obj.size = nti(buf[124:136])
   1210         obj.mtime = nti(buf[136:148])
   1211         obj.chksum = chksum
   1212         obj.type = buf[156:157]
   1213         obj.linkname = nts(buf[157:257])
   1214         obj.uname = nts(buf[265:297])
   1215         obj.gname = nts(buf[297:329])
   1216         obj.devmajor = nti(buf[329:337])
   1217         obj.devminor = nti(buf[337:345])
   1218         prefix = nts(buf[345:500])
   1219 
   1220         # Old V7 tar format represents a directory as a regular
   1221         # file with a trailing slash.
   1222         if obj.type == AREGTYPE and obj.name.endswith("/"):
   1223             obj.type = DIRTYPE
   1224 
   1225         # Remove redundant slashes from directories.
   1226         if obj.isdir():
   1227             obj.name = obj.name.rstrip("/")
   1228 
   1229         # Reconstruct a ustar longname.
   1230         if prefix and obj.type not in GNU_TYPES:
   1231             obj.name = prefix + "/" + obj.name
   1232         return obj
   1233 
   1234     @classmethod
   1235     def fromtarfile(cls, tarfile):
   1236         """Return the next TarInfo object from TarFile object
   1237            tarfile.
   1238         """
   1239         buf = tarfile.fileobj.read(BLOCKSIZE)
   1240         obj = cls.frombuf(buf)
   1241         obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
   1242         return obj._proc_member(tarfile)
   1243 
   1244     #--------------------------------------------------------------------------
   1245     # The following are methods that are called depending on the type of a
   1246     # member. The entry point is _proc_member() which can be overridden in a
   1247     # subclass to add custom _proc_*() methods. A _proc_*() method MUST
   1248     # implement the following
   1249     # operations:
   1250     # 1. Set self.offset_data to the position where the data blocks begin,
   1251     #    if there is data that follows.
   1252     # 2. Set tarfile.offset to the position where the next member's header will
   1253     #    begin.
   1254     # 3. Return self or another valid TarInfo object.
   1255     def _proc_member(self, tarfile):
   1256         """Choose the right processing method depending on
   1257            the type and call it.
   1258         """
   1259         if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
   1260             return self._proc_gnulong(tarfile)
   1261         elif self.type == GNUTYPE_SPARSE:
   1262             return self._proc_sparse(tarfile)
   1263         elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
   1264             return self._proc_pax(tarfile)
   1265         else:
   1266             return self._proc_builtin(tarfile)
   1267 
   1268     def _proc_builtin(self, tarfile):
   1269         """Process a builtin type or an unknown type which
   1270            will be treated as a regular file.
   1271         """
   1272         self.offset_data = tarfile.fileobj.tell()
   1273         offset = self.offset_data
   1274         if self.isreg() or self.type not in SUPPORTED_TYPES:
   1275             # Skip the following data blocks.
   1276             offset += self._block(self.size)
   1277         tarfile.offset = offset
   1278 
   1279         # Patch the TarInfo object with saved global
   1280         # header information.
   1281         self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
   1282 
   1283         return self
   1284 
   1285     def _proc_gnulong(self, tarfile):
   1286         """Process the blocks that hold a GNU longname
   1287            or longlink member.
   1288         """
   1289         buf = tarfile.fileobj.read(self._block(self.size))
   1290 
   1291         # Fetch the next header and process it.
   1292         try:
   1293             next = self.fromtarfile(tarfile)
   1294         except HeaderError:
   1295             raise SubsequentHeaderError("missing or bad subsequent header")
   1296 
   1297         # Patch the TarInfo object from the next header with
   1298         # the longname information.
   1299         next.offset = self.offset
   1300         if self.type == GNUTYPE_LONGNAME:
   1301             next.name = nts(buf)
   1302         elif self.type == GNUTYPE_LONGLINK:
   1303             next.linkname = nts(buf)
   1304 
   1305         return next
   1306 
   1307     def _proc_sparse(self, tarfile):
   1308         """Process a GNU sparse header plus extra headers.
   1309         """
   1310         buf = self.buf
   1311         sp = _ringbuffer()
   1312         pos = 386
   1313         lastpos = 0L
   1314         realpos = 0L
   1315         # There are 4 possible sparse structs in the
   1316         # first header.
   1317         for i in xrange(4):
   1318             try:
   1319                 offset = nti(buf[pos:pos + 12])
   1320                 numbytes = nti(buf[pos + 12:pos + 24])
   1321             except ValueError:
   1322                 break
   1323             if offset > lastpos:
   1324                 sp.append(_hole(lastpos, offset - lastpos))
   1325             sp.append(_data(offset, numbytes, realpos))
   1326             realpos += numbytes
   1327             lastpos = offset + numbytes
   1328             pos += 24
   1329 
   1330         isextended = ord(buf[482])
   1331         origsize = nti(buf[483:495])
   1332 
   1333         # If the isextended flag is given,
   1334         # there are extra headers to process.
   1335         while isextended == 1:
   1336             buf = tarfile.fileobj.read(BLOCKSIZE)
   1337             pos = 0
   1338             for i in xrange(21):
   1339                 try:
   1340                     offset = nti(buf[pos:pos + 12])
   1341                     numbytes = nti(buf[pos + 12:pos + 24])
   1342                 except ValueError:
   1343                     break
   1344                 if offset > lastpos:
   1345                     sp.append(_hole(lastpos, offset - lastpos))
   1346                 sp.append(_data(offset, numbytes, realpos))
   1347                 realpos += numbytes
   1348                 lastpos = offset + numbytes
   1349                 pos += 24
   1350             isextended = ord(buf[504])
   1351 
   1352         if lastpos < origsize:
   1353             sp.append(_hole(lastpos, origsize - lastpos))
   1354 
   1355         self.sparse = sp
   1356 
   1357         self.offset_data = tarfile.fileobj.tell()
   1358         tarfile.offset = self.offset_data + self._block(self.size)
   1359         self.size = origsize
   1360 
   1361         return self
   1362 
   1363     def _proc_pax(self, tarfile):
   1364         """Process an extended or global header as described in
   1365            POSIX.1-2001.
   1366         """
   1367         # Read the header information.
   1368         buf = tarfile.fileobj.read(self._block(self.size))
   1369 
   1370         # A pax header stores supplemental information for either
   1371         # the following file (extended) or all following files
   1372         # (global).
   1373         if self.type == XGLTYPE:
   1374             pax_headers = tarfile.pax_headers
   1375         else:
   1376             pax_headers = tarfile.pax_headers.copy()
   1377 
   1378         # Parse pax header information. A record looks like that:
   1379         # "%d %s=%s\n" % (length, keyword, value). length is the size
   1380         # of the complete record including the length field itself and
   1381         # the newline. keyword and value are both UTF-8 encoded strings.
   1382         regex = re.compile(r"(\d+) ([^=]+)=", re.U)
   1383         pos = 0
   1384         while True:
   1385             match = regex.match(buf, pos)
   1386             if not match:
   1387                 break
   1388 
   1389             length, keyword = match.groups()
   1390             length = int(length)
   1391             value = buf[match.end(2) + 1:match.start(1) + length - 1]
   1392 
   1393             keyword = keyword.decode("utf8")
   1394             value = value.decode("utf8")
   1395 
   1396             pax_headers[keyword] = value
   1397             pos += length
   1398 
   1399         # Fetch the next header.
   1400         try:
   1401             next = self.fromtarfile(tarfile)
   1402         except HeaderError:
   1403             raise SubsequentHeaderError("missing or bad subsequent header")
   1404 
   1405         if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
   1406             # Patch the TarInfo object with the extended header info.
   1407             next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
   1408             next.offset = self.offset
   1409 
   1410             if "size" in pax_headers:
   1411                 # If the extended header replaces the size field,
   1412                 # we need to recalculate the offset where the next
   1413                 # header starts.
   1414                 offset = next.offset_data
   1415                 if next.isreg() or next.type not in SUPPORTED_TYPES:
   1416                     offset += next._block(next.size)
   1417                 tarfile.offset = offset
   1418 
   1419         return next
   1420 
   1421     def _apply_pax_info(self, pax_headers, encoding, errors):
   1422         """Replace fields with supplemental information from a previous
   1423            pax extended or global header.
   1424         """
   1425         for keyword, value in pax_headers.iteritems():
   1426             if keyword not in PAX_FIELDS:
   1427                 continue
   1428 
   1429             if keyword == "path":
   1430                 value = value.rstrip("/")
   1431 
   1432             if keyword in PAX_NUMBER_FIELDS:
   1433                 try:
   1434                     value = PAX_NUMBER_FIELDS[keyword](value)
   1435                 except ValueError:
   1436                     value = 0
   1437             else:
   1438                 value = uts(value, encoding, errors)
   1439 
   1440             setattr(self, keyword, value)
   1441 
   1442         self.pax_headers = pax_headers.copy()
   1443 
   1444     def _block(self, count):
   1445         """Round up a byte count by BLOCKSIZE and return it,
   1446            e.g. _block(834) => 1024.
   1447         """
   1448         blocks, remainder = divmod(count, BLOCKSIZE)
   1449         if remainder:
   1450             blocks += 1
   1451         return blocks * BLOCKSIZE
   1452 
   1453     def isreg(self):
   1454         return self.type in REGULAR_TYPES
   1455     def isfile(self):
   1456         return self.isreg()
   1457     def isdir(self):
   1458         return self.type == DIRTYPE
   1459     def issym(self):
   1460         return self.type == SYMTYPE
   1461     def islnk(self):
   1462         return self.type == LNKTYPE
   1463     def ischr(self):
   1464         return self.type == CHRTYPE
   1465     def isblk(self):
   1466         return self.type == BLKTYPE
   1467     def isfifo(self):
   1468         return self.type == FIFOTYPE
   1469     def issparse(self):
   1470         return self.type == GNUTYPE_SPARSE
   1471     def isdev(self):
   1472         return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
   1473 # class TarInfo
   1474 
   1475 class TarFile(object):
   1476     """The TarFile Class provides an interface to tar archives.
   1477     """
   1478 
   1479     debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)
   1480 
   1481     dereference = False         # If true, add content of linked file to the
   1482                                 # tar file, else the link.
   1483 
   1484     ignore_zeros = False        # If true, skips empty or invalid blocks and
   1485                                 # continues processing.
   1486 
   1487     errorlevel = 1              # If 0, fatal errors only appear in debug
   1488                                 # messages (if debug >= 0). If > 0, errors
   1489                                 # are passed to the caller as exceptions.
   1490 
   1491     format = DEFAULT_FORMAT     # The format to use when creating an archive.
   1492 
   1493     encoding = ENCODING         # Encoding for 8-bit character strings.
   1494 
   1495     errors = None               # Error handler for unicode conversion.
   1496 
   1497     tarinfo = TarInfo           # The default TarInfo class to use.
   1498 
   1499     fileobject = ExFileObject   # The default ExFileObject class to use.
   1500 
   1501     def __init__(self, name=None, mode="r", fileobj=None, format=None,
   1502             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
   1503             errors=None, pax_headers=None, debug=None, errorlevel=None):
   1504         """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
   1505            read from an existing archive, 'a' to append data to an existing
   1506            file or 'w' to create a new file overwriting an existing one. `mode'
   1507            defaults to 'r'.
   1508            If `fileobj' is given, it is used for reading or writing data. If it
   1509            can be determined, `mode' is overridden by `fileobj's mode.
   1510            `fileobj' is not closed, when TarFile is closed.
   1511         """
   1512         if len(mode) > 1 or mode not in "raw":
   1513             raise ValueError("mode must be 'r', 'a' or 'w'")
   1514         self.mode = mode
   1515         self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
   1516 
   1517         if not fileobj:
   1518             if self.mode == "a" and not os.path.exists(name):
   1519                 # Create nonexistent files in append mode.
   1520                 self.mode = "w"
   1521                 self._mode = "wb"
   1522             fileobj = bltn_open(name, self._mode)
   1523             self._extfileobj = False
   1524         else:
   1525             if name is None and hasattr(fileobj, "name"):
   1526                 name = fileobj.name
   1527             if hasattr(fileobj, "mode"):
   1528                 self._mode = fileobj.mode
   1529             self._extfileobj = True
   1530         self.name = os.path.abspath(name) if name else None
   1531         self.fileobj = fileobj
   1532 
   1533         # Init attributes.
   1534         if format is not None:
   1535             self.format = format
   1536         if tarinfo is not None:
   1537             self.tarinfo = tarinfo
   1538         if dereference is not None:
   1539             self.dereference = dereference
   1540         if ignore_zeros is not None:
   1541             self.ignore_zeros = ignore_zeros
   1542         if encoding is not None:
   1543             self.encoding = encoding
   1544 
   1545         if errors is not None:
   1546             self.errors = errors
   1547         elif mode == "r":
   1548             self.errors = "utf-8"
   1549         else:
   1550             self.errors = "strict"
   1551 
   1552         if pax_headers is not None and self.format == PAX_FORMAT:
   1553             self.pax_headers = pax_headers
   1554         else:
   1555             self.pax_headers = {}
   1556 
   1557         if debug is not None:
   1558             self.debug = debug
   1559         if errorlevel is not None:
   1560             self.errorlevel = errorlevel
   1561 
   1562         # Init datastructures.
   1563         self.closed = False
   1564         self.members = []       # list of members as TarInfo objects
   1565         self._loaded = False    # flag if all members have been read
   1566         self.offset = self.fileobj.tell()
   1567                                 # current position in the archive file
   1568         self.inodes = {}        # dictionary caching the inodes of
   1569                                 # archive members already added
   1570 
   1571         try:
   1572             if self.mode == "r":
   1573                 self.firstmember = None
   1574                 self.firstmember = self.next()
   1575 
   1576             if self.mode == "a":
   1577                 # Move to the end of the archive,
   1578                 # before the first empty block.
   1579                 while True:
   1580                     self.fileobj.seek(self.offset)
   1581                     try:
   1582                         tarinfo = self.tarinfo.fromtarfile(self)
   1583                         self.members.append(tarinfo)
   1584                     except EOFHeaderError:
   1585                         self.fileobj.seek(self.offset)
   1586                         break
   1587                     except HeaderError, e:
   1588                         raise ReadError(str(e))
   1589 
   1590             if self.mode in "aw":
   1591                 self._loaded = True
   1592 
   1593                 if self.pax_headers:
   1594                     buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
   1595                     self.fileobj.write(buf)
   1596                     self.offset += len(buf)
   1597         except:
   1598             if not self._extfileobj:
   1599                 self.fileobj.close()
   1600             self.closed = True
   1601             raise
   1602 
   1603     def _getposix(self):
   1604         return self.format == USTAR_FORMAT
   1605     def _setposix(self, value):
   1606         import warnings
   1607         warnings.warn("use the format attribute instead", DeprecationWarning,
   1608                       2)
   1609         if value:
   1610             self.format = USTAR_FORMAT
   1611         else:
   1612             self.format = GNU_FORMAT
   1613     posix = property(_getposix, _setposix)
   1614 
   1615     #--------------------------------------------------------------------------
   1616     # Below are the classmethods which act as alternate constructors to the
   1617     # TarFile class. The open() method is the only one that is needed for
   1618     # public use; it is the "super"-constructor and is able to select an
   1619     # adequate "sub"-constructor for a particular compression using the mapping
   1620     # from OPEN_METH.
   1621     #
   1622     # This concept allows one to subclass TarFile without losing the comfort of
   1623     # the super-constructor. A sub-constructor is registered and made available
   1624     # by adding it to the mapping in OPEN_METH.
   1625 
   1626     @classmethod
   1627     def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
   1628         """Open a tar archive for reading, writing or appending. Return
   1629            an appropriate TarFile class.
   1630 
   1631            mode:
   1632            'r' or 'r:*' open for reading with transparent compression
   1633            'r:'         open for reading exclusively uncompressed
   1634            'r:gz'       open for reading with gzip compression
   1635            'r:bz2'      open for reading with bzip2 compression
   1636            'a' or 'a:'  open for appending, creating the file if necessary
   1637            'w' or 'w:'  open for writing without compression
   1638            'w:gz'       open for writing with gzip compression
   1639            'w:bz2'      open for writing with bzip2 compression
   1640 
   1641            'r|*'        open a stream of tar blocks with transparent compression
   1642            'r|'         open an uncompressed stream of tar blocks for reading
   1643            'r|gz'       open a gzip compressed stream of tar blocks
   1644            'r|bz2'      open a bzip2 compressed stream of tar blocks
   1645            'w|'         open an uncompressed stream for writing
   1646            'w|gz'       open a gzip compressed stream for writing
   1647            'w|bz2'      open a bzip2 compressed stream for writing
   1648         """
   1649 
   1650         if not name and not fileobj:
   1651             raise ValueError("nothing to open")
   1652 
   1653         if mode in ("r", "r:*"):
   1654             # Find out which *open() is appropriate for opening the file.
   1655             for comptype in cls.OPEN_METH:
   1656                 func = getattr(cls, cls.OPEN_METH[comptype])
   1657                 if fileobj is not None:
   1658                     saved_pos = fileobj.tell()
   1659                 try:
   1660                     return func(name, "r", fileobj, **kwargs)
   1661                 except (ReadError, CompressionError), e:
   1662                     if fileobj is not None:
   1663                         fileobj.seek(saved_pos)
   1664                     continue
   1665             raise ReadError("file could not be opened successfully")
   1666 
   1667         elif ":" in mode:
   1668             filemode, comptype = mode.split(":", 1)
   1669             filemode = filemode or "r"
   1670             comptype = comptype or "tar"
   1671 
   1672             # Select the *open() function according to
   1673             # given compression.
   1674             if comptype in cls.OPEN_METH:
   1675                 func = getattr(cls, cls.OPEN_METH[comptype])
   1676             else:
   1677                 raise CompressionError("unknown compression type %r" % comptype)
   1678             return func(name, filemode, fileobj, **kwargs)
   1679 
   1680         elif "|" in mode:
   1681             filemode, comptype = mode.split("|", 1)
   1682             filemode = filemode or "r"
   1683             comptype = comptype or "tar"
   1684 
   1685             if filemode not in "rw":
   1686                 raise ValueError("mode must be 'r' or 'w'")
   1687 
   1688             t = cls(name, filemode,
   1689                     _Stream(name, filemode, comptype, fileobj, bufsize),
   1690                     **kwargs)
   1691             t._extfileobj = False
   1692             return t
   1693 
   1694         elif mode in "aw":
   1695             return cls.taropen(name, mode, fileobj, **kwargs)
   1696 
   1697         raise ValueError("undiscernible mode")
   1698 
   1699     @classmethod
   1700     def taropen(cls, name, mode="r", fileobj=None, **kwargs):
   1701         """Open uncompressed tar archive name for reading or writing.
   1702         """
   1703         if len(mode) > 1 or mode not in "raw":
   1704             raise ValueError("mode must be 'r', 'a' or 'w'")
   1705         return cls(name, mode, fileobj, **kwargs)
   1706 
   1707     @classmethod
   1708     def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
   1709         """Open gzip compressed tar archive name for reading or writing.
   1710            Appending is not allowed.
   1711         """
   1712         if len(mode) > 1 or mode not in "rw":
   1713             raise ValueError("mode must be 'r' or 'w'")
   1714 
   1715         try:
   1716             import gzip
   1717             gzip.GzipFile
   1718         except (ImportError, AttributeError):
   1719             raise CompressionError("gzip module is not available")
   1720 
   1721         if fileobj is None:
   1722             fileobj = bltn_open(name, mode + "b")
   1723 
   1724         try:
   1725             t = cls.taropen(name, mode,
   1726                 gzip.GzipFile(name, mode, compresslevel, fileobj),
   1727                 **kwargs)
   1728         except IOError:
   1729             raise ReadError("not a gzip file")
   1730         t._extfileobj = False
   1731         return t
   1732 
   1733     @classmethod
   1734     def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
   1735         """Open bzip2 compressed tar archive name for reading or writing.
   1736            Appending is not allowed.
   1737         """
   1738         if len(mode) > 1 or mode not in "rw":
   1739             raise ValueError("mode must be 'r' or 'w'.")
   1740 
   1741         try:
   1742             import bz2
   1743         except ImportError:
   1744             raise CompressionError("bz2 module is not available")
   1745 
   1746         if fileobj is not None:
   1747             fileobj = _BZ2Proxy(fileobj, mode)
   1748         else:
   1749             fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)
   1750 
   1751         try:
   1752             t = cls.taropen(name, mode, fileobj, **kwargs)
   1753         except (IOError, EOFError):
   1754             raise ReadError("not a bzip2 file")
   1755         t._extfileobj = False
   1756         return t
   1757 
    # All *open() methods are registered here.
    # Each key is a compression type name; each value is the name of the
    # classmethod that opens an archive of that type.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open"    # bzip2 compressed tar
    }
   1764 
   1765     #--------------------------------------------------------------------------
   1766     # The public methods which TarFile provides:
   1767 
   1768     def close(self):
   1769         """Close the TarFile. In write-mode, two finishing zero blocks are
   1770            appended to the archive.
   1771         """
   1772         if self.closed:
   1773             return
   1774 
   1775         if self.mode in "aw":
   1776             self.fileobj.write(NUL * (BLOCKSIZE * 2))
   1777             self.offset += (BLOCKSIZE * 2)
   1778             # fill up the end with zero-blocks
   1779             # (like option -b20 for tar does)
   1780             blocks, remainder = divmod(self.offset, RECORDSIZE)
   1781             if remainder > 0:
   1782                 self.fileobj.write(NUL * (RECORDSIZE - remainder))
   1783 
   1784         if not self._extfileobj:
   1785             self.fileobj.close()
   1786         self.closed = True
   1787 
   1788     def getmember(self, name):
   1789         """Return a TarInfo object for member `name'. If `name' can not be
   1790            found in the archive, KeyError is raised. If a member occurs more
   1791            than once in the archive, its last occurrence is assumed to be the
   1792            most up-to-date version.
   1793         """
   1794         tarinfo = self._getmember(name)
   1795         if tarinfo is None:
   1796             raise KeyError("filename %r not found" % name)
   1797         return tarinfo
   1798 
   1799     def getmembers(self):
   1800         """Return the members of the archive as a list of TarInfo objects. The
   1801            list has the same order as the members in the archive.
   1802         """
   1803         self._check()
   1804         if not self._loaded:    # if we want to obtain a list of
   1805             self._load()        # all members, we first have to
   1806                                 # scan the whole archive.
   1807         return self.members
   1808 
   1809     def getnames(self):
   1810         """Return the members of the archive as a list of their names. It has
   1811            the same order as the list returned by getmembers().
   1812         """
   1813         return [tarinfo.name for tarinfo in self.getmembers()]
   1814 
   1815     def gettarinfo(self, name=None, arcname=None, fileobj=None):
   1816         """Create a TarInfo object for either the file `name' or the file
   1817            object `fileobj' (using os.fstat on its file descriptor). You can
   1818            modify some of the TarInfo's attributes before you add it using
   1819            addfile(). If given, `arcname' specifies an alternative name for the
   1820            file in the archive.
   1821         """
   1822         self._check("aw")
   1823 
   1824         # When fileobj is given, replace name by
   1825         # fileobj's real name.
   1826         if fileobj is not None:
   1827             name = fileobj.name
   1828 
   1829         # Building the name of the member in the archive.
   1830         # Backward slashes are converted to forward slashes,
   1831         # Absolute paths are turned to relative paths.
   1832         if arcname is None:
   1833             arcname = name
   1834         drv, arcname = os.path.splitdrive(arcname)
   1835         arcname = arcname.replace(os.sep, "/")
   1836         arcname = arcname.lstrip("/")
   1837 
   1838         # Now, fill the TarInfo object with
   1839         # information specific for the file.
   1840         tarinfo = self.tarinfo()
   1841         tarinfo.tarfile = self
   1842 
   1843         # Use os.stat or os.lstat, depending on platform
   1844         # and if symlinks shall be resolved.
   1845         if fileobj is None:
   1846             if hasattr(os, "lstat") and not self.dereference:
   1847                 statres = os.lstat(name)
   1848             else:
   1849                 statres = os.stat(name)
   1850         else:
   1851             statres = os.fstat(fileobj.fileno())
   1852         linkname = ""
   1853 
   1854         stmd = statres.st_mode
   1855         if stat.S_ISREG(stmd):
   1856             inode = (statres.st_ino, statres.st_dev)
   1857             if not self.dereference and statres.st_nlink > 1 and \
   1858                     inode in self.inodes and arcname != self.inodes[inode]:
   1859                 # Is it a hardlink to an already
   1860                 # archived file?
   1861                 type = LNKTYPE
   1862                 linkname = self.inodes[inode]
   1863             else:
   1864                 # The inode is added only if its valid.
   1865                 # For win32 it is always 0.
   1866                 type = REGTYPE
   1867                 if inode[0]:
   1868                     self.inodes[inode] = arcname
   1869         elif stat.S_ISDIR(stmd):
   1870             type = DIRTYPE
   1871         elif stat.S_ISFIFO(stmd):
   1872             type = FIFOTYPE
   1873         elif stat.S_ISLNK(stmd):
   1874             type = SYMTYPE
   1875             linkname = os.readlink(name)
   1876         elif stat.S_ISCHR(stmd):
   1877             type = CHRTYPE
   1878         elif stat.S_ISBLK(stmd):
   1879             type = BLKTYPE
   1880         else:
   1881             return None
   1882 
   1883         # Fill the TarInfo object with all
   1884         # information we can get.
   1885         tarinfo.name = arcname
   1886         tarinfo.mode = stmd
   1887         tarinfo.uid = statres.st_uid
   1888         tarinfo.gid = statres.st_gid
   1889         if type == REGTYPE:
   1890             tarinfo.size = statres.st_size
   1891         else:
   1892             tarinfo.size = 0L
   1893         tarinfo.mtime = statres.st_mtime
   1894         tarinfo.type = type
   1895         tarinfo.linkname = linkname
   1896         if pwd:
   1897             try:
   1898                 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
   1899             except KeyError:
   1900                 pass
   1901         if grp:
   1902             try:
   1903                 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
   1904             except KeyError:
   1905                 pass
   1906 
   1907         if type in (CHRTYPE, BLKTYPE):
   1908             if hasattr(os, "major") and hasattr(os, "minor"):
   1909                 tarinfo.devmajor = os.major(statres.st_rdev)
   1910                 tarinfo.devminor = os.minor(statres.st_rdev)
   1911         return tarinfo
   1912 
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced.
        """
        self._check()

        # One output line per member. Every print below ends in a comma so
        # that the fields stay on the same line; the bare print at the end
        # of the loop body terminates the line.
        for tarinfo in self:
            if verbose:
                print filemode(tarinfo.mode),
                # Owner and group: the names are preferred, the numeric
                # ids are the fallback when a name is empty.
                print "%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid),
                # Device nodes show "major,minor" in the column where all
                # other members show their size.
                if tarinfo.ischr() or tarinfo.isblk():
                    print "%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)),
                else:
                    print "%10d" % tarinfo.size,
                # Modification time, rendered in local time.
                print "%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6],

            # Directories are marked with a trailing slash.
            print tarinfo.name + ("/" if tarinfo.isdir() else ""),

            if verbose:
                if tarinfo.issym():
                    print "->", tarinfo.linkname,
                if tarinfo.islnk():
                    print "link to", tarinfo.linkname,
            print
   1941 
   1942     def add(self, name, arcname=None, recursive=True, exclude=None, filter=None):
   1943         """Add the file `name' to the archive. `name' may be any type of file
   1944            (directory, fifo, symbolic link, etc.). If given, `arcname'
   1945            specifies an alternative name for the file in the archive.
   1946            Directories are added recursively by default. This can be avoided by
   1947            setting `recursive' to False. `exclude' is a function that should
   1948            return True for each filename to be excluded. `filter' is a function
   1949            that expects a TarInfo object argument and returns the changed
   1950            TarInfo object, if it returns None the TarInfo object will be
   1951            excluded from the archive.
   1952         """
   1953         self._check("aw")
   1954 
   1955         if arcname is None:
   1956             arcname = name
   1957 
   1958         # Exclude pathnames.
   1959         if exclude is not None:
   1960             import warnings
   1961             warnings.warn("use the filter argument instead",
   1962                     DeprecationWarning, 2)
   1963             if exclude(name):
   1964                 self._dbg(2, "tarfile: Excluded %r" % name)
   1965                 return
   1966 
   1967         # Skip if somebody tries to archive the archive...
   1968         if self.name is not None and os.path.abspath(name) == self.name:
   1969             self._dbg(2, "tarfile: Skipped %r" % name)
   1970             return
   1971 
   1972         self._dbg(1, name)
   1973 
   1974         # Create a TarInfo object from the file.
   1975         tarinfo = self.gettarinfo(name, arcname)
   1976 
   1977         if tarinfo is None:
   1978             self._dbg(1, "tarfile: Unsupported type %r" % name)
   1979             return
   1980 
   1981         # Change or exclude the TarInfo object.
   1982         if filter is not None:
   1983             tarinfo = filter(tarinfo)
   1984             if tarinfo is None:
   1985                 self._dbg(2, "tarfile: Excluded %r" % name)
   1986                 return
   1987 
   1988         # Append the tar header and data to the archive.
   1989         if tarinfo.isreg():
   1990             with bltn_open(name, "rb") as f:
   1991                 self.addfile(tarinfo, f)
   1992 
   1993         elif tarinfo.isdir():
   1994             self.addfile(tarinfo)
   1995             if recursive:
   1996                 for f in os.listdir(name):
   1997                     self.add(os.path.join(name, f), os.path.join(arcname, f),
   1998                             recursive, exclude, filter)
   1999 
   2000         else:
   2001             self.addfile(tarinfo)
   2002 
   2003     def addfile(self, tarinfo, fileobj=None):
   2004         """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
   2005            given, tarinfo.size bytes are read from it and added to the archive.
   2006            You can create TarInfo objects using gettarinfo().
   2007            On Windows platforms, `fileobj' should always be opened with mode
   2008            'rb' to avoid irritation about the file size.
   2009         """
   2010         self._check("aw")
   2011 
   2012         tarinfo = copy.copy(tarinfo)
   2013 
   2014         buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
   2015         self.fileobj.write(buf)
   2016         self.offset += len(buf)
   2017 
   2018         # If there's data to follow, append it.
   2019         if fileobj is not None:
   2020             copyfileobj(fileobj, self.fileobj, tarinfo.size)
   2021             blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
   2022             if remainder > 0:
   2023                 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
   2024                 blocks += 1
   2025             self.offset += blocks * BLOCKSIZE
   2026 
   2027         self.members.append(tarinfo)
   2028 
   2029     def extractall(self, path=".", members=None):
   2030         """Extract all members from the archive to the current working
   2031            directory and set owner, modification time and permissions on
   2032            directories afterwards. `path' specifies a different directory
   2033            to extract to. `members' is optional and must be a subset of the
   2034            list returned by getmembers().
   2035         """
   2036         directories = []
   2037 
   2038         if members is None:
   2039             members = self
   2040 
   2041         for tarinfo in members:
   2042             if tarinfo.isdir():
   2043                 # Extract directories with a safe mode.
   2044                 directories.append(tarinfo)
   2045                 tarinfo = copy.copy(tarinfo)
   2046                 tarinfo.mode = 0700
   2047             self.extract(tarinfo, path)
   2048 
   2049         # Reverse sort directories.
   2050         directories.sort(key=operator.attrgetter('name'))
   2051         directories.reverse()
   2052 
   2053         # Set correct owner, mtime and filemode on directories.
   2054         for tarinfo in directories:
   2055             dirpath = os.path.join(path, tarinfo.name)
   2056             try:
   2057                 self.chown(tarinfo, dirpath)
   2058                 self.utime(tarinfo, dirpath)
   2059                 self.chmod(tarinfo, dirpath)
   2060             except ExtractError, e:
   2061                 if self.errorlevel > 1:
   2062                     raise
   2063                 else:
   2064                     self._dbg(1, "tarfile: %s" % e)
   2065 
   2066     def extract(self, member, path=""):
   2067         """Extract a member from the archive to the current working directory,
   2068            using its full name. Its file information is extracted as accurately
   2069            as possible. `member' may be a filename or a TarInfo object. You can
   2070            specify a different directory using `path'.
   2071         """
   2072         self._check("r")
   2073 
   2074         if isinstance(member, basestring):
   2075             tarinfo = self.getmember(member)
   2076         else:
   2077             tarinfo = member
   2078 
   2079         # Prepare the link target for makelink().
   2080         if tarinfo.islnk():
   2081             tarinfo._link_target = os.path.join(path, tarinfo.linkname)
   2082 
   2083         try:
   2084             self._extract_member(tarinfo, os.path.join(path, tarinfo.name))
   2085         except EnvironmentError, e:
   2086             if self.errorlevel > 0:
   2087                 raise
   2088             else:
   2089                 if e.filename is None:
   2090                     self._dbg(1, "tarfile: %s" % e.strerror)
   2091                 else:
   2092                     self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
   2093         except ExtractError, e:
   2094             if self.errorlevel > 1:
   2095                 raise
   2096             else:
   2097                 self._dbg(1, "tarfile: %s" % e)
   2098 
   2099     def extractfile(self, member):
   2100         """Extract a member from the archive as a file object. `member' may be
   2101            a filename or a TarInfo object. If `member' is a regular file, a
   2102            file-like object is returned. If `member' is a link, a file-like
   2103            object is constructed from the link's target. If `member' is none of
   2104            the above, None is returned.
   2105            The file-like object is read-only and provides the following
   2106            methods: read(), readline(), readlines(), seek() and tell()
   2107         """
   2108         self._check("r")
   2109 
   2110         if isinstance(member, basestring):
   2111             tarinfo = self.getmember(member)
   2112         else:
   2113             tarinfo = member
   2114 
   2115         if tarinfo.isreg():
   2116             return self.fileobject(self, tarinfo)
   2117 
   2118         elif tarinfo.type not in SUPPORTED_TYPES:
   2119             # If a member's type is unknown, it is treated as a
   2120             # regular file.
   2121             return self.fileobject(self, tarinfo)
   2122 
   2123         elif tarinfo.islnk() or tarinfo.issym():
   2124             if isinstance(self.fileobj, _Stream):
   2125                 # A small but ugly workaround for the case that someone tries
   2126                 # to extract a (sym)link as a file-object from a non-seekable
   2127                 # stream of tar blocks.
   2128                 raise StreamError("cannot extract (sym)link as file object")
   2129             else:
   2130                 # A (sym)link's file object is its target's file object.
   2131                 return self.extractfile(self._find_link_target(tarinfo))
   2132         else:
   2133             # If there's no data associated with the member (directory, chrdev,
   2134             # blkdev, etc.), return None instead of a file object.
   2135             return None
   2136 
   2137     def _extract_member(self, tarinfo, targetpath):
   2138         """Extract the TarInfo object tarinfo to a physical
   2139            file called targetpath.
   2140         """
   2141         # Fetch the TarInfo object for the given name
   2142         # and build the destination pathname, replacing
   2143         # forward slashes to platform specific separators.
   2144         targetpath = targetpath.rstrip("/")
   2145         targetpath = targetpath.replace("/", os.sep)
   2146 
   2147         # Create all upper directories.
   2148         upperdirs = os.path.dirname(targetpath)
   2149         if upperdirs and not os.path.exists(upperdirs):
   2150             # Create directories that are not part of the archive with
   2151             # default permissions.
   2152             os.makedirs(upperdirs)
   2153 
   2154         if tarinfo.islnk() or tarinfo.issym():
   2155             self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
   2156         else:
   2157             self._dbg(1, tarinfo.name)
   2158 
   2159         if tarinfo.isreg():
   2160             self.makefile(tarinfo, targetpath)
   2161         elif tarinfo.isdir():
   2162             self.makedir(tarinfo, targetpath)
   2163         elif tarinfo.isfifo():
   2164             self.makefifo(tarinfo, targetpath)
   2165         elif tarinfo.ischr() or tarinfo.isblk():
   2166             self.makedev(tarinfo, targetpath)
   2167         elif tarinfo.islnk() or tarinfo.issym():
   2168             self.makelink(tarinfo, targetpath)
   2169         elif tarinfo.type not in SUPPORTED_TYPES:
   2170             self.makeunknown(tarinfo, targetpath)
   2171         else:
   2172             self.makefile(tarinfo, targetpath)
   2173 
   2174         self.chown(tarinfo, targetpath)
   2175         if not tarinfo.issym():
   2176             self.chmod(tarinfo, targetpath)
   2177             self.utime(tarinfo, targetpath)
   2178 
   2179     #--------------------------------------------------------------------------
   2180     # Below are the different file methods. They are called via
   2181     # _extract_member() when extract() is called. They can be replaced in a
   2182     # subclass to implement other functionality.
   2183 
   2184     def makedir(self, tarinfo, targetpath):
   2185         """Make a directory called targetpath.
   2186         """
   2187         try:
   2188             # Use a safe mode for the directory, the real mode is set
   2189             # later in _extract_member().
   2190             os.mkdir(targetpath, 0700)
   2191         except EnvironmentError, e:
   2192             if e.errno != errno.EEXIST:
   2193                 raise
   2194 
   2195     def makefile(self, tarinfo, targetpath):
   2196         """Make a file called targetpath.
   2197         """
   2198         source = self.extractfile(tarinfo)
   2199         try:
   2200             with bltn_open(targetpath, "wb") as target:
   2201                 copyfileobj(source, target)
   2202         finally:
   2203             source.close()
   2204 
   2205     def makeunknown(self, tarinfo, targetpath):
   2206         """Make a file from a TarInfo object with an unknown type
   2207            at targetpath.
   2208         """
   2209         self.makefile(tarinfo, targetpath)
   2210         self._dbg(1, "tarfile: Unknown file type %r, " \
   2211                      "extracted as regular file." % tarinfo.type)
   2212 
   2213     def makefifo(self, tarinfo, targetpath):
   2214         """Make a fifo called targetpath.
   2215         """
   2216         if hasattr(os, "mkfifo"):
   2217             os.mkfifo(targetpath)
   2218         else:
   2219             raise ExtractError("fifo not supported by system")
   2220 
   2221     def makedev(self, tarinfo, targetpath):
   2222         """Make a character or block device called targetpath.
   2223         """
   2224         if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
   2225             raise ExtractError("special devices not supported by system")
   2226 
   2227         mode = tarinfo.mode
   2228         if tarinfo.isblk():
   2229             mode |= stat.S_IFBLK
   2230         else:
   2231             mode |= stat.S_IFCHR
   2232 
   2233         os.mknod(targetpath, mode,
   2234                  os.makedev(tarinfo.devmajor, tarinfo.devminor))
   2235 
   2236     def makelink(self, tarinfo, targetpath):
   2237         """Make a (symbolic) link called targetpath. If it cannot be created
   2238           (platform limitation), we try to make a copy of the referenced file
   2239           instead of a link.
   2240         """
   2241         if hasattr(os, "symlink") and hasattr(os, "link"):
   2242             # For systems that support symbolic and hard links.
   2243             if tarinfo.issym():
   2244                 if os.path.lexists(targetpath):
   2245                     os.unlink(targetpath)
   2246                 os.symlink(tarinfo.linkname, targetpath)
   2247             else:
   2248                 # See extract().
   2249                 if os.path.exists(tarinfo._link_target):
   2250                     if os.path.lexists(targetpath):
   2251                         os.unlink(targetpath)
   2252                     os.link(tarinfo._link_target, targetpath)
   2253                 else:
   2254                     self._extract_member(self._find_link_target(tarinfo), targetpath)
   2255         else:
   2256             try:
   2257                 self._extract_member(self._find_link_target(tarinfo), targetpath)
   2258             except KeyError:
   2259                 raise ExtractError("unable to resolve link inside archive")
   2260 
   2261     def chown(self, tarinfo, targetpath):
   2262         """Set owner of targetpath according to tarinfo.
   2263         """
   2264         if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
   2265             # We have to be root to do so.
   2266             try:
   2267                 g = grp.getgrnam(tarinfo.gname)[2]
   2268             except KeyError:
   2269                 g = tarinfo.gid
   2270             try:
   2271                 u = pwd.getpwnam(tarinfo.uname)[2]
   2272             except KeyError:
   2273                 u = tarinfo.uid
   2274             try:
   2275                 if tarinfo.issym() and hasattr(os, "lchown"):
   2276                     os.lchown(targetpath, u, g)
   2277                 else:
   2278                     if sys.platform != "os2emx":
   2279                         os.chown(targetpath, u, g)
   2280             except EnvironmentError, e:
   2281                 raise ExtractError("could not change owner")
   2282 
   2283     def chmod(self, tarinfo, targetpath):
   2284         """Set file permissions of targetpath according to tarinfo.
   2285         """
   2286         if hasattr(os, 'chmod'):
   2287             try:
   2288                 os.chmod(targetpath, tarinfo.mode)
   2289             except EnvironmentError, e:
   2290                 raise ExtractError("could not change mode")
   2291 
   2292     def utime(self, tarinfo, targetpath):
   2293         """Set modification time of targetpath according to tarinfo.
   2294         """
   2295         if not hasattr(os, 'utime'):
   2296             return
   2297         try:
   2298             os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
   2299         except EnvironmentError, e:
   2300             raise ExtractError("could not change modification time")
   2301 
   2302     #--------------------------------------------------------------------------
   2303     def next(self):
   2304         """Return the next member of the archive as a TarInfo object, when
   2305            TarFile is opened for reading. Return None if there is no more
   2306            available.
   2307         """
   2308         self._check("ra")
   2309         if self.firstmember is not None:
   2310             m = self.firstmember
   2311             self.firstmember = None
   2312             return m
   2313 
   2314         # Read the next block.
   2315         self.fileobj.seek(self.offset)
   2316         tarinfo = None
   2317         while True:
   2318             try:
   2319                 tarinfo = self.tarinfo.fromtarfile(self)
   2320             except EOFHeaderError, e:
   2321                 if self.ignore_zeros:
   2322                     self._dbg(2, "0x%X: %s" % (self.offset, e))
   2323                     self.offset += BLOCKSIZE
   2324                     continue
   2325             except InvalidHeaderError, e:
   2326                 if self.ignore_zeros:
   2327                     self._dbg(2, "0x%X: %s" % (self.offset, e))
   2328                     self.offset += BLOCKSIZE
   2329                     continue
   2330                 elif self.offset == 0:
   2331                     raise ReadError(str(e))
   2332             except EmptyHeaderError:
   2333                 if self.offset == 0:
   2334                     raise ReadError("empty file")
   2335             except TruncatedHeaderError, e:
   2336                 if self.offset == 0:
   2337                     raise ReadError(str(e))
   2338             except SubsequentHeaderError, e:
   2339                 raise ReadError(str(e))
   2340             break
   2341 
   2342         if tarinfo is not None:
   2343             self.members.append(tarinfo)
   2344         else:
   2345             self._loaded = True
   2346 
   2347         return tarinfo
   2348 
   2349     #--------------------------------------------------------------------------
   2350     # Little helper methods:
   2351 
   2352     def _getmember(self, name, tarinfo=None, normalize=False):
   2353         """Find an archive member by name from bottom to top.
   2354            If tarinfo is given, it is used as the starting point.
   2355         """
   2356         # Ensure that all members have been loaded.
   2357         members = self.getmembers()
   2358 
   2359         # Limit the member search list up to tarinfo.
   2360         if tarinfo is not None:
   2361             members = members[:members.index(tarinfo)]
   2362 
   2363         if normalize:
   2364             name = os.path.normpath(name)
   2365 
   2366         for member in reversed(members):
   2367             if normalize:
   2368                 member_name = os.path.normpath(member.name)
   2369             else:
   2370                 member_name = member.name
   2371 
   2372             if name == member_name:
   2373                 return member
   2374 
   2375     def _load(self):
   2376         """Read through the entire archive file and look for readable
   2377            members.
   2378         """
   2379         while True:
   2380             tarinfo = self.next()
   2381             if tarinfo is None:
   2382                 break
   2383         self._loaded = True
   2384 
   2385     def _check(self, mode=None):
   2386         """Check if TarFile is still open, and if the operation's mode
   2387            corresponds to TarFile's mode.
   2388         """
   2389         if self.closed:
   2390             raise IOError("%s is closed" % self.__class__.__name__)
   2391         if mode is not None and self.mode not in mode:
   2392             raise IOError("bad operation for mode %r" % self.mode)
   2393 
   2394     def _find_link_target(self, tarinfo):
   2395         """Find the target member of a symlink or hardlink member in the
   2396            archive.
   2397         """
   2398         if tarinfo.issym():
   2399             # Always search the entire archive.
   2400             linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
   2401             limit = None
   2402         else:
   2403             # Search the archive before the link, because a hard link is
   2404             # just a reference to an already archived file.
   2405             linkname = tarinfo.linkname
   2406             limit = tarinfo
   2407 
   2408         member = self._getmember(linkname, tarinfo=limit, normalize=True)
   2409         if member is None:
   2410             raise KeyError("linkname %r not found" % linkname)
   2411         return member
   2412 
   2413     def __iter__(self):
   2414         """Provide an iterator object.
   2415         """
   2416         if self._loaded:
   2417             return iter(self.members)
   2418         else:
   2419             return TarIter(self)
   2420 
   2421     def _dbg(self, level, msg):
   2422         """Write debugging output to sys.stderr.
   2423         """
   2424         if level <= self.debug:
   2425             print >> sys.stderr, msg
   2426 
   2427     def __enter__(self):
   2428         self._check()
   2429         return self
   2430 
   2431     def __exit__(self, type, value, traceback):
   2432         if type is None:
   2433             self.close()
   2434         else:
   2435             # An exception occurred. We must not call close() because
   2436             # it would try to write end-of-archive blocks and padding.
   2437             if not self._extfileobj:
   2438                 self.fileobj.close()
   2439             self.closed = True
   2440 # class TarFile
   2441 
   2442 class TarIter:
   2443     """Iterator Class.
   2444 
   2445        for tarinfo in TarFile(...):
   2446            suite...
   2447     """
   2448 
   2449     def __init__(self, tarfile):
   2450         """Construct a TarIter object.
   2451         """
   2452         self.tarfile = tarfile
   2453         self.index = 0
   2454     def __iter__(self):
   2455         """Return iterator object.
   2456         """
   2457         return self
   2458     def next(self):
   2459         """Return the next item using TarFile's next() method.
   2460            When all members have been read, set TarFile as _loaded.
   2461         """
   2462         # Fix for SF #1100429: Under rare circumstances it can
   2463         # happen that getmembers() is called during iteration,
   2464         # which will cause TarIter to stop prematurely.
   2465 
   2466         if self.index == 0 and self.tarfile.firstmember is not None:
   2467             tarinfo = self.tarfile.next()
   2468         elif self.index < len(self.tarfile.members):
   2469             tarinfo = self.tarfile.members[self.index]
   2470         elif not self.tarfile._loaded:
   2471             tarinfo = self.tarfile.next()
   2472             if not tarinfo:
   2473                 self.tarfile._loaded = True
   2474                 raise StopIteration
   2475         else:
   2476             raise StopIteration
   2477         self.index += 1
   2478         return tarinfo
   2479 
   2480 # Helper classes for sparse file support
   2481 class _section:
   2482     """Base class for _data and _hole.
   2483     """
   2484     def __init__(self, offset, size):
   2485         self.offset = offset
   2486         self.size = size
   2487     def __contains__(self, offset):
   2488         return self.offset <= offset < self.offset + self.size
   2489 
   2490 class _data(_section):
   2491     """Represent a data section in a sparse file.
   2492     """
   2493     def __init__(self, offset, size, realpos):
   2494         _section.__init__(self, offset, size)
   2495         self.realpos = realpos
   2496 
   2497 class _hole(_section):
   2498     """Represent a hole section in a sparse file.
   2499     """
   2500     pass
   2501 
   2502 class _ringbuffer(list):
   2503     """Ringbuffer class which increases performance
   2504        over a regular list.
   2505     """
   2506     def __init__(self):
   2507         self.idx = 0
   2508     def find(self, offset):
   2509         idx = self.idx
   2510         while True:
   2511             item = self[idx]
   2512             if offset in item:
   2513                 break
   2514             idx += 1
   2515             if idx == len(self):
   2516                 idx = 0
   2517             if idx == self.idx:
   2518                 # End of File
   2519                 return None
   2520         self.idx = idx
   2521         return item
   2522 
   2523 #---------------------------------------------
   2524 # zipfile compatible TarFile class
   2525 #---------------------------------------------
   2526 TAR_PLAIN = 0           # zipfile.ZIP_STORED
   2527 TAR_GZIPPED = 8         # zipfile.ZIP_DEFLATED
   2528 class TarFileCompat:
   2529     """TarFile class compatible with standard module zipfile's
   2530        ZipFile class.
   2531     """
   2532     def __init__(self, file, mode="r", compression=TAR_PLAIN):
   2533         from warnings import warnpy3k
   2534         warnpy3k("the TarFileCompat class has been removed in Python 3.0",
   2535                 stacklevel=2)
   2536         if compression == TAR_PLAIN:
   2537             self.tarfile = TarFile.taropen(file, mode)
   2538         elif compression == TAR_GZIPPED:
   2539             self.tarfile = TarFile.gzopen(file, mode)
   2540         else:
   2541             raise ValueError("unknown compression constant")
   2542         if mode[0:1] == "r":
   2543             members = self.tarfile.getmembers()
   2544             for m in members:
   2545                 m.filename = m.name
   2546                 m.file_size = m.size
   2547                 m.date_time = time.gmtime(m.mtime)[:6]
   2548     def namelist(self):
   2549         return map(lambda m: m.name, self.infolist())
   2550     def infolist(self):
   2551         return filter(lambda m: m.type in REGULAR_TYPES,
   2552                       self.tarfile.getmembers())
   2553     def printdir(self):
   2554         self.tarfile.list()
   2555     def testzip(self):
   2556         return
   2557     def getinfo(self, name):
   2558         return self.tarfile.getmember(name)
   2559     def read(self, name):
   2560         return self.tarfile.extractfile(self.tarfile.getmember(name)).read()
   2561     def write(self, filename, arcname=None, compress_type=None):
   2562         self.tarfile.add(filename, arcname)
   2563     def writestr(self, zinfo, bytes):
   2564         try:
   2565             from cStringIO import StringIO
   2566         except ImportError:
   2567             from StringIO import StringIO
   2568         import calendar
   2569         tinfo = TarInfo(zinfo.filename)
   2570         tinfo.size = len(bytes)
   2571         tinfo.mtime = calendar.timegm(zinfo.date_time)
   2572         self.tarfile.addfile(tinfo, StringIO(bytes))
   2573     def close(self):
   2574         self.tarfile.close()
   2575 #class TarFileCompat
   2576 
   2577 #--------------------
   2578 # exported functions
   2579 #--------------------
   2580 def is_tarfile(name):
   2581     """Return True if name points to a tar archive that we
   2582        are able to handle, else return False.
   2583     """
   2584     try:
   2585         t = open(name)
   2586         t.close()
   2587         return True
   2588     except TarError:
   2589         return False
   2590 
# Keep whatever the name `open' refers to at this point reachable as
# bltn_open, because the next statement rebinds the module-level name.
bltn_open = open
# From here on a module-level open() call, such as the one in is_tarfile(),
# resolves to TarFile.open.
open = TarFile.open
   2593