Home | History | Annotate | Download | only in Lib
      1 # -*- coding: iso-8859-1 -*-
      2 #-------------------------------------------------------------------
      3 # tarfile.py
      4 #-------------------------------------------------------------------
      5 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
      6 # All rights reserved.
      7 #
      8 # Permission  is  hereby granted,  free  of charge,  to  any person
      9 # obtaining a  copy of  this software  and associated documentation
     10 # files  (the  "Software"),  to   deal  in  the  Software   without
     11 # restriction,  including  without limitation  the  rights to  use,
     12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
     13 # copies  of  the  Software,  and to  permit  persons  to  whom the
     14 # Software  is  furnished  to  do  so,  subject  to  the  following
     15 # conditions:
     16 #
     17 # The above copyright  notice and this  permission notice shall  be
     18 # included in all copies or substantial portions of the Software.
     19 #
     20 # THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
     21 # EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
     22 # OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
     23 # NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
     24 # HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
     25 # WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
     26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     27 # OTHER DEALINGS IN THE SOFTWARE.
     28 #
     29 """Read from and write to tar format archives.
     30 """
     31 
     32 __version__ = "$Revision: 85213 $"
     33 # $Source$
     34 
     35 version     = "0.9.0"
     36 __author__  = "Lars Gustbel (lars (at] gustaebel.de)"
     37 __date__    = "$Date$"
     38 __cvsid__   = "$Id$"
     39 __credits__ = "Gustavo Niemeyer, Niels Gustbel, Richard Townsend."
     40 
     41 #---------
     42 # Imports
     43 #---------
     44 from __builtin__ import open as bltn_open
     45 import sys
     46 import os
     47 import shutil
     48 import stat
     49 import errno
     50 import time
     51 import struct
     52 import copy
     53 import re
     54 import operator
     55 
     56 try:
     57     import grp, pwd
     58 except ImportError:
     59     grp = pwd = None
     60 
# Names exported by "from tarfile import *".
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]

#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = "\0"                      # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records (20 blocks)
GNU_MAGIC = "ustar  \0"         # magic gnu tar string
POSIX_MAGIC = "ustar\x0000"     # magic posix tar string: "ustar", NUL, "00"

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Single-character member type codes.
REGTYPE = "0"                   # regular file
AREGTYPE = "\0"                 # regular file (NUL type code)
LNKTYPE = "1"                   # link (inside tarfile)
SYMTYPE = "2"                   # symbolic link
CHRTYPE = "3"                   # character special device
BLKTYPE = "4"                   # block special device
DIRTYPE = "5"                   # directory
FIFOTYPE = "6"                  # fifo special device
CONTTYPE = "7"                  # contiguous file

GNUTYPE_LONGNAME = "L"          # GNU tar longname
GNUTYPE_LONGLINK = "K"          # GNU tar longlink
GNUTYPE_SPARSE = "S"            # GNU tar sparse file

XHDTYPE = "x"                   # POSIX.1-2001 extended header
XGLTYPE = "g"                   # POSIX.1-2001 global header
SOLARIS_XHDTYPE = "X"           # Solaris extended header

# Archive format selectors; DEFAULT_FORMAT is also the default for the
# "format" argument of itn().
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
     99 
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Each key maps to the callable (float or
# int) that converts the field's value.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
    132 
#---------------------------------------------------------
# Bits used in the mode field, values in octal.
#---------------------------------------------------------
# File type bits.  Several values share bits (S_IFLNK contains
# S_IFREG, S_IFBLK contains S_IFDIR and S_IFCHR), which is why
# filemode_table tests them in this order.
S_IFLNK = 0120000        # symbolic link
S_IFREG = 0100000        # regular file
S_IFBLK = 0060000        # block device
S_IFDIR = 0040000        # directory
S_IFCHR = 0020000        # character device
S_IFIFO = 0010000        # fifo

TSUID   = 04000          # set UID on execution
TSGID   = 02000          # set GID on execution
TSVTX   = 01000          # reserved (rendered as "t"/"T" by filemode_table)

TUREAD  = 0400           # read by owner
TUWRITE = 0200           # write by owner
TUEXEC  = 0100           # execute/search by owner
TGREAD  = 0040           # read by group
TGWRITE = 0020           # write by group
TGEXEC  = 0010           # execute/search by group
TOREAD  = 0004           # read by other
TOWRITE = 0002           # write by other
TOEXEC  = 0001           # execute/search by other

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Module-wide text encoding: the filesystem encoding when the
# interpreter reports one, otherwise the interpreter's default encoding.
ENCODING = sys.getfilesystemencoding()
if ENCODING is None:
    ENCODING = sys.getdefaultencoding()
    163 
    164 #---------------------------------------------------------
    165 # Some useful functions
    166 #---------------------------------------------------------
    167 
    168 def stn(s, length):
    169     """Convert a python string to a null-terminated string buffer.
    170     """
    171     return s[:length] + (length - len(s)) * NUL
    172 
    173 def nts(s):
    174     """Convert a null-terminated string field to a python string.
    175     """
    176     # Use the string up to the first null char.
    177     p = s.find("\0")
    178     if p == -1:
    179         return s
    180     return s[:p]
    181 
    182 def nti(s):
    183     """Convert a number field to a python number.
    184     """
    185     # There are two possible encodings for a number field, see
    186     # itn() below.
    187     if s[0] != chr(0200):
    188         try:
    189             n = int(nts(s).strip() or "0", 8)
    190         except ValueError:
    191             raise InvalidHeaderError("invalid header")
    192     else:
    193         n = 0L
    194         for i in xrange(len(s) - 1):
    195             n <<= 8
    196             n += ord(s[i + 1])
    197     return n
    198 
    199 def itn(n, digits=8, format=DEFAULT_FORMAT):
    200     """Convert a python number to a number field.
    201     """
    202     # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    203     # octal digits followed by a null-byte, this allows values up to
    204     # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    205     # that if necessary. A leading 0200 byte indicates this particular
    206     # encoding, the following digits-1 bytes are a big-endian
    207     # representation. This allows values up to (256**(digits-1))-1.
    208     if 0 <= n < 8 ** (digits - 1):
    209         s = "%0*o" % (digits - 1, n) + NUL
    210     else:
    211         if format != GNU_FORMAT or n >= 256 ** (digits - 1):
    212             raise ValueError("overflow in number field")
    213 
    214         if n < 0:
    215             # XXX We mimic GNU tar's behaviour with negative numbers,
    216             # this could raise OverflowError.
    217             n = struct.unpack("L", struct.pack("l", n))[0]
    218 
    219         s = ""
    220         for i in xrange(digits - 1):
    221             s = chr(n & 0377) + s
    222             n >>= 8
    223         s = chr(0200) + s
    224     return s
    225 
    226 def uts(s, encoding, errors):
    227     """Convert a unicode object to a string.
    228     """
    229     if errors == "utf-8":
    230         # An extra error handler similar to the -o invalid=UTF-8 option
    231         # in POSIX.1-2001. Replace untranslatable characters with their
    232         # UTF-8 representation.
    233         try:
    234             return s.encode(encoding, "strict")
    235         except UnicodeEncodeError:
    236             x = []
    237             for c in s:
    238                 try:
    239                     x.append(c.encode(encoding, "strict"))
    240                 except UnicodeEncodeError:
    241                     x.append(c.encode("utf8"))
    242             return "".join(x)
    243     else:
    244         return s.encode(encoding, errors)
    245 
    246 def calc_chksums(buf):
    247     """Calculate the checksum for a member's header by summing up all
    248        characters except for the chksum field which is treated as if
    249        it was filled with spaces. According to the GNU tar sources,
    250        some tars (Sun and NeXT) calculate chksum with signed char,
    251        which will be different if there are chars in the buffer with
    252        the high bit set. So we calculate two checksums, unsigned and
    253        signed.
    254     """
    255     unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512]))
    256     signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512]))
    257     return unsigned_chksum, signed_chksum
    258 
    259 def copyfileobj(src, dst, length=None):
    260     """Copy length bytes from fileobj src to fileobj dst.
    261        If length is None, copy the entire content.
    262     """
    263     if length == 0:
    264         return
    265     if length is None:
    266         shutil.copyfileobj(src, dst)
    267         return
    268 
    269     BUFSIZE = 16 * 1024
    270     blocks, remainder = divmod(length, BUFSIZE)
    271     for b in xrange(blocks):
    272         buf = src.read(BUFSIZE)
    273         if len(buf) < BUFSIZE:
    274             raise IOError("end of file reached")
    275         dst.write(buf)
    276 
    277     if remainder != 0:
    278         buf = src.read(remainder)
    279         if len(buf) < remainder:
    280             raise IOError("end of file reached")
    281         dst.write(buf)
    282     return
    283 
# Lookup table used by filemode().  There is one inner tuple per
# character of the resulting string; each holds (bits, char) pairs that
# are tried in order, the first pair whose bits are all set wins.  The
# order matters: combined values such as TUEXEC|TSUID must be tried
# before the single bits they contain.
filemode_table = (
    # file type
    ((S_IFLNK,      "l"),
     (S_IFREG,      "-"),
     (S_IFBLK,      "b"),
     (S_IFDIR,      "d"),
     (S_IFCHR,      "c"),
     (S_IFIFO,      "p")),

    # owner permissions
    ((TUREAD,       "r"),),
    ((TUWRITE,      "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID,        "S"),
     (TUEXEC,       "x")),

    # group permissions
    ((TGREAD,       "r"),),
    ((TGWRITE,      "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID,        "S"),
     (TGEXEC,       "x")),

    # permissions for others
    ((TOREAD,       "r"),),
    ((TOWRITE,      "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX,        "T"),
     (TOEXEC,       "x"))
)
    310 
    311 def filemode(mode):
    312     """Convert a file's mode to a string of the form
    313        -rwxrwxrwx.
    314        Used by TarFile.list()
    315     """
    316     perm = []
    317     for table in filemode_table:
    318         for bit, char in table:
    319             if mode & bit == bit:
    320                 perm.append(char)
    321                 break
    322         else:
    323             perm.append("-")
    324     return "".join(perm)
    325 
    326 class TarError(Exception):
    327     """Base exception."""
    328     pass
    329 class ExtractError(TarError):
    330     """General exception for extract errors."""
    331     pass
    332 class ReadError(TarError):
    333     """Exception for unreadable tar archives."""
    334     pass
    335 class CompressionError(TarError):
    336     """Exception for unavailable compression methods."""
    337     pass
    338 class StreamError(TarError):
    339     """Exception for unsupported operations on stream-like TarFiles."""
    340     pass
    341 class HeaderError(TarError):
    342     """Base exception for header errors."""
    343     pass
    344 class EmptyHeaderError(HeaderError):
    345     """Exception for empty headers."""
    346     pass
    347 class TruncatedHeaderError(HeaderError):
    348     """Exception for truncated headers."""
    349     pass
    350 class EOFHeaderError(HeaderError):
    351     """Exception for end of file headers."""
    352     pass
    353 class InvalidHeaderError(HeaderError):
    354     """Exception for invalid headers."""
    355     pass
    356 class SubsequentHeaderError(HeaderError):
    357     """Exception for missing and invalid extended headers."""
    358     pass
    359 
    360 #---------------------------
    361 # internal stream interface
    362 #---------------------------
    363 class _LowLevelFile:
    364     """Low-level file object. Supports reading and writing.
    365        It is used instead of a regular file object for streaming
    366        access.
    367     """
    368 
    369     def __init__(self, name, mode):
    370         mode = {
    371             "r": os.O_RDONLY,
    372             "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
    373         }[mode]
    374         if hasattr(os, "O_BINARY"):
    375             mode |= os.O_BINARY
    376         self.fd = os.open(name, mode, 0666)
    377 
    378     def close(self):
    379         os.close(self.fd)
    380 
    381     def read(self, size):
    382         return os.read(self.fd, size)
    383 
    384     def write(self, s):
    385         os.write(self.fd, s)
    386 
    387 class _Stream:
    388     """Class that serves as an adapter between TarFile and
    389        a stream-like object.  The stream-like object only
    390        needs to have a read() or write() method and is accessed
    391        blockwise.  Use of gzip or bzip2 compression is possible.
    392        A stream-like object could be for example: sys.stdin,
    393        sys.stdout, a socket, a tape device etc.
    394 
    395        _Stream is intended to be used only internally.
    396     """
    397 
    398     def __init__(self, name, mode, comptype, fileobj, bufsize):
    399         """Construct a _Stream object.
    400         """
    401         self._extfileobj = True
    402         if fileobj is None:
    403             fileobj = _LowLevelFile(name, mode)
    404             self._extfileobj = False
    405 
    406         if comptype == '*':
    407             # Enable transparent compression detection for the
    408             # stream interface
    409             fileobj = _StreamProxy(fileobj)
    410             comptype = fileobj.getcomptype()
    411 
    412         self.name     = name or ""
    413         self.mode     = mode
    414         self.comptype = comptype
    415         self.fileobj  = fileobj
    416         self.bufsize  = bufsize
    417         self.buf      = ""
    418         self.pos      = 0L
    419         self.closed   = False
    420 
    421         try:
    422             if comptype == "gz":
    423                 try:
    424                     import zlib
    425                 except ImportError:
    426                     raise CompressionError("zlib module is not available")
    427                 self.zlib = zlib
    428                 self.crc = zlib.crc32("") & 0xffffffffL
    429                 if mode == "r":
    430                     self._init_read_gz()
    431                 else:
    432                     self._init_write_gz()
    433 
    434             elif comptype == "bz2":
    435                 try:
    436                     import bz2
    437                 except ImportError:
    438                     raise CompressionError("bz2 module is not available")
    439                 if mode == "r":
    440                     self.dbuf = ""
    441                     self.cmp = bz2.BZ2Decompressor()
    442                 else:
    443                     self.cmp = bz2.BZ2Compressor()
    444         except:
    445             if not self._extfileobj:
    446                 self.fileobj.close()
    447             self.closed = True
    448             raise
    449 
    450     def __del__(self):
    451         if hasattr(self, "closed") and not self.closed:
    452             self.close()
    453 
    454     def _init_write_gz(self):
    455         """Initialize for writing with gzip compression.
    456         """
    457         self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
    458                                             -self.zlib.MAX_WBITS,
    459                                             self.zlib.DEF_MEM_LEVEL,
    460                                             0)
    461         timestamp = struct.pack("<L", long(time.time()))
    462         self.__write("\037\213\010\010%s\002\377" % timestamp)
    463         if type(self.name) is unicode:
    464             self.name = self.name.encode("iso-8859-1", "replace")
    465         if self.name.endswith(".gz"):
    466             self.name = self.name[:-3]
    467         self.__write(self.name + NUL)
    468 
    469     def write(self, s):
    470         """Write string s to the stream.
    471         """
    472         if self.comptype == "gz":
    473             self.crc = self.zlib.crc32(s, self.crc) & 0xffffffffL
    474         self.pos += len(s)
    475         if self.comptype != "tar":
    476             s = self.cmp.compress(s)
    477         self.__write(s)
    478 
    479     def __write(self, s):
    480         """Write string s to the stream if a whole new block
    481            is ready to be written.
    482         """
    483         self.buf += s
    484         while len(self.buf) > self.bufsize:
    485             self.fileobj.write(self.buf[:self.bufsize])
    486             self.buf = self.buf[self.bufsize:]
    487 
    488     def close(self):
    489         """Close the _Stream object. No operation should be
    490            done on it afterwards.
    491         """
    492         if self.closed:
    493             return
    494 
    495         self.closed = True
    496         try:
    497             if self.mode == "w" and self.comptype != "tar":
    498                 self.buf += self.cmp.flush()
    499 
    500             if self.mode == "w" and self.buf:
    501                 self.fileobj.write(self.buf)
    502                 self.buf = ""
    503                 if self.comptype == "gz":
    504                     # The native zlib crc is an unsigned 32-bit integer, but
    505                     # the Python wrapper implicitly casts that to a signed C
    506                     # long.  So, on a 32-bit box self.crc may "look negative",
    507                     # while the same crc on a 64-bit box may "look positive".
    508                     # To avoid irksome warnings from the `struct` module, force
    509                     # it to look positive on all boxes.
    510                     self.fileobj.write(struct.pack("<L", self.crc & 0xffffffffL))
    511                     self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFFL))
    512         finally:
    513             if not self._extfileobj:
    514                 self.fileobj.close()
    515 
    516     def _init_read_gz(self):
    517         """Initialize for reading a gzip compressed fileobj.
    518         """
    519         self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
    520         self.dbuf = ""
    521 
    522         # taken from gzip.GzipFile with some alterations
    523         if self.__read(2) != "\037\213":
    524             raise ReadError("not a gzip file")
    525         if self.__read(1) != "\010":
    526             raise CompressionError("unsupported compression method")
    527 
    528         flag = ord(self.__read(1))
    529         self.__read(6)
    530 
    531         if flag & 4:
    532             xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
    533             self.read(xlen)
    534         if flag & 8:
    535             while True:
    536                 s = self.__read(1)
    537                 if not s or s == NUL:
    538                     break
    539         if flag & 16:
    540             while True:
    541                 s = self.__read(1)
    542                 if not s or s == NUL:
    543                     break
    544         if flag & 2:
    545             self.__read(2)
    546 
    547     def tell(self):
    548         """Return the stream's file pointer position.
    549         """
    550         return self.pos
    551 
    552     def seek(self, pos=0):
    553         """Set the stream's file pointer to pos. Negative seeking
    554            is forbidden.
    555         """
    556         if pos - self.pos >= 0:
    557             blocks, remainder = divmod(pos - self.pos, self.bufsize)
    558             for i in xrange(blocks):
    559                 self.read(self.bufsize)
    560             self.read(remainder)
    561         else:
    562             raise StreamError("seeking backwards is not allowed")
    563         return self.pos
    564 
    565     def read(self, size=None):
    566         """Return the next size number of bytes from the stream.
    567            If size is not defined, return all bytes of the stream
    568            up to EOF.
    569         """
    570         if size is None:
    571             t = []
    572             while True:
    573                 buf = self._read(self.bufsize)
    574                 if not buf:
    575                     break
    576                 t.append(buf)
    577             buf = "".join(t)
    578         else:
    579             buf = self._read(size)
    580         self.pos += len(buf)
    581         return buf
    582 
    583     def _read(self, size):
    584         """Return size bytes from the stream.
    585         """
    586         if self.comptype == "tar":
    587             return self.__read(size)
    588 
    589         c = len(self.dbuf)
    590         t = [self.dbuf]
    591         while c < size:
    592             buf = self.__read(self.bufsize)
    593             if not buf:
    594                 break
    595             try:
    596                 buf = self.cmp.decompress(buf)
    597             except IOError:
    598                 raise ReadError("invalid compressed data")
    599             t.append(buf)
    600             c += len(buf)
    601         t = "".join(t)
    602         self.dbuf = t[size:]
    603         return t[:size]
    604 
    605     def __read(self, size):
    606         """Return size bytes from stream. If internal buffer is empty,
    607            read another block from the stream.
    608         """
    609         c = len(self.buf)
    610         t = [self.buf]
    611         while c < size:
    612             buf = self.fileobj.read(self.bufsize)
    613             if not buf:
    614                 break
    615             t.append(buf)
    616             c += len(buf)
    617         t = "".join(t)
    618         self.buf = t[size:]
    619         return t[:size]
    620 # class _Stream
    621 
    622 class _StreamProxy(object):
    623     """Small proxy class that enables transparent compression
    624        detection for the Stream interface (mode 'r|*').
    625     """
    626 
    627     def __init__(self, fileobj):
    628         self.fileobj = fileobj
    629         self.buf = self.fileobj.read(BLOCKSIZE)
    630 
    631     def read(self, size):
    632         self.read = self.fileobj.read
    633         return self.buf
    634 
    635     def getcomptype(self):
    636         if self.buf.startswith("\037\213\010"):
    637             return "gz"
    638         if self.buf[0:3] == "BZh" and self.buf[4:10] == "1AY&SY":
    639             return "bz2"
    640         return "tar"
    641 
    642     def close(self):
    643         self.fileobj.close()
    644 # class StreamProxy
    645 
    646 class _BZ2Proxy(object):
    647     """Small proxy class that enables external file object
    648        support for "r:bz2" and "w:bz2" modes. This is actually
    649        a workaround for a limitation in bz2 module's BZ2File
    650        class which (unlike gzip.GzipFile) has no support for
    651        a file object argument.
    652     """
    653 
    654     blocksize = 16 * 1024
    655 
    656     def __init__(self, fileobj, mode):
    657         self.fileobj = fileobj
    658         self.mode = mode
    659         self.name = getattr(self.fileobj, "name", None)
    660         self.init()
    661 
    662     def init(self):
    663         import bz2
    664         self.pos = 0
    665         if self.mode == "r":
    666             self.bz2obj = bz2.BZ2Decompressor()
    667             self.fileobj.seek(0)
    668             self.buf = ""
    669         else:
    670             self.bz2obj = bz2.BZ2Compressor()
    671 
    672     def read(self, size):
    673         b = [self.buf]
    674         x = len(self.buf)
    675         while x < size:
    676             raw = self.fileobj.read(self.blocksize)
    677             if not raw:
    678                 break
    679             data = self.bz2obj.decompress(raw)
    680             b.append(data)
    681             x += len(data)
    682         self.buf = "".join(b)
    683 
    684         buf = self.buf[:size]
    685         self.buf = self.buf[size:]
    686         self.pos += len(buf)
    687         return buf
    688 
    689     def seek(self, pos):
    690         if pos < self.pos:
    691             self.init()
    692         self.read(pos - self.pos)
    693 
    694     def tell(self):
    695         return self.pos
    696 
    697     def write(self, data):
    698         self.pos += len(data)
    699         raw = self.bz2obj.compress(data)
    700         self.fileobj.write(raw)
    701 
    702     def close(self):
    703         if self.mode == "w":
    704             raw = self.bz2obj.flush()
    705             self.fileobj.write(raw)
    706 # class _BZ2Proxy
    707 
    708 #------------------------
    709 # Extraction file object
    710 #------------------------
    711 class _FileInFile(object):
    712     """A thin wrapper around an existing file object that
    713        provides a part of its data as an individual file
    714        object.
    715     """
    716 
    717     def __init__(self, fileobj, offset, size, sparse=None):
    718         self.fileobj = fileobj
    719         self.offset = offset
    720         self.size = size
    721         self.sparse = sparse
    722         self.position = 0
    723 
    724     def tell(self):
    725         """Return the current file position.
    726         """
    727         return self.position
    728 
    729     def seek(self, position):
    730         """Seek to a position in the file.
    731         """
    732         self.position = position
    733 
    734     def read(self, size=None):
    735         """Read data from the file.
    736         """
    737         if size is None:
    738             size = self.size - self.position
    739         else:
    740             size = min(size, self.size - self.position)
    741 
    742         if self.sparse is None:
    743             return self.readnormal(size)
    744         else:
    745             return self.readsparse(size)
    746 
    747     def __read(self, size):
    748         buf = self.fileobj.read(size)
    749         if len(buf) != size:
    750             raise ReadError("unexpected end of data")
    751         return buf
    752 
    753     def readnormal(self, size):
    754         """Read operation for regular files.
    755         """
    756         self.fileobj.seek(self.offset + self.position)
    757         self.position += size
    758         return self.__read(size)
    759 
    760     def readsparse(self, size):
    761         """Read operation for sparse files.
    762         """
    763         data = []
    764         while size > 0:
    765             buf = self.readsparsesection(size)
    766             if not buf:
    767                 break
    768             size -= len(buf)
    769             data.append(buf)
    770         return "".join(data)
    771 
    772     def readsparsesection(self, size):
    773         """Read a single section of a sparse file.
    774         """
    775         section = self.sparse.find(self.position)
    776 
    777         if section is None:
    778             return ""
    779 
    780         size = min(size, section.offset + section.size - self.position)
    781 
    782         if isinstance(section, _data):
    783             realpos = section.realpos + self.position - section.offset
    784             self.fileobj.seek(self.offset + realpos)
    785             self.position += size
    786             return self.__read(size)
    787         else:
    788             self.position += size
    789             return NUL * size
    790 #class _FileInFile
    791 
    792 
    793 class ExFileObject(object):
    794     """File-like object for reading an archive member.
    795        Is returned by TarFile.extractfile().
    796     """
    797     blocksize = 1024
    798 
    def __init__(self, tarfile, tarinfo):
        """Set up a read-only file object for the archive member
           tarinfo whose data is taken from tarfile.fileobj.
        """
        # Restrict access to the member's data area. A sparse map is
        # handed over only if tarinfo carries one.
        self.fileobj = _FileInFile(fileobj=tarfile.fileobj,
                                   offset=tarinfo.offset_data,
                                   size=tarinfo.size,
                                   sparse=getattr(tarinfo, "sparse", None))

        self.name = tarinfo.name
        self.size = tarinfo.size
        self.mode = "r"
        self.closed = False

        # Read state: current position and data that was read ahead.
        self.position = 0
        self.buffer = ""
    811 
    812     def read(self, size=None):
    813         """Read at most size bytes from the file. If size is not
    814            present or None, read all data until EOF is reached.
    815         """
    816         if self.closed:
    817             raise ValueError("I/O operation on closed file")
    818 
    819         buf = ""
    820         if self.buffer:
    821             if size is None:
    822                 buf = self.buffer
    823                 self.buffer = ""
    824             else:
    825                 buf = self.buffer[:size]
    826                 self.buffer = self.buffer[size:]
    827 
    828         if size is None:
    829             buf += self.fileobj.read()
    830         else:
    831             buf += self.fileobj.read(size - len(buf))
    832 
    833         self.position += len(buf)
    834         return buf
    835 
    836     def readline(self, size=-1):
    837         """Read one entire line from the file. If size is present
    838            and non-negative, return a string with at most that
    839            size, which may be an incomplete line.
    840         """
    841         if self.closed:
    842             raise ValueError("I/O operation on closed file")
    843 
    844         if "\n" in self.buffer:
    845             pos = self.buffer.find("\n") + 1
    846         else:
    847             buffers = [self.buffer]
    848             while True:
    849                 buf = self.fileobj.read(self.blocksize)
    850                 buffers.append(buf)
    851                 if not buf or "\n" in buf:
    852                     self.buffer = "".join(buffers)
    853                     pos = self.buffer.find("\n") + 1
    854                     if pos == 0:
    855                         # no newline found.
    856                         pos = len(self.buffer)
    857                     break
    858 
    859         if size != -1:
    860             pos = min(size, pos)
    861 
    862         buf = self.buffer[:pos]
    863         self.buffer = self.buffer[pos:]
    864         self.position += len(buf)
    865         return buf
    866 
    867     def readlines(self):
    868         """Return a list with all remaining lines.
    869         """
    870         result = []
    871         while True:
    872             line = self.readline()
    873             if not line: break
    874             result.append(line)
    875         return result
    876 
    877     def tell(self):
    878         """Return the current file position.
    879         """
    880         if self.closed:
    881             raise ValueError("I/O operation on closed file")
    882 
    883         return self.position
    884 
    885     def seek(self, pos, whence=os.SEEK_SET):
    886         """Seek to a position in the file.
    887         """
    888         if self.closed:
    889             raise ValueError("I/O operation on closed file")
    890 
    891         if whence == os.SEEK_SET:
    892             self.position = min(max(pos, 0), self.size)
    893         elif whence == os.SEEK_CUR:
    894             if pos < 0:
    895                 self.position = max(self.position + pos, 0)
    896             else:
    897                 self.position = min(self.position + pos, self.size)
    898         elif whence == os.SEEK_END:
    899             self.position = max(min(self.size + pos, self.size), 0)
    900         else:
    901             raise ValueError("Invalid argument")
    902 
    903         self.buffer = ""
    904         self.fileobj.seek(self.position)
    905 
    906     def close(self):
    907         """Close the file object.
    908         """
    909         self.closed = True
    910 
    911     def __iter__(self):
    912         """Get an iterator over the file's lines.
    913         """
    914         while True:
    915             line = self.readline()
    916             if not line:
    917                 break
    918             yield line
    919 #class ExFileObject
    920 
    921 #------------------
    922 # Exported Classes
    923 #------------------
    924 class TarInfo(object):
    925     """Informational class which holds the details about an
    926        archive member given by a tar header block.
    927        TarInfo objects are returned by TarFile.getmember(),
    928        TarFile.getmembers() and TarFile.gettarinfo() and are
    929        usually created internally.
    930     """
    931 
    932     def __init__(self, name=""):
    933         """Construct a TarInfo object. name is the optional name
    934            of the member.
    935         """
    936         self.name = name        # member name
    937         self.mode = 0644        # file permissions
    938         self.uid = 0            # user id
    939         self.gid = 0            # group id
    940         self.size = 0           # file size
    941         self.mtime = 0          # modification time
    942         self.chksum = 0         # header checksum
    943         self.type = REGTYPE     # member type
    944         self.linkname = ""      # link name
    945         self.uname = ""         # user name
    946         self.gname = ""         # group name
    947         self.devmajor = 0       # device major number
    948         self.devminor = 0       # device minor number
    949 
    950         self.offset = 0         # the tar header starts here
    951         self.offset_data = 0    # the file's data starts here
    952 
    953         self.pax_headers = {}   # pax header information
    954 
    # In pax headers the "name" and "linkname" fields are called
    # "path" and "linkpath". The two properties below expose the
    # name and linkname attributes under these pax names.
    def _getpath(self):
        # Getter of the path property: returns the name attribute.
        return self.name
    def _setpath(self, name):
        # Setter of the path property: stores into the name attribute.
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        # Getter of the linkpath property: returns the linkname attribute.
        return self.linkname
    def _setlinkpath(self, linkname):
        # Setter of the linkpath property: stores into the linkname attribute.
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)
    968 
    969     def __repr__(self):
    970         return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
    971 
    972     def get_info(self, encoding, errors):
    973         """Return the TarInfo's attributes as a dictionary.
    974         """
    975         info = {
    976             "name":     self.name,
    977             "mode":     self.mode & 07777,
    978             "uid":      self.uid,
    979             "gid":      self.gid,
    980             "size":     self.size,
    981             "mtime":    self.mtime,
    982             "chksum":   self.chksum,
    983             "type":     self.type,
    984             "linkname": self.linkname,
    985             "uname":    self.uname,
    986             "gname":    self.gname,
    987             "devmajor": self.devmajor,
    988             "devminor": self.devminor
    989         }
    990 
    991         if info["type"] == DIRTYPE and not info["name"].endswith("/"):
    992             info["name"] += "/"
    993 
    994         for key in ("name", "linkname", "uname", "gname"):
    995             if type(info[key]) is unicode:
    996                 info[key] = info[key].encode(encoding, errors)
    997 
    998         return info
    999 
   1000     def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"):
   1001         """Return a tar header as a string of 512 byte blocks.
   1002         """
   1003         info = self.get_info(encoding, errors)
   1004 
   1005         if format == USTAR_FORMAT:
   1006             return self.create_ustar_header(info)
   1007         elif format == GNU_FORMAT:
   1008             return self.create_gnu_header(info)
   1009         elif format == PAX_FORMAT:
   1010             return self.create_pax_header(info, encoding, errors)
   1011         else:
   1012             raise ValueError("invalid format")
   1013 
   1014     def create_ustar_header(self, info):
   1015         """Return the object as a ustar header block.
   1016         """
   1017         info["magic"] = POSIX_MAGIC
   1018 
   1019         if len(info["linkname"]) > LENGTH_LINK:
   1020             raise ValueError("linkname is too long")
   1021 
   1022         if len(info["name"]) > LENGTH_NAME:
   1023             info["prefix"], info["name"] = self._posix_split_name(info["name"])
   1024 
   1025         return self._create_header(info, USTAR_FORMAT)
   1026 
   1027     def create_gnu_header(self, info):
   1028         """Return the object as a GNU header block sequence.
   1029         """
   1030         info["magic"] = GNU_MAGIC
   1031 
   1032         buf = ""
   1033         if len(info["linkname"]) > LENGTH_LINK:
   1034             buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK)
   1035 
   1036         if len(info["name"]) > LENGTH_NAME:
   1037             buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME)
   1038 
   1039         return buf + self._create_header(info, GNU_FORMAT)
   1040 
   1041     def create_pax_header(self, info, encoding, errors):
   1042         """Return the object as a ustar header block. If it cannot be
   1043            represented this way, prepend a pax extended header sequence
   1044            with supplement information.
   1045         """
   1046         info["magic"] = POSIX_MAGIC
   1047         pax_headers = self.pax_headers.copy()
   1048 
   1049         # Test string fields for values that exceed the field length or cannot
   1050         # be represented in ASCII encoding.
   1051         for name, hname, length in (
   1052                 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
   1053                 ("uname", "uname", 32), ("gname", "gname", 32)):
   1054 
   1055             if hname in pax_headers:
   1056                 # The pax header has priority.
   1057                 continue
   1058 
   1059             val = info[name].decode(encoding, errors)
   1060 
   1061             # Try to encode the string as ASCII.
   1062             try:
   1063                 val.encode("ascii")
   1064             except UnicodeEncodeError:
   1065                 pax_headers[hname] = val
   1066                 continue
   1067 
   1068             if len(info[name]) > length:
   1069                 pax_headers[hname] = val
   1070 
   1071         # Test number fields for values that exceed the field limit or values
   1072         # that like to be stored as float.
   1073         for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
   1074             if name in pax_headers:
   1075                 # The pax header has priority. Avoid overflow.
   1076                 info[name] = 0
   1077                 continue
   1078 
   1079             val = info[name]
   1080             if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
   1081                 pax_headers[name] = unicode(val)
   1082                 info[name] = 0
   1083 
   1084         # Create a pax extended header if necessary.
   1085         if pax_headers:
   1086             buf = self._create_pax_generic_header(pax_headers)
   1087         else:
   1088             buf = ""
   1089 
   1090         return buf + self._create_header(info, USTAR_FORMAT)
   1091 
   1092     @classmethod
   1093     def create_pax_global_header(cls, pax_headers):
   1094         """Return the object as a pax global header block sequence.
   1095         """
   1096         return cls._create_pax_generic_header(pax_headers, type=XGLTYPE)
   1097 
   1098     def _posix_split_name(self, name):
   1099         """Split a name longer than 100 chars into a prefix
   1100            and a name part.
   1101         """
   1102         prefix = name[:LENGTH_PREFIX + 1]
   1103         while prefix and prefix[-1] != "/":
   1104             prefix = prefix[:-1]
   1105 
   1106         name = name[len(prefix):]
   1107         prefix = prefix[:-1]
   1108 
   1109         if not prefix or len(name) > LENGTH_NAME:
   1110             raise ValueError("name is too long")
   1111         return prefix, name
   1112 
   1113     @staticmethod
   1114     def _create_header(info, format):
   1115         """Return a header block. info is a dictionary with file
   1116            information, format must be one of the *_FORMAT constants.
   1117         """
   1118         parts = [
   1119             stn(info.get("name", ""), 100),
   1120             itn(info.get("mode", 0) & 07777, 8, format),
   1121             itn(info.get("uid", 0), 8, format),
   1122             itn(info.get("gid", 0), 8, format),
   1123             itn(info.get("size", 0), 12, format),
   1124             itn(info.get("mtime", 0), 12, format),
   1125             "        ", # checksum field
   1126             info.get("type", REGTYPE),
   1127             stn(info.get("linkname", ""), 100),
   1128             stn(info.get("magic", POSIX_MAGIC), 8),
   1129             stn(info.get("uname", ""), 32),
   1130             stn(info.get("gname", ""), 32),
   1131             itn(info.get("devmajor", 0), 8, format),
   1132             itn(info.get("devminor", 0), 8, format),
   1133             stn(info.get("prefix", ""), 155)
   1134         ]
   1135 
   1136         buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts))
   1137         chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
   1138         buf = buf[:-364] + "%06o\0" % chksum + buf[-357:]
   1139         return buf
   1140 
   1141     @staticmethod
   1142     def _create_payload(payload):
   1143         """Return the string payload filled with zero bytes
   1144            up to the next 512 byte border.
   1145         """
   1146         blocks, remainder = divmod(len(payload), BLOCKSIZE)
   1147         if remainder > 0:
   1148             payload += (BLOCKSIZE - remainder) * NUL
   1149         return payload
   1150 
   1151     @classmethod
   1152     def _create_gnu_long_header(cls, name, type):
   1153         """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
   1154            for name.
   1155         """
   1156         name += NUL
   1157 
   1158         info = {}
   1159         info["name"] = "././@LongLink"
   1160         info["type"] = type
   1161         info["size"] = len(name)
   1162         info["magic"] = GNU_MAGIC
   1163 
   1164         # create extended header + name blocks.
   1165         return cls._create_header(info, USTAR_FORMAT) + \
   1166                 cls._create_payload(name)
   1167 
   1168     @classmethod
   1169     def _create_pax_generic_header(cls, pax_headers, type=XHDTYPE):
   1170         """Return a POSIX.1-2001 extended or global header sequence
   1171            that contains a list of keyword, value pairs. The values
   1172            must be unicode objects.
   1173         """
   1174         records = []
   1175         for keyword, value in pax_headers.iteritems():
   1176             keyword = keyword.encode("utf8")
   1177             value = value.encode("utf8")
   1178             l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
   1179             n = p = 0
   1180             while True:
   1181                 n = l + len(str(p))
   1182                 if n == p:
   1183                     break
   1184                 p = n
   1185             records.append("%d %s=%s\n" % (p, keyword, value))
   1186         records = "".join(records)
   1187 
   1188         # We use a hardcoded "././@PaxHeader" name like star does
   1189         # instead of the one that POSIX recommends.
   1190         info = {}
   1191         info["name"] = "././@PaxHeader"
   1192         info["type"] = type
   1193         info["size"] = len(records)
   1194         info["magic"] = POSIX_MAGIC
   1195 
   1196         # Create pax header + record blocks.
   1197         return cls._create_header(info, USTAR_FORMAT) + \
   1198                 cls._create_payload(records)
   1199 
   1200     @classmethod
   1201     def frombuf(cls, buf):
   1202         """Construct a TarInfo object from a 512 byte string buffer.
   1203         """
   1204         if len(buf) == 0:
   1205             raise EmptyHeaderError("empty header")
   1206         if len(buf) != BLOCKSIZE:
   1207             raise TruncatedHeaderError("truncated header")
   1208         if buf.count(NUL) == BLOCKSIZE:
   1209             raise EOFHeaderError("end of file header")
   1210 
   1211         chksum = nti(buf[148:156])
   1212         if chksum not in calc_chksums(buf):
   1213             raise InvalidHeaderError("bad checksum")
   1214 
   1215         obj = cls()
   1216         obj.buf = buf
   1217         obj.name = nts(buf[0:100])
   1218         obj.mode = nti(buf[100:108])
   1219         obj.uid = nti(buf[108:116])
   1220         obj.gid = nti(buf[116:124])
   1221         obj.size = nti(buf[124:136])
   1222         obj.mtime = nti(buf[136:148])
   1223         obj.chksum = chksum
   1224         obj.type = buf[156:157]
   1225         obj.linkname = nts(buf[157:257])
   1226         obj.uname = nts(buf[265:297])
   1227         obj.gname = nts(buf[297:329])
   1228         obj.devmajor = nti(buf[329:337])
   1229         obj.devminor = nti(buf[337:345])
   1230         prefix = nts(buf[345:500])
   1231 
   1232         # Old V7 tar format represents a directory as a regular
   1233         # file with a trailing slash.
   1234         if obj.type == AREGTYPE and obj.name.endswith("/"):
   1235             obj.type = DIRTYPE
   1236 
   1237         # Remove redundant slashes from directories.
   1238         if obj.isdir():
   1239             obj.name = obj.name.rstrip("/")
   1240 
   1241         # Reconstruct a ustar longname.
   1242         if prefix and obj.type not in GNU_TYPES:
   1243             obj.name = prefix + "/" + obj.name
   1244         return obj
   1245 
   1246     @classmethod
   1247     def fromtarfile(cls, tarfile):
   1248         """Return the next TarInfo object from TarFile object
   1249            tarfile.
   1250         """
   1251         buf = tarfile.fileobj.read(BLOCKSIZE)
   1252         obj = cls.frombuf(buf)
   1253         obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
   1254         return obj._proc_member(tarfile)
   1255 
   1256     #--------------------------------------------------------------------------
   1257     # The following are methods that are called depending on the type of a
   1258     # member. The entry point is _proc_member() which can be overridden in a
   1259     # subclass to add custom _proc_*() methods. A _proc_*() method MUST
   1260     # implement the following
   1261     # operations:
   1262     # 1. Set self.offset_data to the position where the data blocks begin,
   1263     #    if there is data that follows.
   1264     # 2. Set tarfile.offset to the position where the next member's header will
   1265     #    begin.
   1266     # 3. Return self or another valid TarInfo object.
   1267     def _proc_member(self, tarfile):
   1268         """Choose the right processing method depending on
   1269            the type and call it.
   1270         """
   1271         if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
   1272             return self._proc_gnulong(tarfile)
   1273         elif self.type == GNUTYPE_SPARSE:
   1274             return self._proc_sparse(tarfile)
   1275         elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
   1276             return self._proc_pax(tarfile)
   1277         else:
   1278             return self._proc_builtin(tarfile)
   1279 
   1280     def _proc_builtin(self, tarfile):
   1281         """Process a builtin type or an unknown type which
   1282            will be treated as a regular file.
   1283         """
   1284         self.offset_data = tarfile.fileobj.tell()
   1285         offset = self.offset_data
   1286         if self.isreg() or self.type not in SUPPORTED_TYPES:
   1287             # Skip the following data blocks.
   1288             offset += self._block(self.size)
   1289         tarfile.offset = offset
   1290 
   1291         # Patch the TarInfo object with saved global
   1292         # header information.
   1293         self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
   1294 
   1295         return self
   1296 
   1297     def _proc_gnulong(self, tarfile):
   1298         """Process the blocks that hold a GNU longname
   1299            or longlink member.
   1300         """
   1301         buf = tarfile.fileobj.read(self._block(self.size))
   1302 
   1303         # Fetch the next header and process it.
   1304         try:
   1305             next = self.fromtarfile(tarfile)
   1306         except HeaderError:
   1307             raise SubsequentHeaderError("missing or bad subsequent header")
   1308 
   1309         # Patch the TarInfo object from the next header with
   1310         # the longname information.
   1311         next.offset = self.offset
   1312         if self.type == GNUTYPE_LONGNAME:
   1313             next.name = nts(buf)
   1314         elif self.type == GNUTYPE_LONGLINK:
   1315             next.linkname = nts(buf)
   1316 
   1317         return next
   1318 
   1319     def _proc_sparse(self, tarfile):
   1320         """Process a GNU sparse header plus extra headers.
   1321         """
   1322         buf = self.buf
   1323         sp = _ringbuffer()
   1324         pos = 386
   1325         lastpos = 0L
   1326         realpos = 0L
   1327         # There are 4 possible sparse structs in the
   1328         # first header.
   1329         for i in xrange(4):
   1330             try:
   1331                 offset = nti(buf[pos:pos + 12])
   1332                 numbytes = nti(buf[pos + 12:pos + 24])
   1333             except ValueError:
   1334                 break
   1335             if offset > lastpos:
   1336                 sp.append(_hole(lastpos, offset - lastpos))
   1337             sp.append(_data(offset, numbytes, realpos))
   1338             realpos += numbytes
   1339             lastpos = offset + numbytes
   1340             pos += 24
   1341 
   1342         isextended = ord(buf[482])
   1343         origsize = nti(buf[483:495])
   1344 
   1345         # If the isextended flag is given,
   1346         # there are extra headers to process.
   1347         while isextended == 1:
   1348             buf = tarfile.fileobj.read(BLOCKSIZE)
   1349             pos = 0
   1350             for i in xrange(21):
   1351                 try:
   1352                     offset = nti(buf[pos:pos + 12])
   1353                     numbytes = nti(buf[pos + 12:pos + 24])
   1354                 except ValueError:
   1355                     break
   1356                 if offset > lastpos:
   1357                     sp.append(_hole(lastpos, offset - lastpos))
   1358                 sp.append(_data(offset, numbytes, realpos))
   1359                 realpos += numbytes
   1360                 lastpos = offset + numbytes
   1361                 pos += 24
   1362             isextended = ord(buf[504])
   1363 
   1364         if lastpos < origsize:
   1365             sp.append(_hole(lastpos, origsize - lastpos))
   1366 
   1367         self.sparse = sp
   1368 
   1369         self.offset_data = tarfile.fileobj.tell()
   1370         tarfile.offset = self.offset_data + self._block(self.size)
   1371         self.size = origsize
   1372 
   1373         return self
   1374 
   1375     def _proc_pax(self, tarfile):
   1376         """Process an extended or global header as described in
   1377            POSIX.1-2001.
   1378         """
   1379         # Read the header information.
   1380         buf = tarfile.fileobj.read(self._block(self.size))
   1381 
   1382         # A pax header stores supplemental information for either
   1383         # the following file (extended) or all following files
   1384         # (global).
   1385         if self.type == XGLTYPE:
   1386             pax_headers = tarfile.pax_headers
   1387         else:
   1388             pax_headers = tarfile.pax_headers.copy()
   1389 
   1390         # Parse pax header information. A record looks like that:
   1391         # "%d %s=%s\n" % (length, keyword, value). length is the size
   1392         # of the complete record including the length field itself and
   1393         # the newline. keyword and value are both UTF-8 encoded strings.
   1394         regex = re.compile(r"(\d+) ([^=]+)=", re.U)
   1395         pos = 0
   1396         while True:
   1397             match = regex.match(buf, pos)
   1398             if not match:
   1399                 break
   1400 
   1401             length, keyword = match.groups()
   1402             length = int(length)
   1403             value = buf[match.end(2) + 1:match.start(1) + length - 1]
   1404 
   1405             keyword = keyword.decode("utf8")
   1406             value = value.decode("utf8")
   1407 
   1408             pax_headers[keyword] = value
   1409             pos += length
   1410 
   1411         # Fetch the next header.
   1412         try:
   1413             next = self.fromtarfile(tarfile)
   1414         except HeaderError:
   1415             raise SubsequentHeaderError("missing or bad subsequent header")
   1416 
   1417         if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
   1418             # Patch the TarInfo object with the extended header info.
   1419             next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
   1420             next.offset = self.offset
   1421 
   1422             if "size" in pax_headers:
   1423                 # If the extended header replaces the size field,
   1424                 # we need to recalculate the offset where the next
   1425                 # header starts.
   1426                 offset = next.offset_data
   1427                 if next.isreg() or next.type not in SUPPORTED_TYPES:
   1428                     offset += next._block(next.size)
   1429                 tarfile.offset = offset
   1430 
   1431         return next
   1432 
   1433     def _apply_pax_info(self, pax_headers, encoding, errors):
   1434         """Replace fields with supplemental information from a previous
   1435            pax extended or global header.
   1436         """
   1437         for keyword, value in pax_headers.iteritems():
   1438             if keyword not in PAX_FIELDS:
   1439                 continue
   1440 
   1441             if keyword == "path":
   1442                 value = value.rstrip("/")
   1443 
   1444             if keyword in PAX_NUMBER_FIELDS:
   1445                 try:
   1446                     value = PAX_NUMBER_FIELDS[keyword](value)
   1447                 except ValueError:
   1448                     value = 0
   1449             else:
   1450                 value = uts(value, encoding, errors)
   1451 
   1452             setattr(self, keyword, value)
   1453 
   1454         self.pax_headers = pax_headers.copy()
   1455 
   1456     def _block(self, count):
   1457         """Round up a byte count by BLOCKSIZE and return it,
   1458            e.g. _block(834) => 1024.
   1459         """
   1460         blocks, remainder = divmod(count, BLOCKSIZE)
   1461         if remainder:
   1462             blocks += 1
   1463         return blocks * BLOCKSIZE
   1464 
    def isreg(self):
        """Return True if the type of the member is in REGULAR_TYPES."""
        return self.type in REGULAR_TYPES
    def isfile(self):
        """Return the result of isreg()."""
        return self.isreg()
    def isdir(self):
        """Return True if the type of the member is DIRTYPE."""
        return self.type == DIRTYPE
    def issym(self):
        """Return True if the type of the member is SYMTYPE."""
        return self.type == SYMTYPE
    def islnk(self):
        """Return True if the type of the member is LNKTYPE."""
        return self.type == LNKTYPE
    def ischr(self):
        """Return True if the type of the member is CHRTYPE."""
        return self.type == CHRTYPE
    def isblk(self):
        """Return True if the type of the member is BLKTYPE."""
        return self.type == BLKTYPE
    def isfifo(self):
        """Return True if the type of the member is FIFOTYPE."""
        return self.type == FIFOTYPE
    def issparse(self):
        """Return True if the type of the member is GNUTYPE_SPARSE."""
        return self.type == GNUTYPE_SPARSE
    def isdev(self):
        """Return True if the type of the member is CHRTYPE, BLKTYPE
           or FIFOTYPE.
        """
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
   1485 # class TarInfo
   1486 
   1487 class TarFile(object):
   1488     """The TarFile Class provides an interface to tar archives.
   1489     """
   1490 
   1491     debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)
   1492 
   1493     dereference = False         # If true, add content of linked file to the
   1494                                 # tar file, else the link.
   1495 
   1496     ignore_zeros = False        # If true, skips empty or invalid blocks and
   1497                                 # continues processing.
   1498 
   1499     errorlevel = 1              # If 0, fatal errors only appear in debug
   1500                                 # messages (if debug >= 0). If > 0, errors
   1501                                 # are passed to the caller as exceptions.
   1502 
   1503     format = DEFAULT_FORMAT     # The format to use when creating an archive.
   1504 
   1505     encoding = ENCODING         # Encoding for 8-bit character strings.
   1506 
   1507     errors = None               # Error handler for unicode conversion.
   1508 
   1509     tarinfo = TarInfo           # The default TarInfo class to use.
   1510 
   1511     fileobject = ExFileObject   # The default ExFileObject class to use.
   1512 
   1513     def __init__(self, name=None, mode="r", fileobj=None, format=None,
   1514             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
   1515             errors=None, pax_headers=None, debug=None, errorlevel=None):
   1516         """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
   1517            read from an existing archive, 'a' to append data to an existing
   1518            file or 'w' to create a new file overwriting an existing one. `mode'
   1519            defaults to 'r'.
   1520            If `fileobj' is given, it is used for reading or writing data. If it
   1521            can be determined, `mode' is overridden by `fileobj's mode.
   1522            `fileobj' is not closed, when TarFile is closed.
   1523         """
   1524         modes = {"r": "rb", "a": "r+b", "w": "wb"}
   1525         if mode not in modes:
   1526             raise ValueError("mode must be 'r', 'a' or 'w'")
   1527         self.mode = mode
   1528         self._mode = modes[mode]
   1529 
   1530         if not fileobj:
   1531             if self.mode == "a" and not os.path.exists(name):
   1532                 # Create nonexistent files in append mode.
   1533                 self.mode = "w"
   1534                 self._mode = "wb"
   1535             fileobj = bltn_open(name, self._mode)
   1536             self._extfileobj = False
   1537         else:
   1538             if name is None and hasattr(fileobj, "name"):
   1539                 name = fileobj.name
   1540             if hasattr(fileobj, "mode"):
   1541                 self._mode = fileobj.mode
   1542             self._extfileobj = True
   1543         self.name = os.path.abspath(name) if name else None
   1544         self.fileobj = fileobj
   1545 
   1546         # Init attributes.
   1547         if format is not None:
   1548             self.format = format
   1549         if tarinfo is not None:
   1550             self.tarinfo = tarinfo
   1551         if dereference is not None:
   1552             self.dereference = dereference
   1553         if ignore_zeros is not None:
   1554             self.ignore_zeros = ignore_zeros
   1555         if encoding is not None:
   1556             self.encoding = encoding
   1557 
   1558         if errors is not None:
   1559             self.errors = errors
   1560         elif mode == "r":
   1561             self.errors = "utf-8"
   1562         else:
   1563             self.errors = "strict"
   1564 
   1565         if pax_headers is not None and self.format == PAX_FORMAT:
   1566             self.pax_headers = pax_headers
   1567         else:
   1568             self.pax_headers = {}
   1569 
   1570         if debug is not None:
   1571             self.debug = debug
   1572         if errorlevel is not None:
   1573             self.errorlevel = errorlevel
   1574 
   1575         # Init datastructures.
   1576         self.closed = False
   1577         self.members = []       # list of members as TarInfo objects
   1578         self._loaded = False    # flag if all members have been read
   1579         self.offset = self.fileobj.tell()
   1580                                 # current position in the archive file
   1581         self.inodes = {}        # dictionary caching the inodes of
   1582                                 # archive members already added
   1583 
   1584         try:
   1585             if self.mode == "r":
   1586                 self.firstmember = None
   1587                 self.firstmember = self.next()
   1588 
   1589             if self.mode == "a":
   1590                 # Move to the end of the archive,
   1591                 # before the first empty block.
   1592                 while True:
   1593                     self.fileobj.seek(self.offset)
   1594                     try:
   1595                         tarinfo = self.tarinfo.fromtarfile(self)
   1596                         self.members.append(tarinfo)
   1597                     except EOFHeaderError:
   1598                         self.fileobj.seek(self.offset)
   1599                         break
   1600                     except HeaderError, e:
   1601                         raise ReadError(str(e))
   1602 
   1603             if self.mode in "aw":
   1604                 self._loaded = True
   1605 
   1606                 if self.pax_headers:
   1607                     buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
   1608                     self.fileobj.write(buf)
   1609                     self.offset += len(buf)
   1610         except:
   1611             if not self._extfileobj:
   1612                 self.fileobj.close()
   1613             self.closed = True
   1614             raise
   1615 
   def _getposix(self):
       # Getter of the deprecated `posix' property: true exactly when the
       # archive format is USTAR_FORMAT.
       uses_ustar = self.format == USTAR_FORMAT
       return uses_ustar

   def _setposix(self, value):
       # Setter of the deprecated `posix' property. A true `value' selects
       # USTAR_FORMAT, a false one GNU_FORMAT. The stacklevel of 2 makes
       # the DeprecationWarning point at the code doing the assignment.
       import warnings
       warnings.warn("use the format attribute instead",
                     DeprecationWarning, 2)
       self.format = USTAR_FORMAT if value else GNU_FORMAT

   posix = property(_getposix, _setposix)
   1627 
   1628     #--------------------------------------------------------------------------
   1629     # Below are the classmethods which act as alternate constructors to the
   1630     # TarFile class. The open() method is the only one that is needed for
   1631     # public use; it is the "super"-constructor and is able to select an
   1632     # adequate "sub"-constructor for a particular compression using the mapping
   1633     # from OPEN_METH.
   1634     #
   1635     # This concept allows one to subclass TarFile without losing the comfort of
   1636     # the super-constructor. A sub-constructor is registered and made available
   1637     # by adding it to the mapping in OPEN_METH.
   1638 
   @classmethod
   def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
       """Open a tar archive for reading, writing or appending and return
          the object built by the constructor selected through `mode'.

          mode:
          'r' or 'r:*' open for reading with transparent compression
          'r:'         open for reading exclusively uncompressed
          'r:gz'       open for reading with gzip compression
          'r:bz2'      open for reading with bzip2 compression
          'a' or 'a:'  open for appending, creating the file if necessary
          'w' or 'w:'  open for writing without compression
          'w:gz'       open for writing with gzip compression
          'w:bz2'      open for writing with bzip2 compression

          'r|*'        open a stream of tar blocks with transparent compression
          'r|'         open an uncompressed stream of tar blocks for reading
          'r|gz'       open a gzip compressed stream of tar blocks
          'r|bz2'      open a bzip2 compressed stream of tar blocks
          'w|'         open an uncompressed stream for writing
          'w|gz'       open a gzip compressed stream for writing
          'w|bz2'      open a bzip2 compressed stream for writing

          Raises ValueError if neither `name' nor `fileobj' is given or if
          `mode' cannot be interpreted, CompressionError for a compression
          type missing from OPEN_METH, and ReadError if none of the
          registered constructors can read the archive.
       """
       if not (name or fileobj):
           raise ValueError("nothing to open")

       # Transparent reading: try every registered constructor in turn.
       if mode in ("r", "r:*"):
           # The boolean sort key is False for everything but 'taropen',
           # so the plain tar constructor is tried last.
           candidates = sorted(
               cls.OPEN_METH,
               key=lambda comptype: cls.OPEN_METH[comptype] == 'taropen')
           for comptype in candidates:
               opener = getattr(cls, cls.OPEN_METH[comptype])
               if fileobj is not None:
                   saved_pos = fileobj.tell()
               try:
                   return opener(name, "r", fileobj, **kwargs)
               except (ReadError, CompressionError):
                   # Rewind a caller-supplied file object so that the
                   # next constructor starts at the same position.
                   if fileobj is not None:
                       fileobj.seek(saved_pos)
           raise ReadError("file could not be opened successfully")

       # "filemode:comptype": seekable archive with explicit compression.
       if ":" in mode:
           filemode, comptype = mode.split(":", 1)
           filemode = filemode or "r"
           comptype = comptype or "tar"

           if comptype not in cls.OPEN_METH:
               raise CompressionError("unknown compression type %r" % comptype)
           opener = getattr(cls, cls.OPEN_METH[comptype])
           return opener(name, filemode, fileobj, **kwargs)

       # "filemode|comptype": stream of tar blocks wrapped in a _Stream.
       if "|" in mode:
           filemode, comptype = mode.split("|", 1)
           filemode = filemode or "r"
           comptype = comptype or "tar"

           if filemode not in ("r", "w"):
               raise ValueError("mode must be 'r' or 'w'")

           stream = _Stream(name, filemode, comptype, fileobj, bufsize)
           try:
               archive = cls(name, filemode, stream, **kwargs)
           except:
               # Do not leave the stream open if construction fails.
               stream.close()
               raise
           # close() only closes the file object when _extfileobj is false.
           archive._extfileobj = False
           return archive

       if mode in ("a", "w"):
           return cls.taropen(name, mode, fileobj, **kwargs)

       raise ValueError("undiscernible mode")
   1716 
   1717     @classmethod
   1718     def taropen(cls, name, mode="r", fileobj=None, **kwargs):
   1719         """Open uncompressed tar archive name for reading or writing.
   1720         """
   1721         if mode not in ("r", "a", "w"):
   1722             raise ValueError("mode must be 'r', 'a' or 'w'")
   1723         return cls(name, mode, fileobj, **kwargs)
   1724 
   1725     @classmethod
   1726     def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
   1727         """Open gzip compressed tar archive name for reading or writing.
   1728            Appending is not allowed.
   1729         """
   1730         if mode not in ("r", "w"):
   1731             raise ValueError("mode must be 'r' or 'w'")
   1732 
   1733         try:
   1734             import gzip
   1735             gzip.GzipFile
   1736         except (ImportError, AttributeError):
   1737             raise CompressionError("gzip module is not available")
   1738 
   1739         try:
   1740             fileobj = gzip.GzipFile(name, mode, compresslevel, fileobj)
   1741         except OSError:
   1742             if fileobj is not None and mode == 'r':
   1743                 raise ReadError("not a gzip file")
   1744             raise
   1745 
   1746         try:
   1747             t = cls.taropen(name, mode, fileobj, **kwargs)
   1748         except IOError:
   1749             fileobj.close()
   1750             if mode == 'r':
   1751                 raise ReadError("not a gzip file")
   1752             raise
   1753         except:
   1754             fileobj.close()
   1755             raise
   1756         t._extfileobj = False
   1757         return t
   1758 
   1759     @classmethod
   1760     def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
   1761         """Open bzip2 compressed tar archive name for reading or writing.
   1762            Appending is not allowed.
   1763         """
   1764         if mode not in ("r", "w"):
   1765             raise ValueError("mode must be 'r' or 'w'.")
   1766 
   1767         try:
   1768             import bz2
   1769         except ImportError:
   1770             raise CompressionError("bz2 module is not available")
   1771 
   1772         if fileobj is not None:
   1773             fileobj = _BZ2Proxy(fileobj, mode)
   1774         else:
   1775             fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)
   1776 
   1777         try:
   1778             t = cls.taropen(name, mode, fileobj, **kwargs)
   1779         except (IOError, EOFError):
   1780             fileobj.close()
   1781             if mode == 'r':
   1782                 raise ReadError("not a bzip2 file")
   1783             raise
   1784         except:
   1785             fileobj.close()
   1786             raise
   1787         t._extfileobj = False
   1788         return t
   1789 
   # Registry of the *open() constructors: maps the compression type used
   # in a mode string to the name of the classmethod that handles it.
   OPEN_METH = {
       # uncompressed tar
       "tar": "taropen",
       # gzip compressed tar
       "gz": "gzopen",
       # bzip2 compressed tar
       "bz2": "bz2open",
   }
   1796 
   1797     #--------------------------------------------------------------------------
   1798     # The public methods which TarFile provides:
   1799 
   1800     def close(self):
   1801         """Close the TarFile. In write-mode, two finishing zero blocks are
   1802            appended to the archive.
   1803         """
   1804         if self.closed:
   1805             return
   1806 
   1807         self.closed = True
   1808         try:
   1809             if self.mode in "aw":
   1810                 self.fileobj.write(NUL * (BLOCKSIZE * 2))
   1811                 self.offset += (BLOCKSIZE * 2)
   1812                 # fill up the end with zero-blocks
   1813                 # (like option -b20 for tar does)
   1814                 blocks, remainder = divmod(self.offset, RECORDSIZE)
   1815                 if remainder > 0:
   1816                     self.fileobj.write(NUL * (RECORDSIZE - remainder))
   1817         finally:
   1818             if not self._extfileobj:
   1819                 self.fileobj.close()
   1820 
   1821     def getmember(self, name):
   1822         """Return a TarInfo object for member `name'. If `name' can not be
   1823            found in the archive, KeyError is raised. If a member occurs more
   1824            than once in the archive, its last occurrence is assumed to be the
   1825            most up-to-date version.
   1826         """
   1827         tarinfo = self._getmember(name)
   1828         if tarinfo is None:
   1829             raise KeyError("filename %r not found" % name)
   1830         return tarinfo
   1831 
   1832     def getmembers(self):
   1833         """Return the members of the archive as a list of TarInfo objects. The
   1834            list has the same order as the members in the archive.
   1835         """
   1836         self._check()
   1837         if not self._loaded:    # if we want to obtain a list of
   1838             self._load()        # all members, we first have to
   1839                                 # scan the whole archive.
   1840         return self.members
   1841 
   1842     def getnames(self):
   1843         """Return the members of the archive as a list of their names. It has
   1844            the same order as the list returned by getmembers().
   1845         """
   1846         return [tarinfo.name for tarinfo in self.getmembers()]
   1847 
   1848     def gettarinfo(self, name=None, arcname=None, fileobj=None):
   1849         """Create a TarInfo object from the result of os.stat or equivalent
   1850            on an existing file. The file is either named by `name', or
   1851            specified as a file object `fileobj' with a file descriptor. If
   1852            given, `arcname' specifies an alternative name for the file in the
   1853            archive, otherwise, the name is taken from the 'name' attribute of
   1854            'fileobj', or the 'name' argument.
   1855         """
   1856         self._check("aw")
   1857 
   1858         # When fileobj is given, replace name by
   1859         # fileobj's real name.
   1860         if fileobj is not None:
   1861             name = fileobj.name
   1862 
   1863         # Building the name of the member in the archive.
   1864         # Backward slashes are converted to forward slashes,
   1865         # Absolute paths are turned to relative paths.
   1866         if arcname is None:
   1867             arcname = name
   1868         drv, arcname = os.path.splitdrive(arcname)
   1869         arcname = arcname.replace(os.sep, "/")
   1870         arcname = arcname.lstrip("/")
   1871 
   1872         # Now, fill the TarInfo object with
   1873         # information specific for the file.
   1874         tarinfo = self.tarinfo()
   1875         tarinfo.tarfile = self  # Not needed
   1876 
   1877         # Use os.stat or os.lstat, depending on platform
   1878         # and if symlinks shall be resolved.
   1879         if fileobj is None:
   1880             if hasattr(os, "lstat") and not self.dereference:
   1881                 statres = os.lstat(name)
   1882             else:
   1883                 statres = os.stat(name)
   1884         else:
   1885             statres = os.fstat(fileobj.fileno())
   1886         linkname = ""
   1887 
   1888         stmd = statres.st_mode
   1889         if stat.S_ISREG(stmd):
   1890             inode = (statres.st_ino, statres.st_dev)
   1891             if not self.dereference and statres.st_nlink > 1 and \
   1892                     inode in self.inodes and arcname != self.inodes[inode]:
   1893                 # Is it a hardlink to an already
   1894                 # archived file?
   1895                 type = LNKTYPE
   1896                 linkname = self.inodes[inode]
   1897             else:
   1898                 # The inode is added only if its valid.
   1899                 # For win32 it is always 0.
   1900                 type = REGTYPE
   1901                 if inode[0]:
   1902                     self.inodes[inode] = arcname
   1903         elif stat.S_ISDIR(stmd):
   1904             type = DIRTYPE
   1905         elif stat.S_ISFIFO(stmd):
   1906             type = FIFOTYPE
   1907         elif stat.S_ISLNK(stmd):
   1908             type = SYMTYPE
   1909             linkname = os.readlink(name)
   1910         elif stat.S_ISCHR(stmd):
   1911             type = CHRTYPE
   1912         elif stat.S_ISBLK(stmd):
   1913             type = BLKTYPE
   1914         else:
   1915             return None
   1916 
   1917         # Fill the TarInfo object with all
   1918         # information we can get.
   1919         tarinfo.name = arcname
   1920         tarinfo.mode = stmd
   1921         tarinfo.uid = statres.st_uid
   1922         tarinfo.gid = statres.st_gid
   1923         if type == REGTYPE:
   1924             tarinfo.size = statres.st_size
   1925         else:
   1926             tarinfo.size = 0L
   1927         tarinfo.mtime = statres.st_mtime
   1928         tarinfo.type = type
   1929         tarinfo.linkname = linkname
   1930         if pwd:
   1931             try:
   1932                 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
   1933             except KeyError:
   1934                 pass
   1935         if grp:
   1936             try:
   1937                 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
   1938             except KeyError:
   1939                 pass
   1940 
   1941         if type in (CHRTYPE, BLKTYPE):
   1942             if hasattr(os, "major") and hasattr(os, "minor"):
   1943                 tarinfo.devmajor = os.major(statres.st_rdev)
   1944                 tarinfo.devminor = os.minor(statres.st_rdev)
   1945         return tarinfo
   1946 
   1947     def list(self, verbose=True):
   1948         """Print a table of contents to sys.stdout. If `verbose' is False, only
   1949            the names of the members are printed. If it is True, an `ls -l'-like
   1950            output is produced.
   1951         """
   1952         self._check()
   1953 
   1954         for tarinfo in self:
   1955             if verbose:
   1956                 print filemode(tarinfo.mode),
   1957                 print "%s/%s" % (tarinfo.uname or tarinfo.uid,
   1958                                  tarinfo.gname or tarinfo.gid),
   1959                 if tarinfo.ischr() or tarinfo.isblk():
   1960                     print "%10s" % ("%d,%d" \
   1961                                     % (tarinfo.devmajor, tarinfo.devminor)),
   1962                 else:
   1963                     print "%10d" % tarinfo.size,
   1964                 print "%d-%02d-%02d %02d:%02d:%02d" \
   1965                       % time.localtime(tarinfo.mtime)[:6],
   1966 
   1967             print tarinfo.name + ("/" if tarinfo.isdir() else ""),
   1968 
   1969             if verbose:
   1970                 if tarinfo.issym():
   1971                     print "->", tarinfo.linkname,
   1972                 if tarinfo.islnk():
   1973                     print "link to", tarinfo.linkname,
   1974             print
   1975 
   def add(self, name, arcname=None, recursive=True, exclude=None, filter=None):
       """Add the file `name' to the archive; it may be of any type
          (directory, fifo, symbolic link, etc.). `arcname', if given, is
          the name used for it inside the archive. Directories are added
          recursively unless `recursive' is False. `exclude' (deprecated)
          is called with each filename and excludes it by returning True.
          `filter' is called with each TarInfo object and returns the
          object to store, or None to exclude it.
       """
       self._check("aw")

       if arcname is None:
           arcname = name

       # Deprecated name-based exclusion.
       if exclude is not None:
           import warnings
           warnings.warn("use the filter argument instead",
                   DeprecationWarning, 2)
           if exclude(name):
               self._dbg(2, "tarfile: Excluded %r" % name)
               return

       # Never put the archive into itself.
       if self.name is not None and os.path.abspath(name) == self.name:
           self._dbg(2, "tarfile: Skipped %r" % name)
           return

       self._dbg(1, name)

       tarinfo = self.gettarinfo(name, arcname)
       if tarinfo is None:
           self._dbg(1, "tarfile: Unsupported type %r" % name)
           return

       # Let the filter modify or reject the TarInfo object.
       if filter is not None:
           tarinfo = filter(tarinfo)
           if tarinfo is None:
               self._dbg(2, "tarfile: Excluded %r" % name)
               return

       # Regular files are stored with their data.
       if tarinfo.isreg():
           with bltn_open(name, "rb") as f:
               self.addfile(tarinfo, f)
           return

       # Everything else is stored as a header only; directories are
       # descended into afterwards.
       is_directory = tarinfo.isdir()
       self.addfile(tarinfo)
       if is_directory and recursive:
           for entry in os.listdir(name):
               self.add(os.path.join(name, entry),
                        os.path.join(arcname, entry),
                        recursive, exclude, filter)
   2036 
   def addfile(self, tarinfo, fileobj=None):
       """Append a copy of the TarInfo object `tarinfo' to the archive.
          If `fileobj' is given, tarinfo.size bytes are copied from it
          and the data is padded with NULs to a multiple of BLOCKSIZE.
          TarInfo objects can be created directly or with gettarinfo().
          On Windows platforms, `fileobj' should always be opened with
          mode 'rb' to avoid irritation about the file size.
       """
       self._check("aw")

       # Work on a copy so the caller's object is not the one stored.
       tarinfo = copy.copy(tarinfo)

       header = tarinfo.tobuf(self.format, self.encoding, self.errors)
       self.fileobj.write(header)
       self.offset += len(header)

       # If there's data to follow, append it.
       if fileobj is not None:
           copyfileobj(fileobj, self.fileobj, tarinfo.size)
           full_blocks, tail = divmod(tarinfo.size, BLOCKSIZE)
           if tail > 0:
               # Fill the last, partial block.
               self.fileobj.write(NUL * (BLOCKSIZE - tail))
               full_blocks += 1
           self.offset += full_blocks * BLOCKSIZE

       self.members.append(tarinfo)
   2062 
   2063     def extractall(self, path=".", members=None):
   2064         """Extract all members from the archive to the current working
   2065            directory and set owner, modification time and permissions on
   2066            directories afterwards. `path' specifies a different directory
   2067            to extract to. `members' is optional and must be a subset of the
   2068            list returned by getmembers().
   2069         """
   2070         directories = []
   2071 
   2072         if members is None:
   2073             members = self
   2074 
   2075         for tarinfo in members:
   2076             if tarinfo.isdir():
   2077                 # Extract directories with a safe mode.
   2078                 directories.append(tarinfo)
   2079                 tarinfo = copy.copy(tarinfo)
   2080                 tarinfo.mode = 0700
   2081             self.extract(tarinfo, path)
   2082 
   2083         # Reverse sort directories.
   2084         directories.sort(key=operator.attrgetter('name'))
   2085         directories.reverse()
   2086 
   2087         # Set correct owner, mtime and filemode on directories.
   2088         for tarinfo in directories:
   2089             dirpath = os.path.join(path, tarinfo.name)
   2090             try:
   2091                 self.chown(tarinfo, dirpath)
   2092                 self.utime(tarinfo, dirpath)
   2093                 self.chmod(tarinfo, dirpath)
   2094             except ExtractError, e:
   2095                 if self.errorlevel > 1:
   2096                     raise
   2097                 else:
   2098                     self._dbg(1, "tarfile: %s" % e)
   2099 
   def extract(self, member, path=""):
       """Extract one member below `path' (the current working directory
          by default) under its full name. `member' may be a filename or
          a TarInfo object. Errors are re-raised or only reported via
          _dbg(), depending on `errorlevel'.
       """
       self._check("r")

       tarinfo = self.getmember(member) if isinstance(member, basestring) \
                 else member

       # Prepare the link target for makelink().
       if tarinfo.islnk():
           tarinfo._link_target = os.path.join(path, tarinfo.linkname)

       try:
           self._extract_member(tarinfo, os.path.join(path, tarinfo.name))
       except EnvironmentError as e:
           # Operating system errors are fatal for errorlevel >= 1.
           if self.errorlevel > 0:
               raise
           if e.filename is None:
               self._dbg(1, "tarfile: %s" % e.strerror)
           else:
               self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
       except ExtractError as e:
           # ExtractErrors are fatal for errorlevel >= 2 only.
           if self.errorlevel > 1:
               raise
           self._dbg(1, "tarfile: %s" % e)
   2132 
   def extractfile(self, member):
       """Return a file object for a member of the archive. `member' may
          be a filename or a TarInfo object. Regular files and members
          of unknown type yield a file-like object, links yield the file
          object of their target, and all other members yield None.
          The file-like object is read-only and provides the following
          methods: read(), readline(), readlines(), seek() and tell()
       """
       self._check("r")

       tarinfo = self.getmember(member) if isinstance(member, basestring) \
                 else member

       # A member of unknown type is treated like a regular file.
       if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
           return self.fileobject(self, tarinfo)

       if tarinfo.islnk() or tarinfo.issym():
           if isinstance(self.fileobj, _Stream):
               # A small but ugly workaround for the case that someone
               # tries to extract a (sym)link as a file-object from a
               # non-seekable stream of tar blocks.
               raise StreamError("cannot extract (sym)link as file object")
           # A (sym)link's file object is its target's file object.
           return self.extractfile(self._find_link_target(tarinfo))

       # Directories, devices etc. have no data associated with them.
       return None
   2170 
   2171     def _extract_member(self, tarinfo, targetpath):
   2172         """Extract the TarInfo object tarinfo to a physical
   2173            file called targetpath.
   2174         """
   2175         # Fetch the TarInfo object for the given name
   2176         # and build the destination pathname, replacing
   2177         # forward slashes to platform specific separators.
   2178         targetpath = targetpath.rstrip("/")
   2179         targetpath = targetpath.replace("/", os.sep)
   2180 
   2181         # Create all upper directories.
   2182         upperdirs = os.path.dirname(targetpath)
   2183         if upperdirs and not os.path.exists(upperdirs):
   2184             # Create directories that are not part of the archive with
   2185             # default permissions.
   2186             os.makedirs(upperdirs)
   2187 
   2188         if tarinfo.islnk() or tarinfo.issym():
   2189             self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
   2190         else:
   2191             self._dbg(1, tarinfo.name)
   2192 
   2193         if tarinfo.isreg():
   2194             self.makefile(tarinfo, targetpath)
   2195         elif tarinfo.isdir():
   2196             self.makedir(tarinfo, targetpath)
   2197         elif tarinfo.isfifo():
   2198             self.makefifo(tarinfo, targetpath)
   2199         elif tarinfo.ischr() or tarinfo.isblk():
   2200             self.makedev(tarinfo, targetpath)
   2201         elif tarinfo.islnk() or tarinfo.issym():
   2202             self.makelink(tarinfo, targetpath)
   2203         elif tarinfo.type not in SUPPORTED_TYPES:
   2204             self.makeunknown(tarinfo, targetpath)
   2205         else:
   2206             self.makefile(tarinfo, targetpath)
   2207 
   2208         self.chown(tarinfo, targetpath)
   2209         if not tarinfo.issym():
   2210             self.chmod(tarinfo, targetpath)
   2211             self.utime(tarinfo, targetpath)
   2212 
   2213     #--------------------------------------------------------------------------
   2214     # Below are the different file methods. They are called via
   2215     # _extract_member() when extract() is called. They can be replaced in a
   2216     # subclass to implement other functionality.
   2217 
   def makedir(self, tarinfo, targetpath):
       """Create the directory `targetpath'. A directory that already
          exists is not an error.
       """
       # The directory gets a safe mode here; the mode recorded in the
       # archive is applied later in _extract_member().
       try:
           os.mkdir(targetpath, 0o700)
       except EnvironmentError as e:
           if e.errno != errno.EEXIST:
               raise
   2228 
   def makefile(self, tarinfo, targetpath):
       """Write the data of member `tarinfo' to the file `targetpath'.
       """
       member_data = self.extractfile(tarinfo)
       try:
           with bltn_open(targetpath, "wb") as outfile:
               copyfileobj(member_data, outfile)
       finally:
           # The member's file object is closed even if copying fails.
           member_data.close()
   2238 
   2239     def makeunknown(self, tarinfo, targetpath):
   2240         """Make a file from a TarInfo object with an unknown type
   2241            at targetpath.
   2242         """
   2243         self.makefile(tarinfo, targetpath)
   2244         self._dbg(1, "tarfile: Unknown file type %r, " \
   2245                      "extracted as regular file." % tarinfo.type)
   2246 
   def makefifo(self, tarinfo, targetpath):
       """Create a fifo at `targetpath'. ExtractError is raised if the
          os module has no mkfifo().
       """
       if not hasattr(os, "mkfifo"):
           raise ExtractError("fifo not supported by system")
       os.mkfifo(targetpath)
   2254 
   def makedev(self, tarinfo, targetpath):
       """Create a character or block device node at `targetpath'.
          ExtractError is raised if the os module lacks mknod() or
          makedev().
       """
       if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
           raise ExtractError("special devices not supported by system")

       # Combine the permission bits with the device type flag.
       mode = tarinfo.mode | (stat.S_IFBLK if tarinfo.isblk()
                              else stat.S_IFCHR)
       device = os.makedev(tarinfo.devmajor, tarinfo.devminor)
       os.mknod(targetpath, mode, device)
   2269 
   2270     def makelink(self, tarinfo, targetpath):
   2271         """Make a (symbolic) link called targetpath. If it cannot be created
   2272           (platform limitation), we try to make a copy of the referenced file
   2273           instead of a link.
   2274         """
   2275         if hasattr(os, "symlink") and hasattr(os, "link"):
   2276             # For systems that support symbolic and hard links.
   2277             if tarinfo.issym():
   2278                 if os.path.lexists(targetpath):
   2279                     os.unlink(targetpath)
   2280                 os.symlink(tarinfo.linkname, targetpath)
   2281             else:
   2282                 # See extract().
   2283                 if os.path.exists(tarinfo._link_target):
   2284                     if os.path.lexists(targetpath):
   2285                         os.unlink(targetpath)
   2286                     os.link(tarinfo._link_target, targetpath)
   2287                 else:
   2288                     self._extract_member(self._find_link_target(tarinfo), targetpath)
   2289         else:
   2290             try:
   2291                 self._extract_member(self._find_link_target(tarinfo), targetpath)
   2292             except KeyError:
   2293                 raise ExtractError("unable to resolve link inside archive")
   2294 
   2295     def chown(self, tarinfo, targetpath):
   2296         """Set owner of targetpath according to tarinfo.
   2297         """
   2298         if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
   2299             # We have to be root to do so.
   2300             try:
   2301                 g = grp.getgrnam(tarinfo.gname)[2]
   2302             except KeyError:
   2303                 g = tarinfo.gid
   2304             try:
   2305                 u = pwd.getpwnam(tarinfo.uname)[2]
   2306             except KeyError:
   2307                 u = tarinfo.uid
   2308             try:
   2309                 if tarinfo.issym() and hasattr(os, "lchown"):
   2310                     os.lchown(targetpath, u, g)
   2311                 else:
   2312                     if sys.platform != "os2emx":
   2313                         os.chown(targetpath, u, g)
   2314             except EnvironmentError, e:
   2315                 raise ExtractError("could not change owner")
   2316 
   2317     def chmod(self, tarinfo, targetpath):
   2318         """Set file permissions of targetpath according to tarinfo.
   2319         """
   2320         if hasattr(os, 'chmod'):
   2321             try:
   2322                 os.chmod(targetpath, tarinfo.mode)
   2323             except EnvironmentError, e:
   2324                 raise ExtractError("could not change mode")
   2325 
   2326     def utime(self, tarinfo, targetpath):
   2327         """Set modification time of targetpath according to tarinfo.
   2328         """
   2329         if not hasattr(os, 'utime'):
   2330             return
   2331         try:
   2332             os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
   2333         except EnvironmentError, e:
   2334             raise ExtractError("could not change modification time")
   2335 
   2336     #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.

           Raises ReadError if the data ends before the expected header
           position, if the header at offset 0 is empty, invalid or
           truncated, or if a SubsequentHeaderError is reported.
        """
        # Only allowed on an open archive whose mode is "r" or "a".
        self._check("ra")
        # A member held in self.firstmember is handed out exactly once.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Advance the file pointer.
        if self.offset != self.fileobj.tell():
            # Seek to the byte just before self.offset and read it: an empty
            # read means the file ends before the expected header position.
            self.fileobj.seek(self.offset - 1)
            if not self.fileobj.read(1):
                raise ReadError("unexpected end of data")

        # Read the next block.
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError, e:
                # With ignore_zeros set, skip one block and try again;
                # otherwise fall through to "break" with tarinfo still None.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError, e:
                # With ignore_zeros set, skip the bad block and try again.
                # Otherwise an invalid header is only fatal at offset 0;
                # later in the archive it just ends the iteration.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    raise ReadError(str(e))
            except EmptyHeaderError:
                # Only an error at offset 0; later it ends the iteration.
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError, e:
                # Only an error at offset 0; later it ends the iteration.
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError, e:
                # Always an error, regardless of the offset.
                raise ReadError(str(e))
            # Reached on success and for every handler that neither raised
            # nor asked for a retry.
            break

        if tarinfo is not None:
            self.members.append(tarinfo)
        else:
            # Nothing more to read: mark the member list as complete.
            self._loaded = True

        return tarinfo
   2387 
   2388     #--------------------------------------------------------------------------
   2389     # Little helper methods:
   2390 
   2391     def _getmember(self, name, tarinfo=None, normalize=False):
   2392         """Find an archive member by name from bottom to top.
   2393            If tarinfo is given, it is used as the starting point.
   2394         """
   2395         # Ensure that all members have been loaded.
   2396         members = self.getmembers()
   2397 
   2398         # Limit the member search list up to tarinfo.
   2399         if tarinfo is not None:
   2400             members = members[:members.index(tarinfo)]
   2401 
   2402         if normalize:
   2403             name = os.path.normpath(name)
   2404 
   2405         for member in reversed(members):
   2406             if normalize:
   2407                 member_name = os.path.normpath(member.name)
   2408             else:
   2409                 member_name = member.name
   2410 
   2411             if name == member_name:
   2412                 return member
   2413 
   2414     def _load(self):
   2415         """Read through the entire archive file and look for readable
   2416            members.
   2417         """
   2418         while True:
   2419             tarinfo = self.next()
   2420             if tarinfo is None:
   2421                 break
   2422         self._loaded = True
   2423 
   2424     def _check(self, mode=None):
   2425         """Check if TarFile is still open, and if the operation's mode
   2426            corresponds to TarFile's mode.
   2427         """
   2428         if self.closed:
   2429             raise IOError("%s is closed" % self.__class__.__name__)
   2430         if mode is not None and self.mode not in mode:
   2431             raise IOError("bad operation for mode %r" % self.mode)
   2432 
   2433     def _find_link_target(self, tarinfo):
   2434         """Find the target member of a symlink or hardlink member in the
   2435            archive.
   2436         """
   2437         if tarinfo.issym():
   2438             # Always search the entire archive.
   2439             linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
   2440             limit = None
   2441         else:
   2442             # Search the archive before the link, because a hard link is
   2443             # just a reference to an already archived file.
   2444             linkname = tarinfo.linkname
   2445             limit = tarinfo
   2446 
   2447         member = self._getmember(linkname, tarinfo=limit, normalize=True)
   2448         if member is None:
   2449             raise KeyError("linkname %r not found" % linkname)
   2450         return member
   2451 
   2452     def __iter__(self):
   2453         """Provide an iterator object.
   2454         """
   2455         if self._loaded:
   2456             return iter(self.members)
   2457         else:
   2458             return TarIter(self)
   2459 
   2460     def _dbg(self, level, msg):
   2461         """Write debugging output to sys.stderr.
   2462         """
   2463         if level <= self.debug:
   2464             print >> sys.stderr, msg
   2465 
   2466     def __enter__(self):
   2467         self._check()
   2468         return self
   2469 
   2470     def __exit__(self, type, value, traceback):
   2471         if type is None:
   2472             self.close()
   2473         else:
   2474             # An exception occurred. We must not call close() because
   2475             # it would try to write end-of-archive blocks and padding.
   2476             if not self._extfileobj:
   2477                 self.fileobj.close()
   2478             self.closed = True
   2479 # class TarFile
   2480 
   2481 class TarIter:
   2482     """Iterator Class.
   2483 
   2484        for tarinfo in TarFile(...):
   2485            suite...
   2486     """
   2487 
   2488     def __init__(self, tarfile):
   2489         """Construct a TarIter object.
   2490         """
   2491         self.tarfile = tarfile
   2492         self.index = 0
   2493     def __iter__(self):
   2494         """Return iterator object.
   2495         """
   2496         return self
   2497     def next(self):
   2498         """Return the next item using TarFile's next() method.
   2499            When all members have been read, set TarFile as _loaded.
   2500         """
   2501         # Fix for SF #1100429: Under rare circumstances it can
   2502         # happen that getmembers() is called during iteration,
   2503         # which will cause TarIter to stop prematurely.
   2504 
   2505         if self.index == 0 and self.tarfile.firstmember is not None:
   2506             tarinfo = self.tarfile.next()
   2507         elif self.index < len(self.tarfile.members):
   2508             tarinfo = self.tarfile.members[self.index]
   2509         elif not self.tarfile._loaded:
   2510             tarinfo = self.tarfile.next()
   2511             if not tarinfo:
   2512                 self.tarfile._loaded = True
   2513                 raise StopIteration
   2514         else:
   2515             raise StopIteration
   2516         self.index += 1
   2517         return tarinfo
   2518 
   2519 # Helper classes for sparse file support
   2520 class _section:
   2521     """Base class for _data and _hole.
   2522     """
   2523     def __init__(self, offset, size):
   2524         self.offset = offset
   2525         self.size = size
   2526     def __contains__(self, offset):
   2527         return self.offset <= offset < self.offset + self.size
   2528 
   2529 class _data(_section):
   2530     """Represent a data section in a sparse file.
   2531     """
   2532     def __init__(self, offset, size, realpos):
   2533         _section.__init__(self, offset, size)
   2534         self.realpos = realpos
   2535 
   2536 class _hole(_section):
   2537     """Represent a hole section in a sparse file.
   2538     """
   2539     pass
   2540 
   2541 class _ringbuffer(list):
   2542     """Ringbuffer class which increases performance
   2543        over a regular list.
   2544     """
   2545     def __init__(self):
   2546         self.idx = 0
   2547     def find(self, offset):
   2548         idx = self.idx
   2549         while True:
   2550             item = self[idx]
   2551             if offset in item:
   2552                 break
   2553             idx += 1
   2554             if idx == len(self):
   2555                 idx = 0
   2556             if idx == self.idx:
   2557                 # End of File
   2558                 return None
   2559         self.idx = idx
   2560         return item
   2561 
   2562 #---------------------------------------------
   2563 # zipfile compatible TarFile class
   2564 #---------------------------------------------
# Values for the "compression" argument of TarFileCompat. They carry the
# same numbers as the zipfile constants named on the right.
TAR_PLAIN = 0           # zipfile.ZIP_STORED
TAR_GZIPPED = 8         # zipfile.ZIP_DEFLATED
   2567 class TarFileCompat:
   2568     """TarFile class compatible with standard module zipfile's
   2569        ZipFile class.
   2570     """
   2571     def __init__(self, file, mode="r", compression=TAR_PLAIN):
   2572         from warnings import warnpy3k
   2573         warnpy3k("the TarFileCompat class has been removed in Python 3.0",
   2574                 stacklevel=2)
   2575         if compression == TAR_PLAIN:
   2576             self.tarfile = TarFile.taropen(file, mode)
   2577         elif compression == TAR_GZIPPED:
   2578             self.tarfile = TarFile.gzopen(file, mode)
   2579         else:
   2580             raise ValueError("unknown compression constant")
   2581         if mode[0:1] == "r":
   2582             members = self.tarfile.getmembers()
   2583             for m in members:
   2584                 m.filename = m.name
   2585                 m.file_size = m.size
   2586                 m.date_time = time.gmtime(m.mtime)[:6]
   2587     def namelist(self):
   2588         return map(lambda m: m.name, self.infolist())
   2589     def infolist(self):
   2590         return filter(lambda m: m.type in REGULAR_TYPES,
   2591                       self.tarfile.getmembers())
   2592     def printdir(self):
   2593         self.tarfile.list()
   2594     def testzip(self):
   2595         return
   2596     def getinfo(self, name):
   2597         return self.tarfile.getmember(name)
   2598     def read(self, name):
   2599         return self.tarfile.extractfile(self.tarfile.getmember(name)).read()
   2600     def write(self, filename, arcname=None, compress_type=None):
   2601         self.tarfile.add(filename, arcname)
   2602     def writestr(self, zinfo, bytes):
   2603         try:
   2604             from cStringIO import StringIO
   2605         except ImportError:
   2606             from StringIO import StringIO
   2607         import calendar
   2608         tinfo = TarInfo(zinfo.filename)
   2609         tinfo.size = len(bytes)
   2610         tinfo.mtime = calendar.timegm(zinfo.date_time)
   2611         self.tarfile.addfile(tinfo, StringIO(bytes))
   2612     def close(self):
   2613         self.tarfile.close()
   2614 #class TarFileCompat
   2615 
   2616 #--------------------
   2617 # exported functions
   2618 #--------------------
   2619 def is_tarfile(name):
   2620     """Return True if name points to a tar archive that we
   2621        are able to handle, else return False.
   2622     """
   2623     try:
   2624         t = open(name)
   2625         t.close()
   2626         return True
   2627     except TarError:
   2628         return False
   2629 
# Module-level open() is TarFile.open. From this point on the name "open"
# in this module no longer refers to the builtin open().
open = TarFile.open
   2631