Home | History | Annotate | Download | only in tar
      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package tar
      6 
      7 import "strings"
      8 
// Format represents the tar archive format.
//
// The original tar format was introduced in Unix V7.
// Since then, there have been multiple competing formats attempting to
// standardize or extend the V7 format to overcome its limitations.
// The most common formats are the USTAR, PAX, and GNU formats,
// each with its own advantages and limitations.
//
// The following table captures the capabilities of each format:
//
//	                  |  USTAR |       PAX |       GNU
//	------------------+--------+-----------+----------
//	Name              |   256B | unlimited | unlimited
//	Linkname          |   100B | unlimited | unlimited
//	Size              | uint33 | unlimited |    uint89
//	Mode              | uint21 |    uint21 |    uint57
//	Uid/Gid           | uint21 | unlimited |    uint57
//	Uname/Gname       |    32B | unlimited |       32B
//	ModTime           | uint33 | unlimited |     int89
//	AccessTime        |    n/a | unlimited |     int89
//	ChangeTime        |    n/a | unlimited |     int89
//	Devmajor/Devminor | uint21 |    uint21 |    uint57
//	------------------+--------+-----------+----------
//	string encoding   |  ASCII |     UTF-8 |    binary
//	sub-second times  |     no |       yes |        no
//	sparse files      |     no |       yes |       yes
//
// The table's upper portion shows the Header fields, where each format reports
// the maximum number of bytes allowed for each string field and
// the integer type used to store each numeric field
// (where timestamps are stored as the number of seconds since the Unix epoch).
// An entry such as uint33 or int89 names an unsigned or signed integer of
// that many bits; for example, a uint33 Size can describe at most 2^33-1
// bytes, which is just under 8GiB.
//
// The table's lower portion shows specialized features of each format,
// such as supported string encodings, support for sub-second timestamps,
// or support for sparse files.
//
// The Writer currently provides no support for sparse files.
type Format int
     47 
// Constants to identify various tar formats.
//
// Every constant after FormatUnknown is a distinct single bit, so a Format
// value can hold several candidate formats combined with bitwise OR.
const (
	// Deliberately hide the meaning of constants from public API.
	//
	// Integer division by 4 turns the first two iota steps into 0, so the
	// blank identifier and FormatUnknown are both 0 and formatV7 is the
	// first non-zero bit.
	_ Format = (1 << iota) / 4 // Sequence of 0, 0, 1, 2, 4, 8, etc...

	// FormatUnknown indicates that the format is unknown.
	FormatUnknown

	// The format of the original Unix V7 tar tool prior to standardization.
	formatV7

	// FormatUSTAR represents the USTAR header format defined in POSIX.1-1988.
	//
	// While this format is compatible with most tar readers,
	// the format has several limitations making it unsuitable for some usages.
	// Most notably, it cannot support sparse files, files larger than 8GiB,
	// filenames larger than 256 characters, and non-ASCII filenames.
	//
	// Reference:
	//	http://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html#tag_20_92_13_06
	FormatUSTAR

	// FormatPAX represents the PAX header format defined in POSIX.1-2001.
	//
	// PAX extends USTAR by writing a special file with Typeflag TypeXHeader
	// preceding the original header. This file contains a set of key-value
	// records, which are used to overcome USTAR's shortcomings, in addition to
	// providing the ability to have sub-second resolution for timestamps.
	//
	// Some newer formats add their own extensions to PAX by defining their
	// own keys and assigning certain semantic meaning to the associated values.
	// For example, sparse file support in PAX is implemented using keys
	// defined by the GNU manual (e.g., "GNU.sparse.map").
	//
	// Reference:
	//	http://pubs.opengroup.org/onlinepubs/009695399/utilities/pax.html
	FormatPAX

	// FormatGNU represents the GNU header format.
	//
	// The GNU header format is older than the USTAR and PAX standards and
	// is not compatible with them. The GNU format supports
	// arbitrary file sizes, filenames of arbitrary encoding and length,
	// sparse files, and other features.
	//
	// It is recommended that PAX be chosen over GNU unless the target
	// application can only parse GNU formatted archives.
	//
	// Reference:
	//	http://www.gnu.org/software/tar/manual/html_node/Standard.html
	FormatGNU

	// Schily's tar format, which is incompatible with USTAR.
	// This does not cover STAR extensions to the PAX format; these fall under
	// the PAX format.
	formatSTAR

	// formatMax is the bit immediately above the last defined format.
	// Format.String uses it as the exclusive upper bound when walking the
	// individual format bits.
	formatMax
)
    107 
    108 func (f Format) has(f2 Format) bool   { return f&f2 != 0 }
    109 func (f *Format) mayBe(f2 Format)     { *f |= f2 }
    110 func (f *Format) mayOnlyBe(f2 Format) { *f &= f2 }
    111 func (f *Format) mustNotBe(f2 Format) { *f &^= f2 }
    112 
    113 var formatNames = map[Format]string{
    114 	formatV7: "V7", FormatUSTAR: "USTAR", FormatPAX: "PAX", FormatGNU: "GNU", formatSTAR: "STAR",
    115 }
    116 
    117 func (f Format) String() string {
    118 	var ss []string
    119 	for f2 := Format(1); f2 < formatMax; f2 <<= 1 {
    120 		if f.has(f2) {
    121 			ss = append(ss, formatNames[f2])
    122 		}
    123 	}
    124 	switch len(ss) {
    125 	case 0:
    126 		return "<unknown>"
    127 	case 1:
    128 		return ss[0]
    129 	default:
    130 		return "(" + strings.Join(ss, " | ") + ")"
    131 	}
    132 }
    133 
    134 // Magics used to identify various formats.
    135 const (
    136 	magicGNU, versionGNU     = "ustar ", " \x00"
    137 	magicUSTAR, versionUSTAR = "ustar\x00", "00"
    138 	trailerSTAR              = "tar\x00"
    139 )
    140 
// Size constants from various tar specifications.
const (
	// blockSize is also the array length of block and of every header view
	// type declared in this file.
	blockSize  = 512 // Size of each block in a tar stream
	// nameSize equals the length of the slice returned by headerV7.Name.
	nameSize   = 100 // Max length of the name field in USTAR format
	// prefixSize equals the length of the slice returned by headerUSTAR.Prefix.
	prefixSize = 155 // Max length of the prefix field in USTAR format
)
    147 
    148 // blockPadding computes the number of bytes needed to pad offset up to the
    149 // nearest block edge where 0 <= n < blockSize.
    150 func blockPadding(offset int64) (n int64) {
    151 	return -offset & (blockSize - 1)
    152 }
    153 
// zeroBlock is a block left at its zero value, i.e. blockSize zero bytes.
// NOTE(review): presumably used elsewhere in the package as a read-only
// source of zero bytes; nothing in this file references it. Confirm at the
// call sites.
var zeroBlock block

// block is one blockSize-byte unit of a tar stream. Its methods reinterpret
// the same bytes as the format-specific header views declared below.
type block [blockSize]byte
    157 
    158 // Convert block to any number of formats.
    159 func (b *block) V7() *headerV7       { return (*headerV7)(b) }
    160 func (b *block) GNU() *headerGNU     { return (*headerGNU)(b) }
    161 func (b *block) STAR() *headerSTAR   { return (*headerSTAR)(b) }
    162 func (b *block) USTAR() *headerUSTAR { return (*headerUSTAR)(b) }
    163 func (b *block) Sparse() sparseArray { return (sparseArray)(b[:]) }
    164 
    165 // GetFormat checks that the block is a valid tar header based on the checksum.
    166 // It then attempts to guess the specific format based on magic values.
    167 // If the checksum fails, then FormatUnknown is returned.
    168 func (b *block) GetFormat() Format {
    169 	// Verify checksum.
    170 	var p parser
    171 	value := p.parseOctal(b.V7().Chksum())
    172 	chksum1, chksum2 := b.ComputeChecksum()
    173 	if p.err != nil || (value != chksum1 && value != chksum2) {
    174 		return FormatUnknown
    175 	}
    176 
    177 	// Guess the magic values.
    178 	magic := string(b.USTAR().Magic())
    179 	version := string(b.USTAR().Version())
    180 	trailer := string(b.STAR().Trailer())
    181 	switch {
    182 	case magic == magicUSTAR && trailer == trailerSTAR:
    183 		return formatSTAR
    184 	case magic == magicUSTAR:
    185 		return FormatUSTAR | FormatPAX
    186 	case magic == magicGNU && version == versionGNU:
    187 		return FormatGNU
    188 	default:
    189 		return formatV7
    190 	}
    191 }
    192 
    193 // SetFormat writes the magic values necessary for specified format
    194 // and then updates the checksum accordingly.
    195 func (b *block) SetFormat(format Format) {
    196 	// Set the magic values.
    197 	switch {
    198 	case format.has(formatV7):
    199 		// Do nothing.
    200 	case format.has(FormatGNU):
    201 		copy(b.GNU().Magic(), magicGNU)
    202 		copy(b.GNU().Version(), versionGNU)
    203 	case format.has(formatSTAR):
    204 		copy(b.STAR().Magic(), magicUSTAR)
    205 		copy(b.STAR().Version(), versionUSTAR)
    206 		copy(b.STAR().Trailer(), trailerSTAR)
    207 	case format.has(FormatUSTAR | FormatPAX):
    208 		copy(b.USTAR().Magic(), magicUSTAR)
    209 		copy(b.USTAR().Version(), versionUSTAR)
    210 	default:
    211 		panic("invalid format")
    212 	}
    213 
    214 	// Update checksum.
    215 	// This field is special in that it is terminated by a NULL then space.
    216 	var f formatter
    217 	field := b.V7().Chksum()
    218 	chksum, _ := b.ComputeChecksum() // Possible values are 256..128776
    219 	f.formatOctal(field[:7], chksum) // Never fails since 128776 < 262143
    220 	field[7] = ' '
    221 }
    222 
    223 // ComputeChecksum computes the checksum for the header block.
    224 // POSIX specifies a sum of the unsigned byte values, but the Sun tar used
    225 // signed byte values.
    226 // We compute and return both.
    227 func (b *block) ComputeChecksum() (unsigned, signed int64) {
    228 	for i, c := range b {
    229 		if 148 <= i && i < 156 {
    230 			c = ' ' // Treat the checksum field itself as all spaces.
    231 		}
    232 		unsigned += int64(c)
    233 		signed += int64(int8(c))
    234 	}
    235 	return unsigned, signed
    236 }
    237 
    238 // Reset clears the block with all zeros.
    239 func (b *block) Reset() {
    240 	*b = block{}
    241 }
    242 
    243 type headerV7 [blockSize]byte
    244 
    245 func (h *headerV7) Name() []byte     { return h[000:][:100] }
    246 func (h *headerV7) Mode() []byte     { return h[100:][:8] }
    247 func (h *headerV7) UID() []byte      { return h[108:][:8] }
    248 func (h *headerV7) GID() []byte      { return h[116:][:8] }
    249 func (h *headerV7) Size() []byte     { return h[124:][:12] }
    250 func (h *headerV7) ModTime() []byte  { return h[136:][:12] }
    251 func (h *headerV7) Chksum() []byte   { return h[148:][:8] }
    252 func (h *headerV7) TypeFlag() []byte { return h[156:][:1] }
    253 func (h *headerV7) LinkName() []byte { return h[157:][:100] }
    254 
    255 type headerGNU [blockSize]byte
    256 
    257 func (h *headerGNU) V7() *headerV7       { return (*headerV7)(h) }
    258 func (h *headerGNU) Magic() []byte       { return h[257:][:6] }
    259 func (h *headerGNU) Version() []byte     { return h[263:][:2] }
    260 func (h *headerGNU) UserName() []byte    { return h[265:][:32] }
    261 func (h *headerGNU) GroupName() []byte   { return h[297:][:32] }
    262 func (h *headerGNU) DevMajor() []byte    { return h[329:][:8] }
    263 func (h *headerGNU) DevMinor() []byte    { return h[337:][:8] }
    264 func (h *headerGNU) AccessTime() []byte  { return h[345:][:12] }
    265 func (h *headerGNU) ChangeTime() []byte  { return h[357:][:12] }
    266 func (h *headerGNU) Sparse() sparseArray { return (sparseArray)(h[386:][:24*4+1]) }
    267 func (h *headerGNU) RealSize() []byte    { return h[483:][:12] }
    268 
    269 type headerSTAR [blockSize]byte
    270 
    271 func (h *headerSTAR) V7() *headerV7      { return (*headerV7)(h) }
    272 func (h *headerSTAR) Magic() []byte      { return h[257:][:6] }
    273 func (h *headerSTAR) Version() []byte    { return h[263:][:2] }
    274 func (h *headerSTAR) UserName() []byte   { return h[265:][:32] }
    275 func (h *headerSTAR) GroupName() []byte  { return h[297:][:32] }
    276 func (h *headerSTAR) DevMajor() []byte   { return h[329:][:8] }
    277 func (h *headerSTAR) DevMinor() []byte   { return h[337:][:8] }
    278 func (h *headerSTAR) Prefix() []byte     { return h[345:][:131] }
    279 func (h *headerSTAR) AccessTime() []byte { return h[476:][:12] }
    280 func (h *headerSTAR) ChangeTime() []byte { return h[488:][:12] }
    281 func (h *headerSTAR) Trailer() []byte    { return h[508:][:4] }
    282 
    283 type headerUSTAR [blockSize]byte
    284 
    285 func (h *headerUSTAR) V7() *headerV7     { return (*headerV7)(h) }
    286 func (h *headerUSTAR) Magic() []byte     { return h[257:][:6] }
    287 func (h *headerUSTAR) Version() []byte   { return h[263:][:2] }
    288 func (h *headerUSTAR) UserName() []byte  { return h[265:][:32] }
    289 func (h *headerUSTAR) GroupName() []byte { return h[297:][:32] }
    290 func (h *headerUSTAR) DevMajor() []byte  { return h[329:][:8] }
    291 func (h *headerUSTAR) DevMinor() []byte  { return h[337:][:8] }
    292 func (h *headerUSTAR) Prefix() []byte    { return h[345:][:155] }
    293 
    294 type sparseArray []byte
    295 
    296 func (s sparseArray) Entry(i int) sparseElem { return (sparseElem)(s[i*24:]) }
    297 func (s sparseArray) IsExtended() []byte     { return s[24*s.MaxEntries():][:1] }
    298 func (s sparseArray) MaxEntries() int        { return len(s) / 24 }
    299 
    300 type sparseElem []byte
    301 
    302 func (s sparseElem) Offset() []byte { return s[00:][:12] }
    303 func (s sparseElem) Length() []byte { return s[12:][:12] }
    304