// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package zip

import (
	"bufio"
	"encoding/binary"
	"errors"
	"fmt"
	"hash"
	"hash/crc32"
	"io"
	"os"
	"time"
)

var (
	ErrFormat    = errors.New("zip: not a valid zip file")
	ErrAlgorithm = errors.New("zip: unsupported compression algorithm")
	ErrChecksum  = errors.New("zip: checksum error")
)

type Reader struct {
	r             io.ReaderAt
	File          []*File
	Comment       string
	decompressors map[uint16]Decompressor
}

type ReadCloser struct {
	f *os.File
	Reader
}

type File struct {
	FileHeader
	zip          *Reader
	zipr         io.ReaderAt
	zipsize      int64
	headerOffset int64
}

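// hasDataDescriptor reports whether the file's contents are followed by a
// data descriptor (bit 3 of the general purpose flags).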
func (f *File) hasDataDescriptor() bool {
	return f.Flags&0x8 != 0
}

// OpenReader will open the Zip file specified by name and return a ReadCloser.
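//
// A minimal usage sketch (illustrative only; "a.zip" is a placeholder path):
//
//	rc, err := zip.OpenReader("a.zip")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer rc.Close()
//	for _, f := range rc.File {
//		fr, err := f.Open()
//		if err != nil {
//			log.Fatal(err)
//		}
//		if _, err := io.Copy(os.Stdout, fr); err != nil {
//			log.Fatal(err)
//		}
//		fr.Close()
//	}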
func OpenReader(name string) (*ReadCloser, error) {
	f, err := os.Open(name)
	if err != nil {
		return nil, err
	}
	fi, err := f.Stat()
	if err != nil {
		f.Close()
		return nil, err
	}
	r := new(ReadCloser)
	if err := r.init(f, fi.Size()); err != nil {
		f.Close()
		return nil, err
	}
	r.f = f
	return r, nil
}

// NewReader returns a new Reader reading from r, which is assumed to
// have the given size in bytes.
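//
// For example (a sketch; data is assumed to be a []byte holding a complete
// zip archive already in memory):
//
//	zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
//	if err != nil {
//		log.Fatal(err)
//	}
//	for _, f := range zr.File {
//		fmt.Println(f.Name, f.UncompressedSize64)
//	}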
func NewReader(r io.ReaderAt, size int64) (*Reader, error) {
	zr := new(Reader)
	if err := zr.init(r, size); err != nil {
		return nil, err
	}
	return zr, nil
}

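// init populates the Reader from the archive's central directory: it locates
// the end of central directory record, then reads directory headers until it
// hits an invalid one, since the record's file count is only 16 bits wide.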
func (z *Reader) init(r io.ReaderAt, size int64) error {
	end, err := readDirectoryEnd(r, size)
	if err != nil {
		return err
	}
	if end.directoryRecords > uint64(size)/fileHeaderLen {
		return fmt.Errorf("archive/zip: TOC declares impossible %d files in %d byte zip", end.directoryRecords, size)
	}
	z.r = r
	z.File = make([]*File, 0, end.directoryRecords)
	z.Comment = end.comment
	rs := io.NewSectionReader(r, 0, size)
	if _, err = rs.Seek(int64(end.directoryOffset), io.SeekStart); err != nil {
		return err
	}
	buf := bufio.NewReader(rs)

	// The count of files inside a zip is truncated to fit in a uint16.
	// Gloss over this by reading headers until we encounter
	// a bad one, and then only report an ErrFormat or UnexpectedEOF if
	// the file count modulo 65536 is incorrect.
	for {
		f := &File{zip: z, zipr: r, zipsize: size}
		err = readDirectoryHeader(f, buf)
		if err == ErrFormat || err == io.ErrUnexpectedEOF {
			break
		}
		if err != nil {
			return err
		}
		z.File = append(z.File, f)
	}
	if uint16(len(z.File)) != uint16(end.directoryRecords) { // only compare 16 bits here
		// Return the readDirectoryHeader error if we read
		// the wrong number of directory entries.
		return err
	}
	return nil
}

// RegisterDecompressor registers or overrides a custom decompressor for a
// specific method ID. If a decompressor for a given method is not found,
// Reader will default to looking up the decompressor at the package level.
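//
// For example, to override the DEFLATE decompressor on this Reader only
// (a sketch; flate.NewReader is the standard library inflater):
//
//	zr.RegisterDecompressor(zip.Deflate, func(r io.Reader) io.ReadCloser {
//		return flate.NewReader(r)
//	})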
func (z *Reader) RegisterDecompressor(method uint16, dcomp Decompressor) {
	if z.decompressors == nil {
		z.decompressors = make(map[uint16]Decompressor)
	}
	z.decompressors[method] = dcomp
}

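// decompressor returns the Decompressor registered on this Reader for the
// given method, falling back to the package-level registry when none is set.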
func (z *Reader) decompressor(method uint16) Decompressor {
	dcomp := z.decompressors[method]
	if dcomp == nil {
		dcomp = decompressor(method)
	}
	return dcomp
}

// Close closes the Zip file, rendering it unusable for I/O.
func (rc *ReadCloser) Close() error {
	return rc.f.Close()
}

// DataOffset returns the offset of the file's possibly-compressed
// data, relative to the beginning of the zip file.
//
// Most callers should instead use Open, which transparently
// decompresses data and verifies checksums.
func (f *File) DataOffset() (offset int64, err error) {
	bodyOffset, err := f.findBodyOffset()
	if err != nil {
		return
	}
	return f.headerOffset + bodyOffset, nil
}

// Open returns a ReadCloser that provides access to the File's contents.
// Multiple files may be read concurrently.
func (f *File) Open() (io.ReadCloser, error) {
	bodyOffset, err := f.findBodyOffset()
	if err != nil {
		return nil, err
	}
	size := int64(f.CompressedSize64)
	r := io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, size)
	dcomp := f.zip.decompressor(f.Method)
	if dcomp == nil {
		return nil, ErrAlgorithm
	}
	var rc io.ReadCloser = dcomp(r)
	var desr io.Reader
	if f.hasDataDescriptor() {
		desr = io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset+size, dataDescriptorLen)
	}
	rc = &checksumReader{
		rc:   rc,
		hash: crc32.NewIEEE(),
		f:    f,
		desr: desr,
	}
	return rc, nil
}

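// checksumReader wraps the decompressed stream for a single file, keeping a
// running CRC-32 of the bytes read and validating it once the stream ends.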
type checksumReader struct {
	rc    io.ReadCloser
	hash  hash.Hash32
	nread uint64 // number of bytes read so far
	f     *File
	desr  io.Reader // if non-nil, where to read the data descriptor
	err   error     // sticky error
}

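// Read reads from the decompressed stream and updates the running CRC-32.
// On EOF it verifies the byte count and checksum, first consuming the
// trailing data descriptor if the file has one.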
func (r *checksumReader) Read(b []byte) (n int, err error) {
	if r.err != nil {
		return 0, r.err
	}
	n, err = r.rc.Read(b)
	r.hash.Write(b[:n])
	r.nread += uint64(n)
	if err == nil {
		return
	}
	if err == io.EOF {
		if r.nread != r.f.UncompressedSize64 {
			return 0, io.ErrUnexpectedEOF
		}
		if r.desr != nil {
			if err1 := readDataDescriptor(r.desr, r.f); err1 != nil {
				if err1 == io.EOF {
					err = io.ErrUnexpectedEOF
				} else {
					err = err1
				}
			} else if r.hash.Sum32() != r.f.CRC32 {
				err = ErrChecksum
			}
		} else {
			// If there's not a data descriptor, we still compare
			// the CRC32 of what we've read against the file header
			// or TOC's CRC32, if it seems like it was set.
			if r.f.CRC32 != 0 && r.hash.Sum32() != r.f.CRC32 {
				err = ErrChecksum
			}
		}
	}
	r.err = err
	return
}

func (r *checksumReader) Close() error { return r.rc.Close() }

// findBodyOffset does the minimum work to verify the file has a header
// and returns the file body offset.
func (f *File) findBodyOffset() (int64, error) {
	var buf [fileHeaderLen]byte
	if _, err := f.zipr.ReadAt(buf[:], f.headerOffset); err != nil {
		return 0, err
	}
	b := readBuf(buf[:])
	if sig := b.uint32(); sig != fileHeaderSignature {
		return 0, ErrFormat
	}
	b = b[22:] // skip over most of the header
	filenameLen := int(b.uint16())
	extraLen := int(b.uint16())
	return int64(fileHeaderLen + filenameLen + extraLen), nil
}

// readDirectoryHeader attempts to read a directory header from r.
// It returns io.ErrUnexpectedEOF if it cannot read a complete header,
// and ErrFormat if it doesn't find a valid header signature.
func readDirectoryHeader(f *File, r io.Reader) error {
	var buf [directoryHeaderLen]byte
	if _, err := io.ReadFull(r, buf[:]); err != nil {
		return err
	}
	b := readBuf(buf[:])
	if sig := b.uint32(); sig != directoryHeaderSignature {
		return ErrFormat
	}
	f.CreatorVersion = b.uint16()
	f.ReaderVersion = b.uint16()
	f.Flags = b.uint16()
	f.Method = b.uint16()
	f.ModifiedTime = b.uint16()
	f.ModifiedDate = b.uint16()
	f.CRC32 = b.uint32()
	f.CompressedSize = b.uint32()
	f.UncompressedSize = b.uint32()
	f.CompressedSize64 = uint64(f.CompressedSize)
	f.UncompressedSize64 = uint64(f.UncompressedSize)
	filenameLen := int(b.uint16())
	extraLen := int(b.uint16())
	commentLen := int(b.uint16())
	b = b[4:] // skipped start disk number and internal attributes (2x uint16)
	f.ExternalAttrs = b.uint32()
	f.headerOffset = int64(b.uint32())
	d := make([]byte, filenameLen+extraLen+commentLen)
	if _, err := io.ReadFull(r, d); err != nil {
		return err
	}
	f.Name = string(d[:filenameLen])
	f.Extra = d[filenameLen : filenameLen+extraLen]
	f.Comment = string(d[filenameLen+extraLen:])

	// Determine the character encoding.
	utf8Valid1, utf8Require1 := detectUTF8(f.Name)
	utf8Valid2, utf8Require2 := detectUTF8(f.Comment)
	switch {
	case !utf8Valid1 || !utf8Valid2:
		// Name and Comment definitely not UTF-8.
		f.NonUTF8 = true
	case !utf8Require1 && !utf8Require2:
		// Name and Comment use only single-byte runes that overlap with UTF-8.
		f.NonUTF8 = false
	default:
		// Might be UTF-8, might be some other encoding; preserve existing flag.
		// Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag.
		// Since it is impossible to always distinguish valid UTF-8 from some
		// other encoding (e.g., GBK or Shift-JIS), we trust the flag.
		f.NonUTF8 = f.Flags&0x800 == 0
	}

	needUSize := f.UncompressedSize == ^uint32(0)
	needCSize := f.CompressedSize == ^uint32(0)
	needHeaderOffset := f.headerOffset == int64(^uint32(0))

	// Best effort to find what we need.
	// Other zip authors might not even follow the basic format,
	// and we'll just ignore the Extra content in that case.
	var modified time.Time
parseExtras:
	for extra := readBuf(f.Extra); len(extra) >= 4; { // need at least tag and size
		fieldTag := extra.uint16()
		fieldSize := int(extra.uint16())
		if len(extra) < fieldSize {
			break
		}
		fieldBuf := extra.sub(fieldSize)

		switch fieldTag {
		case zip64ExtraID:
			// update directory values from the zip64 extra block.
			// They should only be consulted if the sizes read earlier
			// are maxed out.
			// See golang.org/issue/13367.
			if needUSize {
				needUSize = false
				if len(fieldBuf) < 8 {
					return ErrFormat
				}
				f.UncompressedSize64 = fieldBuf.uint64()
			}
			if needCSize {
				needCSize = false
				if len(fieldBuf) < 8 {
					return ErrFormat
				}
				f.CompressedSize64 = fieldBuf.uint64()
			}
			if needHeaderOffset {
				needHeaderOffset = false
				if len(fieldBuf) < 8 {
					return ErrFormat
				}
				f.headerOffset = int64(fieldBuf.uint64())
			}
		case ntfsExtraID:
			if len(fieldBuf) < 4 {
				continue parseExtras
			}
			fieldBuf.uint32()        // reserved (ignored)
			for len(fieldBuf) >= 4 { // need at least tag and size
				attrTag := fieldBuf.uint16()
				attrSize := int(fieldBuf.uint16())
				if len(fieldBuf) < attrSize {
					continue parseExtras
				}
				attrBuf := fieldBuf.sub(attrSize)
				if attrTag != 1 || attrSize != 24 {
					continue // Ignore irrelevant attributes
				}

				const ticksPerSecond = 1e7    // Windows timestamp resolution
				ts := int64(attrBuf.uint64()) // ModTime since Windows epoch
				secs := int64(ts / ticksPerSecond)
				nsecs := (1e9 / ticksPerSecond) * int64(ts%ticksPerSecond)
				epoch := time.Date(1601, time.January, 1, 0, 0, 0, 0, time.UTC)
				modified = time.Unix(epoch.Unix()+secs, nsecs)
			}
		case unixExtraID:
			if len(fieldBuf) < 8 {
				continue parseExtras
			}
			fieldBuf.uint32()              // AcTime (ignored)
			ts := int64(fieldBuf.uint32()) // ModTime since Unix epoch
			modified = time.Unix(ts, 0)
		case extTimeExtraID:
			if len(fieldBuf) < 5 || fieldBuf.uint8()&1 == 0 {
				continue parseExtras
			}
			ts := int64(fieldBuf.uint32()) // ModTime since Unix epoch
			modified = time.Unix(ts, 0)
		case infoZipUnixExtraID:
			if len(fieldBuf) < 4 {
				continue parseExtras
			}
			ts := int64(fieldBuf.uint32()) // ModTime since Unix epoch
			modified = time.Unix(ts, 0)
		}
	}

	msdosModified := msDosTimeToTime(f.ModifiedDate, f.ModifiedTime)
	f.Modified = msdosModified
	if !modified.IsZero() {
		f.Modified = modified.UTC()

		// If legacy MS-DOS timestamps are set, we can use the delta between
		// the legacy and extended versions to estimate timezone offset.
		//
		// A non-UTC timezone is always used (even if offset is zero).
		// Thus, FileHeader.Modified.Location() == time.UTC is useful for
		// determining whether extended timestamps are present.
		// This is necessary for users that need to do additional time
		// calculations when dealing with legacy ZIP formats.
		if f.ModifiedTime != 0 || f.ModifiedDate != 0 {
			f.Modified = modified.In(timeZone(msdosModified.Sub(modified)))
		}
	}

	// Assume that uncompressed size 2³²-1 could plausibly happen in
	// an old zip32 file that was sharding inputs into the largest chunks
	// possible (or is just malicious; search the web for 42.zip).
	// If needUSize is still true, it means we didn't see a zip64 extension.
	// As long as the compressed size is not also 2³²-1 (implausible)
	// and the header offset is not also 2³²-1 (equally implausible),
	// accept the uncompressed size 2³²-1 as valid.
	// If nothing else, this keeps archive/zip working with 42.zip.
	_ = needUSize

	if needCSize || needHeaderOffset {
		return ErrFormat
	}

	return nil
}

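// readDataDescriptor reads the data descriptor that follows a file's
// contents, tolerating the optional signature, and checks the recorded
// CRC-32 against f.CRC32. The sizes stored in the descriptor are ignored;
// the central directory values are authoritative.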
func readDataDescriptor(r io.Reader, f *File) error {
	var buf [dataDescriptorLen]byte

	// The spec says: "Although not originally assigned a
	// signature, the value 0x08074b50 has commonly been adopted
	// as a signature value for the data descriptor record.
	// Implementers should be aware that ZIP files may be
	// encountered with or without this signature marking data
	// descriptors and should account for either case when reading
	// ZIP files to ensure compatibility."
	//
	// dataDescriptorLen includes the size of the signature but
	// first read just those 4 bytes to see if it exists.
	if _, err := io.ReadFull(r, buf[:4]); err != nil {
		return err
	}
	off := 0
	maybeSig := readBuf(buf[:4])
	if maybeSig.uint32() != dataDescriptorSignature {
		// No data descriptor signature. Keep these four
		// bytes.
		off += 4
	}
	if _, err := io.ReadFull(r, buf[off:12]); err != nil {
		return err
	}
	b := readBuf(buf[:12])
	if b.uint32() != f.CRC32 {
		return ErrChecksum
	}

	// The two sizes that follow here can be either 32 bits or 64 bits,
	// but the spec is not very clear on this and different interpretations
	// have been made, causing incompatibilities. We already have the sizes
	// from the central directory, so we can just ignore these.

	return nil
}

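// readDirectoryEnd locates and parses the end of central directory record,
// searching the last 1k and then the last 65k of the file, and consults the
// zip64 directory end when the record's counts or offsets look saturated.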
func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, err error) {
	// look for directoryEndSignature in the last 1k, then in the last 65k
	var buf []byte
	var directoryEndOffset int64
	for i, bLen := range []int64{1024, 65 * 1024} {
		if bLen > size {
			bLen = size
		}
		buf = make([]byte, int(bLen))
		if _, err := r.ReadAt(buf, size-bLen); err != nil && err != io.EOF {
			return nil, err
		}
		if p := findSignatureInBlock(buf); p >= 0 {
			buf = buf[p:]
			directoryEndOffset = size - bLen + int64(p)
			break
		}
		if i == 1 || bLen == size {
			return nil, ErrFormat
		}
	}

	// read header into struct
	b := readBuf(buf[4:]) // skip signature
	d := &directoryEnd{
		diskNbr:            uint32(b.uint16()),
		dirDiskNbr:         uint32(b.uint16()),
		dirRecordsThisDisk: uint64(b.uint16()),
		directoryRecords:   uint64(b.uint16()),
		directorySize:      uint64(b.uint32()),
		directoryOffset:    uint64(b.uint32()),
		commentLen:         b.uint16(),
	}
	l := int(d.commentLen)
	if l > len(b) {
		return nil, errors.New("zip: invalid comment length")
	}
	d.comment = string(b[:l])

	// These values mean that the file can be a zip64 file
	if d.directoryRecords == 0xffff || d.directorySize == 0xffff || d.directoryOffset == 0xffffffff {
		p, err := findDirectory64End(r, directoryEndOffset)
		if err == nil && p >= 0 {
			err = readDirectory64End(r, p, d)
		}
		if err != nil {
			return nil, err
		}
	}
	// Make sure directoryOffset points to somewhere in our file.
	if o := int64(d.directoryOffset); o < 0 || o >= size {
		return nil, ErrFormat
	}
	return d, nil
}

// findDirectory64End tries to read the zip64 locator just before the
// directory end and returns the offset of the zip64 directory end if
// found.
func findDirectory64End(r io.ReaderAt, directoryEndOffset int64) (int64, error) {
	locOffset := directoryEndOffset - directory64LocLen
	if locOffset < 0 {
		return -1, nil // no need to look for a header outside the file
	}
	buf := make([]byte, directory64LocLen)
	if _, err := r.ReadAt(buf, locOffset); err != nil {
		return -1, err
	}
	b := readBuf(buf)
	if sig := b.uint32(); sig != directory64LocSignature {
		return -1, nil
	}
	if b.uint32() != 0 { // number of the disk with the start of the zip64 end of central directory
		return -1, nil // the file is not a valid zip64-file
	}
	p := b.uint64()      // relative offset of the zip64 end of central directory record
	if b.uint32() != 1 { // total number of disks
		return -1, nil // the file is not a valid zip64-file
	}
	return int64(p), nil
}

// readDirectory64End reads the zip64 directory end and updates the
// directory end with the zip64 directory end values.
func readDirectory64End(r io.ReaderAt, offset int64, d *directoryEnd) (err error) {
	buf := make([]byte, directory64EndLen)
	if _, err := r.ReadAt(buf, offset); err != nil {
		return err
	}

	b := readBuf(buf)
	if sig := b.uint32(); sig != directory64EndSignature {
		return ErrFormat
	}

	b = b[12:]                        // skip dir size, version and version needed (uint64 + 2x uint16)
	d.diskNbr = b.uint32()            // number of this disk
	d.dirDiskNbr = b.uint32()         // number of the disk with the start of the central directory
	d.dirRecordsThisDisk = b.uint64() // total number of entries in the central directory on this disk
	d.directoryRecords = b.uint64()   // total number of entries in the central directory
	d.directorySize = b.uint64()      // size of the central directory
	d.directoryOffset = b.uint64()    // offset of start of central directory with respect to the starting disk number

	return nil
}

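// findSignatureInBlock scans b backwards for the end of central directory
// signature ("PK\x05\x06") and returns its offset within b, or -1 if no
// candidate with a plausible comment length is found.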
func findSignatureInBlock(b []byte) int {
	for i := len(b) - directoryEndLen; i >= 0; i-- {
		// defined from directoryEndSignature in struct.go
		if b[i] == 'P' && b[i+1] == 'K' && b[i+2] == 0x05 && b[i+3] == 0x06 {
			// n is length of comment
			n := int(b[i+directoryEndLen-2]) | int(b[i+directoryEndLen-1])<<8
			if n+directoryEndLen+i <= len(b) {
				return i
			}
		}
	}
	return -1
}

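// readBuf is a byte-slice cursor whose methods decode little-endian fields
// and advance past them.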
type readBuf []byte

func (b *readBuf) uint8() uint8 {
	v := (*b)[0]
	*b = (*b)[1:]
	return v
}

func (b *readBuf) uint16() uint16 {
	v := binary.LittleEndian.Uint16(*b)
	*b = (*b)[2:]
	return v
}

func (b *readBuf) uint32() uint32 {
	v := binary.LittleEndian.Uint32(*b)
	*b = (*b)[4:]
	return v
}

func (b *readBuf) uint64() uint64 {
	v := binary.LittleEndian.Uint64(*b)
	*b = (*b)[8:]
	return v
}

func (b *readBuf) sub(n int) readBuf {
	b2 := (*b)[:n]
	*b = (*b)[n:]
	return b2
}