Home | History | Annotate | Download | only in gzip
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package gzip implements reading and writing of gzip format compressed files,
      6 // as specified in RFC 1952.
      7 package gzip
      8 
      9 import (
     10 	"bufio"
     11 	"compress/flate"
     12 	"errors"
     13 	"hash"
     14 	"hash/crc32"
     15 	"io"
     16 	"time"
     17 )
     18 
     19 const (
     20 	gzipID1     = 0x1f
     21 	gzipID2     = 0x8b
     22 	gzipDeflate = 8
     23 	flagText    = 1 << 0
     24 	flagHdrCrc  = 1 << 1
     25 	flagExtra   = 1 << 2
     26 	flagName    = 1 << 3
     27 	flagComment = 1 << 4
     28 )
     29 
     30 func makeReader(r io.Reader) flate.Reader {
     31 	if rr, ok := r.(flate.Reader); ok {
     32 		return rr
     33 	}
     34 	return bufio.NewReader(r)
     35 }
     36 
     37 var (
     38 	// ErrChecksum is returned when reading GZIP data that has an invalid checksum.
     39 	ErrChecksum = errors.New("gzip: invalid checksum")
     40 	// ErrHeader is returned when reading GZIP data that has an invalid header.
     41 	ErrHeader = errors.New("gzip: invalid header")
     42 )
     43 
// Header holds the metadata that a gzip file stores about the compressed
// file. It is exposed as the fields of the Writer and Reader structs.
//
// When reading, ModTime and OS are taken from the fixed part of the header,
// while Comment, Extra and Name are only filled in if the header's flag byte
// announces the corresponding optional field.
type Header struct {
	Comment string    // comment
	Extra   []byte    // "extra data"
	ModTime time.Time // modification time
	Name    string    // file name
	OS      byte      // operating system type
}
     53 
// A Reader is an io.Reader that can be read to retrieve
// uncompressed data from a gzip-format compressed file.
//
// In general, a gzip file can be a concatenation of gzip files,
// each with its own header.  Reads from the Reader
// return the concatenation of the uncompressed data of each.
// Only the first header is recorded in the Reader fields.
//
// Gzip files store a length and checksum of the uncompressed data.
// The Reader will return an ErrChecksum when Read
// reaches the end of the uncompressed data if it does not
// have the expected length or checksum.  Clients should treat data
// returned by Read as tentative until they receive the io.EOF
// marking the end of the data.
type Reader struct {
	Header
	r            flate.Reader  // source of the compressed bytes (see makeReader)
	decompressor io.ReadCloser // flate decompressor reading from r; created or reset by readHeader
	digest       hash.Hash32   // running CRC-32, used for the header check and then for the uncompressed data
	size         uint32        // uncompressed bytes read from the current stream (wraps at 2^32)
	flg          byte          // flag byte of the most recently read header
	buf          [512]byte     // scratch space for header fields and the trailer; bounds header string length
	err          error         // sticky error returned by every later Read
	multistream  bool          // whether Read continues with the next stream after a trailer
}
     79 
     80 // NewReader creates a new Reader reading the given reader.
     81 // If r does not also implement io.ByteReader,
     82 // the decompressor may read more data than necessary from r.
     83 // It is the caller's responsibility to call Close on the Reader when done.
     84 func NewReader(r io.Reader) (*Reader, error) {
     85 	z := new(Reader)
     86 	z.r = makeReader(r)
     87 	z.multistream = true
     88 	z.digest = crc32.NewIEEE()
     89 	if err := z.readHeader(true); err != nil {
     90 		return nil, err
     91 	}
     92 	return z, nil
     93 }
     94 
     95 // Reset discards the Reader z's state and makes it equivalent to the
     96 // result of its original state from NewReader, but reading from r instead.
     97 // This permits reusing a Reader rather than allocating a new one.
     98 func (z *Reader) Reset(r io.Reader) error {
     99 	z.r = makeReader(r)
    100 	if z.digest == nil {
    101 		z.digest = crc32.NewIEEE()
    102 	} else {
    103 		z.digest.Reset()
    104 	}
    105 	z.size = 0
    106 	z.err = nil
    107 	z.multistream = true
    108 	return z.readHeader(true)
    109 }
    110 
// Multistream controls whether the reader supports multistream files.
//
// If enabled (the default), the Reader expects the input to be a sequence
// of individually gzipped data streams, each with its own header and
// trailer, ending at EOF. The effect is that the concatenation of a sequence
// of gzipped files is treated as equivalent to the gzip of the concatenation
// of the sequence. This is standard behavior for gzip readers.
//
// Calling Multistream(false) disables this behavior; disabling the behavior
// can be useful when reading file formats that distinguish individual gzip
// data streams or mix gzip data streams with other data streams.
// In this mode, when the Reader reaches the end of the data stream,
// Read returns io.EOF. If the underlying reader implements io.ByteReader,
// it will be left positioned just after the gzip stream.
// To start the next stream, call z.Reset(r) followed by z.Multistream(false).
// The second call is required because Reset turns multistream mode back on.
// If there is no next stream, z.Reset(r) will return io.EOF.
func (z *Reader) Multistream(ok bool) {
	// Only the flag is recorded here; Read consults it after it has
	// verified the trailer of a stream.
	z.multistream = ok
}
    130 
    131 // GZIP (RFC 1952) is little-endian, unlike ZLIB (RFC 1950).
    132 func get4(p []byte) uint32 {
    133 	return uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
    134 }
    135 
    136 func (z *Reader) readString() (string, error) {
    137 	var err error
    138 	needconv := false
    139 	for i := 0; ; i++ {
    140 		if i >= len(z.buf) {
    141 			return "", ErrHeader
    142 		}
    143 		z.buf[i], err = z.r.ReadByte()
    144 		if err != nil {
    145 			return "", err
    146 		}
    147 		if z.buf[i] > 0x7f {
    148 			needconv = true
    149 		}
    150 		if z.buf[i] == 0 {
    151 			// GZIP (RFC 1952) specifies that strings are NUL-terminated ISO 8859-1 (Latin-1).
    152 			if needconv {
    153 				s := make([]rune, 0, i)
    154 				for _, v := range z.buf[0:i] {
    155 					s = append(s, rune(v))
    156 				}
    157 				return string(s), nil
    158 			}
    159 			return string(z.buf[0:i]), nil
    160 		}
    161 	}
    162 }
    163 
    164 func (z *Reader) read2() (uint32, error) {
    165 	_, err := io.ReadFull(z.r, z.buf[0:2])
    166 	if err != nil {
    167 		return 0, err
    168 	}
    169 	return uint32(z.buf[0]) | uint32(z.buf[1])<<8, nil
    170 }
    171 
    172 func (z *Reader) readHeader(save bool) error {
    173 	_, err := io.ReadFull(z.r, z.buf[0:10])
    174 	if err != nil {
    175 		return err
    176 	}
    177 	if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate {
    178 		return ErrHeader
    179 	}
    180 	z.flg = z.buf[3]
    181 	if save {
    182 		z.ModTime = time.Unix(int64(get4(z.buf[4:8])), 0)
    183 		// z.buf[8] is xfl, ignored
    184 		z.OS = z.buf[9]
    185 	}
    186 	z.digest.Reset()
    187 	z.digest.Write(z.buf[0:10])
    188 
    189 	if z.flg&flagExtra != 0 {
    190 		n, err := z.read2()
    191 		if err != nil {
    192 			return err
    193 		}
    194 		data := make([]byte, n)
    195 		if _, err = io.ReadFull(z.r, data); err != nil {
    196 			return err
    197 		}
    198 		if save {
    199 			z.Extra = data
    200 		}
    201 	}
    202 
    203 	var s string
    204 	if z.flg&flagName != 0 {
    205 		if s, err = z.readString(); err != nil {
    206 			return err
    207 		}
    208 		if save {
    209 			z.Name = s
    210 		}
    211 	}
    212 
    213 	if z.flg&flagComment != 0 {
    214 		if s, err = z.readString(); err != nil {
    215 			return err
    216 		}
    217 		if save {
    218 			z.Comment = s
    219 		}
    220 	}
    221 
    222 	if z.flg&flagHdrCrc != 0 {
    223 		n, err := z.read2()
    224 		if err != nil {
    225 			return err
    226 		}
    227 		sum := z.digest.Sum32() & 0xFFFF
    228 		if n != sum {
    229 			return ErrHeader
    230 		}
    231 	}
    232 
    233 	z.digest.Reset()
    234 	if z.decompressor == nil {
    235 		z.decompressor = flate.NewReader(z.r)
    236 	} else {
    237 		z.decompressor.(flate.Resetter).Reset(z.r, nil)
    238 	}
    239 	return nil
    240 }
    241 
    242 func (z *Reader) Read(p []byte) (n int, err error) {
    243 	if z.err != nil {
    244 		return 0, z.err
    245 	}
    246 	if len(p) == 0 {
    247 		return 0, nil
    248 	}
    249 
    250 	n, err = z.decompressor.Read(p)
    251 	z.digest.Write(p[0:n])
    252 	z.size += uint32(n)
    253 	if n != 0 || err != io.EOF {
    254 		z.err = err
    255 		return
    256 	}
    257 
    258 	// Finished file; check checksum + size.
    259 	if _, err := io.ReadFull(z.r, z.buf[0:8]); err != nil {
    260 		z.err = err
    261 		return 0, err
    262 	}
    263 	crc32, isize := get4(z.buf[0:4]), get4(z.buf[4:8])
    264 	sum := z.digest.Sum32()
    265 	if sum != crc32 || isize != z.size {
    266 		z.err = ErrChecksum
    267 		return 0, z.err
    268 	}
    269 
    270 	// File is ok; is there another?
    271 	if !z.multistream {
    272 		return 0, io.EOF
    273 	}
    274 
    275 	if err = z.readHeader(false); err != nil {
    276 		z.err = err
    277 		return
    278 	}
    279 
    280 	// Yes.  Reset and read from it.
    281 	z.digest.Reset()
    282 	z.size = 0
    283 	return z.Read(p)
    284 }
    285 
    286 // Close closes the Reader. It does not close the underlying io.Reader.
    287 func (z *Reader) Close() error { return z.decompressor.Close() }
    288