1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package gzip implements reading and writing of gzip format compressed files, 6 // as specified in RFC 1952. 7 package gzip 8 9 import ( 10 "bufio" 11 "compress/flate" 12 "encoding/binary" 13 "errors" 14 "hash/crc32" 15 "io" 16 "time" 17 ) 18 19 const ( 20 gzipID1 = 0x1f 21 gzipID2 = 0x8b 22 gzipDeflate = 8 23 flagText = 1 << 0 24 flagHdrCrc = 1 << 1 25 flagExtra = 1 << 2 26 flagName = 1 << 3 27 flagComment = 1 << 4 28 ) 29 30 var ( 31 // ErrChecksum is returned when reading GZIP data that has an invalid checksum. 32 ErrChecksum = errors.New("gzip: invalid checksum") 33 // ErrHeader is returned when reading GZIP data that has an invalid header. 34 ErrHeader = errors.New("gzip: invalid header") 35 ) 36 37 var le = binary.LittleEndian 38 39 // noEOF converts io.EOF to io.ErrUnexpectedEOF. 40 func noEOF(err error) error { 41 if err == io.EOF { 42 return io.ErrUnexpectedEOF 43 } 44 return err 45 } 46 47 // The gzip file stores a header giving metadata about the compressed file. 48 // That header is exposed as the fields of the Writer and Reader structs. 49 // 50 // Strings must be UTF-8 encoded and may only contain Unicode code points 51 // U+0001 through U+00FF, due to limitations of the GZIP file format. 52 type Header struct { 53 Comment string // comment 54 Extra []byte // "extra data" 55 ModTime time.Time // modification time 56 Name string // file name 57 OS byte // operating system type 58 } 59 60 // A Reader is an io.Reader that can be read to retrieve 61 // uncompressed data from a gzip-format compressed file. 62 // 63 // In general, a gzip file can be a concatenation of gzip files, 64 // each with its own header. Reads from the Reader 65 // return the concatenation of the uncompressed data of each. 66 // Only the first header is recorded in the Reader fields. 67 // 68 // Gzip files store a length and checksum of the uncompressed data. 69 // The Reader will return a ErrChecksum when Read 70 // reaches the end of the uncompressed data if it does not 71 // have the expected length or checksum. Clients should treat data 72 // returned by Read as tentative until they receive the io.EOF 73 // marking the end of the data. 74 type Reader struct { 75 Header // valid after NewReader or Reader.Reset 76 r flate.Reader 77 decompressor io.ReadCloser 78 digest uint32 // CRC-32, IEEE polynomial (section 8) 79 size uint32 // Uncompressed size (section 2.3.1) 80 buf [512]byte 81 err error 82 multistream bool 83 } 84 85 // NewReader creates a new Reader reading the given reader. 86 // If r does not also implement io.ByteReader, 87 // the decompressor may read more data than necessary from r. 88 // 89 // It is the caller's responsibility to call Close on the Reader when done. 90 // 91 // The Reader.Header fields will be valid in the Reader returned. 92 func NewReader(r io.Reader) (*Reader, error) { 93 z := new(Reader) 94 if err := z.Reset(r); err != nil { 95 return nil, err 96 } 97 return z, nil 98 } 99 100 // Reset discards the Reader z's state and makes it equivalent to the 101 // result of its original state from NewReader, but reading from r instead. 102 // This permits reusing a Reader rather than allocating a new one. 103 func (z *Reader) Reset(r io.Reader) error { 104 *z = Reader{ 105 decompressor: z.decompressor, 106 multistream: true, 107 } 108 if rr, ok := r.(flate.Reader); ok { 109 z.r = rr 110 } else { 111 z.r = bufio.NewReader(r) 112 } 113 z.Header, z.err = z.readHeader() 114 return z.err 115 } 116 117 // Multistream controls whether the reader supports multistream files. 118 // 119 // If enabled (the default), the Reader expects the input to be a sequence 120 // of individually gzipped data streams, each with its own header and 121 // trailer, ending at EOF. The effect is that the concatenation of a sequence 122 // of gzipped files is treated as equivalent to the gzip of the concatenation 123 // of the sequence. This is standard behavior for gzip readers. 124 // 125 // Calling Multistream(false) disables this behavior; disabling the behavior 126 // can be useful when reading file formats that distinguish individual gzip 127 // data streams or mix gzip data streams with other data streams. 128 // In this mode, when the Reader reaches the end of the data stream, 129 // Read returns io.EOF. If the underlying reader implements io.ByteReader, 130 // it will be left positioned just after the gzip stream. 131 // To start the next stream, call z.Reset(r) followed by z.Multistream(false). 132 // If there is no next stream, z.Reset(r) will return io.EOF. 133 func (z *Reader) Multistream(ok bool) { 134 z.multistream = ok 135 } 136 137 // readString reads a NUL-terminated string from z.r. 138 // It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and 139 // will output a string encoded using UTF-8. 140 // This method always updates z.digest with the data read. 141 func (z *Reader) readString() (string, error) { 142 var err error 143 needConv := false 144 for i := 0; ; i++ { 145 if i >= len(z.buf) { 146 return "", ErrHeader 147 } 148 z.buf[i], err = z.r.ReadByte() 149 if err != nil { 150 return "", err 151 } 152 if z.buf[i] > 0x7f { 153 needConv = true 154 } 155 if z.buf[i] == 0 { 156 // Digest covers the NUL terminator. 157 z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:i+1]) 158 159 // Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1). 160 if needConv { 161 s := make([]rune, 0, i) 162 for _, v := range z.buf[:i] { 163 s = append(s, rune(v)) 164 } 165 return string(s), nil 166 } 167 return string(z.buf[:i]), nil 168 } 169 } 170 } 171 172 // readHeader reads the GZIP header according to section 2.3.1. 173 // This method does not set z.err. 174 func (z *Reader) readHeader() (hdr Header, err error) { 175 if _, err = io.ReadFull(z.r, z.buf[:10]); err != nil { 176 // RFC 1952, section 2.2, says the following: 177 // A gzip file consists of a series of "members" (compressed data sets). 178 // 179 // Other than this, the specification does not clarify whether a 180 // "series" is defined as "one or more" or "zero or more". To err on the 181 // side of caution, Go interprets this to mean "zero or more". 182 // Thus, it is okay to return io.EOF here. 183 return hdr, err 184 } 185 if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate { 186 return hdr, ErrHeader 187 } 188 flg := z.buf[3] 189 if t := int64(le.Uint32(z.buf[4:8])); t > 0 { 190 // Section 2.3.1, the zero value for MTIME means that the 191 // modified time is not set. 192 hdr.ModTime = time.Unix(t, 0) 193 } 194 // z.buf[8] is XFL and is currently ignored. 195 hdr.OS = z.buf[9] 196 z.digest = crc32.ChecksumIEEE(z.buf[:10]) 197 198 if flg&flagExtra != 0 { 199 if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil { 200 return hdr, noEOF(err) 201 } 202 z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:2]) 203 data := make([]byte, le.Uint16(z.buf[:2])) 204 if _, err = io.ReadFull(z.r, data); err != nil { 205 return hdr, noEOF(err) 206 } 207 z.digest = crc32.Update(z.digest, crc32.IEEETable, data) 208 hdr.Extra = data 209 } 210 211 var s string 212 if flg&flagName != 0 { 213 if s, err = z.readString(); err != nil { 214 return hdr, err 215 } 216 hdr.Name = s 217 } 218 219 if flg&flagComment != 0 { 220 if s, err = z.readString(); err != nil { 221 return hdr, err 222 } 223 hdr.Comment = s 224 } 225 226 if flg&flagHdrCrc != 0 { 227 if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil { 228 return hdr, noEOF(err) 229 } 230 digest := le.Uint16(z.buf[:2]) 231 if digest != uint16(z.digest) { 232 return hdr, ErrHeader 233 } 234 } 235 236 z.digest = 0 237 if z.decompressor == nil { 238 z.decompressor = flate.NewReader(z.r) 239 } else { 240 z.decompressor.(flate.Resetter).Reset(z.r, nil) 241 } 242 return hdr, nil 243 } 244 245 // Read implements io.Reader, reading uncompressed bytes from its underlying Reader. 246 func (z *Reader) Read(p []byte) (n int, err error) { 247 if z.err != nil { 248 return 0, z.err 249 } 250 251 n, z.err = z.decompressor.Read(p) 252 z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n]) 253 z.size += uint32(n) 254 if z.err != io.EOF { 255 // In the normal case we return here. 256 return n, z.err 257 } 258 259 // Finished file; check checksum and size. 260 if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil { 261 z.err = noEOF(err) 262 return n, z.err 263 } 264 digest := le.Uint32(z.buf[:4]) 265 size := le.Uint32(z.buf[4:8]) 266 if digest != z.digest || size != z.size { 267 z.err = ErrChecksum 268 return n, z.err 269 } 270 z.digest, z.size = 0, 0 271 272 // File is ok; check if there is another. 273 if !z.multistream { 274 return n, io.EOF 275 } 276 z.err = nil // Remove io.EOF 277 278 if _, z.err = z.readHeader(); z.err != nil { 279 return n, z.err 280 } 281 282 // Read from next file, if necessary. 283 if n > 0 { 284 return n, nil 285 } 286 return z.Read(p) 287 } 288 289 // Close closes the Reader. It does not close the underlying io.Reader. 290 // In order for the GZIP checksum to be verified, the reader must be 291 // fully consumed until the io.EOF. 292 func (z *Reader) Close() error { return z.decompressor.Close() } 293