1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:generate go run gen.go -output fixedhuff.go 6 7 // Package flate implements the DEFLATE compressed data format, described in 8 // RFC 1951. The gzip and zlib packages implement access to DEFLATE-based file 9 // formats. 10 package flate 11 12 import ( 13 "bufio" 14 "io" 15 "strconv" 16 ) 17 18 const ( 19 maxCodeLen = 16 // max length of Huffman code 20 maxHist = 32768 // max history required 21 // The next three numbers come from the RFC section 3.2.7, with the 22 // additional proviso in section 3.2.5 which implies that distance codes 23 // 30 and 31 should never occur in compressed data. 24 maxNumLit = 286 25 maxNumDist = 30 26 numCodes = 19 // number of codes in Huffman meta-code 27 ) 28 29 // A CorruptInputError reports the presence of corrupt input at a given offset. 30 type CorruptInputError int64 31 32 func (e CorruptInputError) Error() string { 33 return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10) 34 } 35 36 // An InternalError reports an error in the flate code itself. 37 type InternalError string 38 39 func (e InternalError) Error() string { return "flate: internal error: " + string(e) } 40 41 // A ReadError reports an error encountered while reading input. 42 type ReadError struct { 43 Offset int64 // byte offset where error occurred 44 Err error // error returned by underlying Read 45 } 46 47 func (e *ReadError) Error() string { 48 return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error() 49 } 50 51 // A WriteError reports an error encountered while writing output. 52 type WriteError struct { 53 Offset int64 // byte offset where error occurred 54 Err error // error returned by underlying Write 55 } 56 57 func (e *WriteError) Error() string { 58 return "flate: write error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error() 59 } 60 61 // Resetter resets a ReadCloser returned by NewReader or NewReaderDict to 62 // to switch to a new underlying Reader. This permits reusing a ReadCloser 63 // instead of allocating a new one. 64 type Resetter interface { 65 // Reset discards any buffered data and resets the Resetter as if it was 66 // newly initialized with the given reader. 67 Reset(r io.Reader, dict []byte) error 68 } 69 70 // Note that much of the implementation of huffmanDecoder is also copied 71 // into gen.go (in package main) for the purpose of precomputing the 72 // fixed huffman tables so they can be included statically. 73 74 // The data structure for decoding Huffman tables is based on that of 75 // zlib. There is a lookup table of a fixed bit width (huffmanChunkBits), 76 // For codes smaller than the table width, there are multiple entries 77 // (each combination of trailing bits has the same value). For codes 78 // larger than the table width, the table contains a link to an overflow 79 // table. The width of each entry in the link table is the maximum code 80 // size minus the chunk width. 81 82 // Note that you can do a lookup in the table even without all bits 83 // filled. Since the extra bits are zero, and the DEFLATE Huffman codes 84 // have the property that shorter codes come before longer ones, the 85 // bit length estimate in the result is a lower bound on the actual 86 // number of bits. 87 88 // chunk & 15 is number of bits 89 // chunk >> 4 is value, including table link 90 91 const ( 92 huffmanChunkBits = 9 93 huffmanNumChunks = 1 << huffmanChunkBits 94 huffmanCountMask = 15 95 huffmanValueShift = 4 96 ) 97 98 type huffmanDecoder struct { 99 min int // the minimum code length 100 chunks [huffmanNumChunks]uint32 // chunks as described above 101 links [][]uint32 // overflow links 102 linkMask uint32 // mask the width of the link table 103 } 104 105 // Initialize Huffman decoding tables from array of code lengths. 106 // Following this function, h is guaranteed to be initialized into a complete 107 // tree (i.e., neither over-subscribed nor under-subscribed). The exception is a 108 // degenerate case where the tree has only a single symbol with length 1. Empty 109 // trees are permitted. 110 func (h *huffmanDecoder) init(bits []int) bool { 111 // Sanity enables additional runtime tests during Huffman 112 // table construction. It's intended to be used during 113 // development to supplement the currently ad-hoc unit tests. 114 const sanity = false 115 116 if h.min != 0 { 117 *h = huffmanDecoder{} 118 } 119 120 // Count number of codes of each length, 121 // compute min and max length. 122 var count [maxCodeLen]int 123 var min, max int 124 for _, n := range bits { 125 if n == 0 { 126 continue 127 } 128 if min == 0 || n < min { 129 min = n 130 } 131 if n > max { 132 max = n 133 } 134 count[n]++ 135 } 136 137 // Empty tree. The decompressor.huffSym function will fail later if the tree 138 // is used. Technically, an empty tree is only valid for the HDIST tree and 139 // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree 140 // is guaranteed to fail since it will attempt to use the tree to decode the 141 // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is 142 // guaranteed to fail later since the compressed data section must be 143 // composed of at least one symbol (the end-of-block marker). 144 if max == 0 { 145 return true 146 } 147 148 code := 0 149 var nextcode [maxCodeLen]int 150 for i := min; i <= max; i++ { 151 code <<= 1 152 nextcode[i] = code 153 code += count[i] 154 } 155 156 // Check that the coding is complete (i.e., that we've 157 // assigned all 2-to-the-max possible bit sequences). 158 // Exception: To be compatible with zlib, we also need to 159 // accept degenerate single-code codings. See also 160 // TestDegenerateHuffmanCoding. 161 if code != 1<<uint(max) && !(code == 1 && max == 1) { 162 return false 163 } 164 165 h.min = min 166 if max > huffmanChunkBits { 167 numLinks := 1 << (uint(max) - huffmanChunkBits) 168 h.linkMask = uint32(numLinks - 1) 169 170 // create link tables 171 link := nextcode[huffmanChunkBits+1] >> 1 172 h.links = make([][]uint32, huffmanNumChunks-link) 173 for j := uint(link); j < huffmanNumChunks; j++ { 174 reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8 175 reverse >>= uint(16 - huffmanChunkBits) 176 off := j - uint(link) 177 if sanity && h.chunks[reverse] != 0 { 178 panic("impossible: overwriting existing chunk") 179 } 180 h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1)) 181 h.links[off] = make([]uint32, numLinks) 182 } 183 } 184 185 for i, n := range bits { 186 if n == 0 { 187 continue 188 } 189 code := nextcode[n] 190 nextcode[n]++ 191 chunk := uint32(i<<huffmanValueShift | n) 192 reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8 193 reverse >>= uint(16 - n) 194 if n <= huffmanChunkBits { 195 for off := reverse; off < len(h.chunks); off += 1 << uint(n) { 196 // We should never need to overwrite 197 // an existing chunk. Also, 0 is 198 // never a valid chunk, because the 199 // lower 4 "count" bits should be 200 // between 1 and 15. 201 if sanity && h.chunks[off] != 0 { 202 panic("impossible: overwriting existing chunk") 203 } 204 h.chunks[off] = chunk 205 } 206 } else { 207 j := reverse & (huffmanNumChunks - 1) 208 if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 { 209 // Longer codes should have been 210 // associated with a link table above. 211 panic("impossible: not an indirect chunk") 212 } 213 value := h.chunks[j] >> huffmanValueShift 214 linktab := h.links[value] 215 reverse >>= huffmanChunkBits 216 for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) { 217 if sanity && linktab[off] != 0 { 218 panic("impossible: overwriting existing chunk") 219 } 220 linktab[off] = chunk 221 } 222 } 223 } 224 225 if sanity { 226 // Above we've sanity checked that we never overwrote 227 // an existing entry. Here we additionally check that 228 // we filled the tables completely. 229 for i, chunk := range h.chunks { 230 if chunk == 0 { 231 // As an exception, in the degenerate 232 // single-code case, we allow odd 233 // chunks to be missing. 234 if code == 1 && i%2 == 1 { 235 continue 236 } 237 panic("impossible: missing chunk") 238 } 239 } 240 for _, linktab := range h.links { 241 for _, chunk := range linktab { 242 if chunk == 0 { 243 panic("impossible: missing chunk") 244 } 245 } 246 } 247 } 248 249 return true 250 } 251 252 // The actual read interface needed by NewReader. 253 // If the passed in io.Reader does not also have ReadByte, 254 // the NewReader will introduce its own buffering. 255 type Reader interface { 256 io.Reader 257 io.ByteReader 258 } 259 260 // Decompress state. 261 type decompressor struct { 262 // Input source. 263 r Reader 264 roffset int64 265 woffset int64 266 267 // Input bits, in top of b. 268 b uint32 269 nb uint 270 271 // Huffman decoders for literal/length, distance. 272 h1, h2 huffmanDecoder 273 274 // Length arrays used to define Huffman codes. 275 bits *[maxNumLit + maxNumDist]int 276 codebits *[numCodes]int 277 278 // Output history, buffer. 279 hist *[maxHist]byte 280 hp int // current output position in buffer 281 hw int // have written hist[0:hw] already 282 hfull bool // buffer has filled at least once 283 284 // Temporary buffer (avoids repeated allocation). 285 buf [4]byte 286 287 // Next step in the decompression, 288 // and decompression state. 289 step func(*decompressor) 290 final bool 291 err error 292 toRead []byte 293 hl, hd *huffmanDecoder 294 copyLen int 295 copyDist int 296 } 297 298 func (f *decompressor) nextBlock() { 299 if f.final { 300 if f.hw != f.hp { 301 f.flush((*decompressor).nextBlock) 302 return 303 } 304 f.err = io.EOF 305 return 306 } 307 for f.nb < 1+2 { 308 if f.err = f.moreBits(); f.err != nil { 309 return 310 } 311 } 312 f.final = f.b&1 == 1 313 f.b >>= 1 314 typ := f.b & 3 315 f.b >>= 2 316 f.nb -= 1 + 2 317 switch typ { 318 case 0: 319 f.dataBlock() 320 case 1: 321 // compressed, fixed Huffman tables 322 f.hl = &fixedHuffmanDecoder 323 f.hd = nil 324 f.huffmanBlock() 325 case 2: 326 // compressed, dynamic Huffman tables 327 if f.err = f.readHuffman(); f.err != nil { 328 break 329 } 330 f.hl = &f.h1 331 f.hd = &f.h2 332 f.huffmanBlock() 333 default: 334 // 3 is reserved. 335 f.err = CorruptInputError(f.roffset) 336 } 337 } 338 339 func (f *decompressor) Read(b []byte) (int, error) { 340 for { 341 if len(f.toRead) > 0 { 342 n := copy(b, f.toRead) 343 f.toRead = f.toRead[n:] 344 return n, nil 345 } 346 if f.err != nil { 347 return 0, f.err 348 } 349 f.step(f) 350 } 351 } 352 353 func (f *decompressor) Close() error { 354 if f.err == io.EOF { 355 return nil 356 } 357 return f.err 358 } 359 360 // RFC 1951 section 3.2.7. 361 // Compression with dynamic Huffman codes 362 363 var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} 364 365 func (f *decompressor) readHuffman() error { 366 // HLIT[5], HDIST[5], HCLEN[4]. 367 for f.nb < 5+5+4 { 368 if err := f.moreBits(); err != nil { 369 return err 370 } 371 } 372 nlit := int(f.b&0x1F) + 257 373 if nlit > maxNumLit { 374 return CorruptInputError(f.roffset) 375 } 376 f.b >>= 5 377 ndist := int(f.b&0x1F) + 1 378 if ndist > maxNumDist { 379 return CorruptInputError(f.roffset) 380 } 381 f.b >>= 5 382 nclen := int(f.b&0xF) + 4 383 // numCodes is 19, so nclen is always valid. 384 f.b >>= 4 385 f.nb -= 5 + 5 + 4 386 387 // (HCLEN+4)*3 bits: code lengths in the magic codeOrder order. 388 for i := 0; i < nclen; i++ { 389 for f.nb < 3 { 390 if err := f.moreBits(); err != nil { 391 return err 392 } 393 } 394 f.codebits[codeOrder[i]] = int(f.b & 0x7) 395 f.b >>= 3 396 f.nb -= 3 397 } 398 for i := nclen; i < len(codeOrder); i++ { 399 f.codebits[codeOrder[i]] = 0 400 } 401 if !f.h1.init(f.codebits[0:]) { 402 return CorruptInputError(f.roffset) 403 } 404 405 // HLIT + 257 code lengths, HDIST + 1 code lengths, 406 // using the code length Huffman code. 407 for i, n := 0, nlit+ndist; i < n; { 408 x, err := f.huffSym(&f.h1) 409 if err != nil { 410 return err 411 } 412 if x < 16 { 413 // Actual length. 414 f.bits[i] = x 415 i++ 416 continue 417 } 418 // Repeat previous length or zero. 419 var rep int 420 var nb uint 421 var b int 422 switch x { 423 default: 424 return InternalError("unexpected length code") 425 case 16: 426 rep = 3 427 nb = 2 428 if i == 0 { 429 return CorruptInputError(f.roffset) 430 } 431 b = f.bits[i-1] 432 case 17: 433 rep = 3 434 nb = 3 435 b = 0 436 case 18: 437 rep = 11 438 nb = 7 439 b = 0 440 } 441 for f.nb < nb { 442 if err := f.moreBits(); err != nil { 443 return err 444 } 445 } 446 rep += int(f.b & uint32(1<<nb-1)) 447 f.b >>= nb 448 f.nb -= nb 449 if i+rep > n { 450 return CorruptInputError(f.roffset) 451 } 452 for j := 0; j < rep; j++ { 453 f.bits[i] = b 454 i++ 455 } 456 } 457 458 if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) { 459 return CorruptInputError(f.roffset) 460 } 461 462 return nil 463 } 464 465 // Decode a single Huffman block from f. 466 // hl and hd are the Huffman states for the lit/length values 467 // and the distance values, respectively. If hd == nil, using the 468 // fixed distance encoding associated with fixed Huffman blocks. 469 func (f *decompressor) huffmanBlock() { 470 for { 471 v, err := f.huffSym(f.hl) 472 if err != nil { 473 f.err = err 474 return 475 } 476 var n uint // number of bits extra 477 var length int 478 switch { 479 case v < 256: 480 f.hist[f.hp] = byte(v) 481 f.hp++ 482 if f.hp == len(f.hist) { 483 // After the flush, continue this loop. 484 f.flush((*decompressor).huffmanBlock) 485 return 486 } 487 continue 488 case v == 256: 489 // Done with huffman block; read next block. 490 f.step = (*decompressor).nextBlock 491 return 492 // otherwise, reference to older data 493 case v < 265: 494 length = v - (257 - 3) 495 n = 0 496 case v < 269: 497 length = v*2 - (265*2 - 11) 498 n = 1 499 case v < 273: 500 length = v*4 - (269*4 - 19) 501 n = 2 502 case v < 277: 503 length = v*8 - (273*8 - 35) 504 n = 3 505 case v < 281: 506 length = v*16 - (277*16 - 67) 507 n = 4 508 case v < 285: 509 length = v*32 - (281*32 - 131) 510 n = 5 511 case v < maxNumLit: 512 length = 258 513 n = 0 514 default: 515 f.err = CorruptInputError(f.roffset) 516 return 517 } 518 if n > 0 { 519 for f.nb < n { 520 if err = f.moreBits(); err != nil { 521 f.err = err 522 return 523 } 524 } 525 length += int(f.b & uint32(1<<n-1)) 526 f.b >>= n 527 f.nb -= n 528 } 529 530 var dist int 531 if f.hd == nil { 532 for f.nb < 5 { 533 if err = f.moreBits(); err != nil { 534 f.err = err 535 return 536 } 537 } 538 dist = int(reverseByte[(f.b&0x1F)<<3]) 539 f.b >>= 5 540 f.nb -= 5 541 } else { 542 if dist, err = f.huffSym(f.hd); err != nil { 543 f.err = err 544 return 545 } 546 } 547 548 switch { 549 case dist < 4: 550 dist++ 551 case dist < maxNumDist: 552 nb := uint(dist-2) >> 1 553 // have 1 bit in bottom of dist, need nb more. 554 extra := (dist & 1) << nb 555 for f.nb < nb { 556 if err = f.moreBits(); err != nil { 557 f.err = err 558 return 559 } 560 } 561 extra |= int(f.b & uint32(1<<nb-1)) 562 f.b >>= nb 563 f.nb -= nb 564 dist = 1<<(nb+1) + 1 + extra 565 default: 566 f.err = CorruptInputError(f.roffset) 567 return 568 } 569 570 // Copy history[-dist:-dist+length] into output. 571 if dist > len(f.hist) { 572 f.err = InternalError("bad history distance") 573 return 574 } 575 576 // No check on length; encoding can be prescient. 577 if !f.hfull && dist > f.hp { 578 f.err = CorruptInputError(f.roffset) 579 return 580 } 581 582 f.copyLen, f.copyDist = length, dist 583 if f.copyHist() { 584 return 585 } 586 } 587 } 588 589 // copyHist copies f.copyLen bytes from f.hist (f.copyDist bytes ago) to itself. 590 // It reports whether the f.hist buffer is full. 591 func (f *decompressor) copyHist() bool { 592 p := f.hp - f.copyDist 593 if p < 0 { 594 p += len(f.hist) 595 } 596 for f.copyLen > 0 { 597 n := f.copyLen 598 if x := len(f.hist) - f.hp; n > x { 599 n = x 600 } 601 if x := len(f.hist) - p; n > x { 602 n = x 603 } 604 forwardCopy(f.hist[:], f.hp, p, n) 605 p += n 606 f.hp += n 607 f.copyLen -= n 608 if f.hp == len(f.hist) { 609 // After flush continue copying out of history. 610 f.flush((*decompressor).copyHuff) 611 return true 612 } 613 if p == len(f.hist) { 614 p = 0 615 } 616 } 617 return false 618 } 619 620 func (f *decompressor) copyHuff() { 621 if f.copyHist() { 622 return 623 } 624 f.huffmanBlock() 625 } 626 627 // Copy a single uncompressed data block from input to output. 628 func (f *decompressor) dataBlock() { 629 // Uncompressed. 630 // Discard current half-byte. 631 f.nb = 0 632 f.b = 0 633 634 // Length then ones-complement of length. 635 nr, err := io.ReadFull(f.r, f.buf[0:4]) 636 f.roffset += int64(nr) 637 if err != nil { 638 f.err = &ReadError{f.roffset, err} 639 return 640 } 641 n := int(f.buf[0]) | int(f.buf[1])<<8 642 nn := int(f.buf[2]) | int(f.buf[3])<<8 643 if uint16(nn) != uint16(^n) { 644 f.err = CorruptInputError(f.roffset) 645 return 646 } 647 648 if n == 0 { 649 // 0-length block means sync 650 f.flush((*decompressor).nextBlock) 651 return 652 } 653 654 f.copyLen = n 655 f.copyData() 656 } 657 658 // copyData copies f.copyLen bytes from the underlying reader into f.hist. 659 // It pauses for reads when f.hist is full. 660 func (f *decompressor) copyData() { 661 n := f.copyLen 662 for n > 0 { 663 m := len(f.hist) - f.hp 664 if m > n { 665 m = n 666 } 667 m, err := io.ReadFull(f.r, f.hist[f.hp:f.hp+m]) 668 f.roffset += int64(m) 669 if err != nil { 670 f.err = &ReadError{f.roffset, err} 671 return 672 } 673 n -= m 674 f.hp += m 675 if f.hp == len(f.hist) { 676 f.copyLen = n 677 f.flush((*decompressor).copyData) 678 return 679 } 680 } 681 f.step = (*decompressor).nextBlock 682 } 683 684 func (f *decompressor) setDict(dict []byte) { 685 if len(dict) > len(f.hist) { 686 // Will only remember the tail. 687 dict = dict[len(dict)-len(f.hist):] 688 } 689 690 f.hp = copy(f.hist[:], dict) 691 if f.hp == len(f.hist) { 692 f.hp = 0 693 f.hfull = true 694 } 695 f.hw = f.hp 696 } 697 698 func (f *decompressor) moreBits() error { 699 c, err := f.r.ReadByte() 700 if err != nil { 701 if err == io.EOF { 702 err = io.ErrUnexpectedEOF 703 } 704 return err 705 } 706 f.roffset++ 707 f.b |= uint32(c) << f.nb 708 f.nb += 8 709 return nil 710 } 711 712 // Read the next Huffman-encoded symbol from f according to h. 713 func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) { 714 // Since a huffmanDecoder can be empty or be composed of a degenerate tree 715 // with single element, huffSym must error on these two edge cases. In both 716 // cases, the chunks slice will be 0 for the invalid sequence, leading it 717 // satisfy the n == 0 check below. 718 n := uint(h.min) 719 for { 720 for f.nb < n { 721 if err := f.moreBits(); err != nil { 722 return 0, err 723 } 724 } 725 chunk := h.chunks[f.b&(huffmanNumChunks-1)] 726 n = uint(chunk & huffmanCountMask) 727 if n > huffmanChunkBits { 728 chunk = h.links[chunk>>huffmanValueShift][(f.b>>huffmanChunkBits)&h.linkMask] 729 n = uint(chunk & huffmanCountMask) 730 } 731 if n <= f.nb { 732 if n == 0 { 733 f.err = CorruptInputError(f.roffset) 734 return 0, f.err 735 } 736 f.b >>= n 737 f.nb -= n 738 return int(chunk >> huffmanValueShift), nil 739 } 740 } 741 } 742 743 // Flush any buffered output to the underlying writer. 744 func (f *decompressor) flush(step func(*decompressor)) { 745 f.toRead = f.hist[f.hw:f.hp] 746 f.woffset += int64(f.hp - f.hw) 747 f.hw = f.hp 748 if f.hp == len(f.hist) { 749 f.hp = 0 750 f.hw = 0 751 f.hfull = true 752 } 753 f.step = step 754 } 755 756 func makeReader(r io.Reader) Reader { 757 if rr, ok := r.(Reader); ok { 758 return rr 759 } 760 return bufio.NewReader(r) 761 } 762 763 func (f *decompressor) Reset(r io.Reader, dict []byte) error { 764 *f = decompressor{ 765 r: makeReader(r), 766 bits: f.bits, 767 codebits: f.codebits, 768 hist: f.hist, 769 step: (*decompressor).nextBlock, 770 } 771 if dict != nil { 772 f.setDict(dict) 773 } 774 return nil 775 } 776 777 // NewReader returns a new ReadCloser that can be used 778 // to read the uncompressed version of r. 779 // If r does not also implement io.ByteReader, 780 // the decompressor may read more data than necessary from r. 781 // It is the caller's responsibility to call Close on the ReadCloser 782 // when finished reading. 783 // 784 // The ReadCloser returned by NewReader also implements Resetter. 785 func NewReader(r io.Reader) io.ReadCloser { 786 var f decompressor 787 f.bits = new([maxNumLit + maxNumDist]int) 788 f.codebits = new([numCodes]int) 789 f.r = makeReader(r) 790 f.hist = new([maxHist]byte) 791 f.step = (*decompressor).nextBlock 792 return &f 793 } 794 795 // NewReaderDict is like NewReader but initializes the reader 796 // with a preset dictionary. The returned Reader behaves as if 797 // the uncompressed data stream started with the given dictionary, 798 // which has already been read. NewReaderDict is typically used 799 // to read data compressed by NewWriterDict. 800 // 801 // The ReadCloser returned by NewReader also implements Resetter. 802 func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser { 803 var f decompressor 804 f.r = makeReader(r) 805 f.hist = new([maxHist]byte) 806 f.bits = new([maxNumLit + maxNumDist]int) 807 f.codebits = new([numCodes]int) 808 f.step = (*decompressor).nextBlock 809 f.setDict(dict) 810 return &f 811 } 812