Home | History | Annotate | Download | only in bzip2
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package bzip2 implements bzip2 decompression.
      6 package bzip2
      7 
      8 import "io"
      9 
     10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
     11 // of guessing: http://en.wikipedia.org/wiki/Bzip2
     12 // The source code to pyflate was useful for debugging:
     13 // http://www.paul.sladen.org/projects/pyflate
     14 
     15 // A StructuralError is returned when the bzip2 data is found to be
     16 // syntactically invalid.
     17 type StructuralError string
     18 
     19 func (s StructuralError) Error() string {
     20 	return "bzip2 data invalid: " + string(s)
     21 }
     22 
     23 // A reader decompresses bzip2 compressed data.
     24 type reader struct {
     25 	br           bitReader
     26 	fileCRC      uint32
     27 	blockCRC     uint32
     28 	wantBlockCRC uint32
     29 	setupDone    bool // true if we have parsed the bzip2 header.
     30 	blockSize    int  // blockSize in bytes, i.e. 900 * 1024.
     31 	eof          bool
     32 	buf          []byte    // stores Burrows-Wheeler transformed data.
     33 	c            [256]uint // the `C' array for the inverse BWT.
     34 	tt           []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
     35 	tPos         uint32    // Index of the next output byte in tt.
     36 
     37 	preRLE      []uint32 // contains the RLE data still to be processed.
     38 	preRLEUsed  int      // number of entries of preRLE used.
     39 	lastByte    int      // the last byte value seen.
     40 	byteRepeats uint     // the number of repeats of lastByte seen.
     41 	repeats     uint     // the number of copies of lastByte to output.
     42 }
     43 
     44 // NewReader returns an io.Reader which decompresses bzip2 data from r.
     45 // If r does not also implement io.ByteReader,
     46 // the decompressor may read more data than necessary from r.
     47 func NewReader(r io.Reader) io.Reader {
     48 	bz2 := new(reader)
     49 	bz2.br = newBitReader(r)
     50 	return bz2
     51 }
     52 
     53 const bzip2FileMagic = 0x425a // "BZ"
     54 const bzip2BlockMagic = 0x314159265359
     55 const bzip2FinalMagic = 0x177245385090
     56 
     57 // setup parses the bzip2 header.
     58 func (bz2 *reader) setup(needMagic bool) error {
     59 	br := &bz2.br
     60 
     61 	if needMagic {
     62 		magic := br.ReadBits(16)
     63 		if magic != bzip2FileMagic {
     64 			return StructuralError("bad magic value")
     65 		}
     66 	}
     67 
     68 	t := br.ReadBits(8)
     69 	if t != 'h' {
     70 		return StructuralError("non-Huffman entropy encoding")
     71 	}
     72 
     73 	level := br.ReadBits(8)
     74 	if level < '1' || level > '9' {
     75 		return StructuralError("invalid compression level")
     76 	}
     77 
     78 	bz2.fileCRC = 0
     79 	bz2.blockSize = 100 * 1024 * (int(level) - '0')
     80 	if bz2.blockSize > len(bz2.tt) {
     81 		bz2.tt = make([]uint32, bz2.blockSize)
     82 	}
     83 	return nil
     84 }
     85 
     86 func (bz2 *reader) Read(buf []byte) (n int, err error) {
     87 	if bz2.eof {
     88 		return 0, io.EOF
     89 	}
     90 
     91 	if !bz2.setupDone {
     92 		err = bz2.setup(true)
     93 		brErr := bz2.br.Err()
     94 		if brErr != nil {
     95 			err = brErr
     96 		}
     97 		if err != nil {
     98 			return 0, err
     99 		}
    100 		bz2.setupDone = true
    101 	}
    102 
    103 	n, err = bz2.read(buf)
    104 	brErr := bz2.br.Err()
    105 	if brErr != nil {
    106 		err = brErr
    107 	}
    108 	return
    109 }
    110 
    111 func (bz2 *reader) readFromBlock(buf []byte) int {
    112 	// bzip2 is a block based compressor, except that it has a run-length
    113 	// preprocessing step. The block based nature means that we can
    114 	// preallocate fixed-size buffers and reuse them. However, the RLE
    115 	// preprocessing would require allocating huge buffers to store the
    116 	// maximum expansion. Thus we process blocks all at once, except for
    117 	// the RLE which we decompress as required.
    118 	n := 0
    119 	for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
    120 		// We have RLE data pending.
    121 
    122 		// The run-length encoding works like this:
    123 		// Any sequence of four equal bytes is followed by a length
    124 		// byte which contains the number of repeats of that byte to
    125 		// include. (The number of repeats can be zero.) Because we are
    126 		// decompressing on-demand our state is kept in the reader
    127 		// object.
    128 
    129 		if bz2.repeats > 0 {
    130 			buf[n] = byte(bz2.lastByte)
    131 			n++
    132 			bz2.repeats--
    133 			if bz2.repeats == 0 {
    134 				bz2.lastByte = -1
    135 			}
    136 			continue
    137 		}
    138 
    139 		bz2.tPos = bz2.preRLE[bz2.tPos]
    140 		b := byte(bz2.tPos)
    141 		bz2.tPos >>= 8
    142 		bz2.preRLEUsed++
    143 
    144 		if bz2.byteRepeats == 3 {
    145 			bz2.repeats = uint(b)
    146 			bz2.byteRepeats = 0
    147 			continue
    148 		}
    149 
    150 		if bz2.lastByte == int(b) {
    151 			bz2.byteRepeats++
    152 		} else {
    153 			bz2.byteRepeats = 0
    154 		}
    155 		bz2.lastByte = int(b)
    156 
    157 		buf[n] = b
    158 		n++
    159 	}
    160 
    161 	return n
    162 }
    163 
    164 func (bz2 *reader) read(buf []byte) (int, error) {
    165 	for {
    166 		n := bz2.readFromBlock(buf)
    167 		if n > 0 {
    168 			bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
    169 			return n, nil
    170 		}
    171 
    172 		// End of block. Check CRC.
    173 		if bz2.blockCRC != bz2.wantBlockCRC {
    174 			bz2.br.err = StructuralError("block checksum mismatch")
    175 			return 0, bz2.br.err
    176 		}
    177 
    178 		// Find next block.
    179 		br := &bz2.br
    180 		switch br.ReadBits64(48) {
    181 		default:
    182 			return 0, StructuralError("bad magic value found")
    183 
    184 		case bzip2BlockMagic:
    185 			// Start of block.
    186 			err := bz2.readBlock()
    187 			if err != nil {
    188 				return 0, err
    189 			}
    190 
    191 		case bzip2FinalMagic:
    192 			// Check end-of-file CRC.
    193 			wantFileCRC := uint32(br.ReadBits64(32))
    194 			if br.err != nil {
    195 				return 0, br.err
    196 			}
    197 			if bz2.fileCRC != wantFileCRC {
    198 				br.err = StructuralError("file checksum mismatch")
    199 				return 0, br.err
    200 			}
    201 
    202 			// Skip ahead to byte boundary.
    203 			// Is there a file concatenated to this one?
    204 			// It would start with BZ.
    205 			if br.bits%8 != 0 {
    206 				br.ReadBits(br.bits % 8)
    207 			}
    208 			b, err := br.r.ReadByte()
    209 			if err == io.EOF {
    210 				br.err = io.EOF
    211 				bz2.eof = true
    212 				return 0, io.EOF
    213 			}
    214 			if err != nil {
    215 				br.err = err
    216 				return 0, err
    217 			}
    218 			z, err := br.r.ReadByte()
    219 			if err != nil {
    220 				if err == io.EOF {
    221 					err = io.ErrUnexpectedEOF
    222 				}
    223 				br.err = err
    224 				return 0, err
    225 			}
    226 			if b != 'B' || z != 'Z' {
    227 				return 0, StructuralError("bad magic value in continuation file")
    228 			}
    229 			if err := bz2.setup(false); err != nil {
    230 				return 0, err
    231 			}
    232 		}
    233 	}
    234 }
    235 
    236 // readBlock reads a bzip2 block. The magic number should already have been consumed.
    237 func (bz2 *reader) readBlock() (err error) {
    238 	br := &bz2.br
    239 	bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
    240 	bz2.blockCRC = 0
    241 	bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
    242 	randomized := br.ReadBits(1)
    243 	if randomized != 0 {
    244 		return StructuralError("deprecated randomized files")
    245 	}
    246 	origPtr := uint(br.ReadBits(24))
    247 
    248 	// If not every byte value is used in the block (i.e., it's text) then
    249 	// the symbol set is reduced. The symbols used are stored as a
    250 	// two-level, 16x16 bitmap.
    251 	symbolRangeUsedBitmap := br.ReadBits(16)
    252 	symbolPresent := make([]bool, 256)
    253 	numSymbols := 0
    254 	for symRange := uint(0); symRange < 16; symRange++ {
    255 		if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
    256 			bits := br.ReadBits(16)
    257 			for symbol := uint(0); symbol < 16; symbol++ {
    258 				if bits&(1<<(15-symbol)) != 0 {
    259 					symbolPresent[16*symRange+symbol] = true
    260 					numSymbols++
    261 				}
    262 			}
    263 		}
    264 	}
    265 
    266 	if numSymbols == 0 {
    267 		// There must be an EOF symbol.
    268 		return StructuralError("no symbols in input")
    269 	}
    270 
    271 	// A block uses between two and six different Huffman trees.
    272 	numHuffmanTrees := br.ReadBits(3)
    273 	if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
    274 		return StructuralError("invalid number of Huffman trees")
    275 	}
    276 
    277 	// The Huffman tree can switch every 50 symbols so there's a list of
    278 	// tree indexes telling us which tree to use for each 50 symbol block.
    279 	numSelectors := br.ReadBits(15)
    280 	treeIndexes := make([]uint8, numSelectors)
    281 
    282 	// The tree indexes are move-to-front transformed and stored as unary
    283 	// numbers.
    284 	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
    285 	for i := range treeIndexes {
    286 		c := 0
    287 		for {
    288 			inc := br.ReadBits(1)
    289 			if inc == 0 {
    290 				break
    291 			}
    292 			c++
    293 		}
    294 		if c >= numHuffmanTrees {
    295 			return StructuralError("tree index too large")
    296 		}
    297 		treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c))
    298 	}
    299 
    300 	// The list of symbols for the move-to-front transform is taken from
    301 	// the previously decoded symbol bitmap.
    302 	symbols := make([]byte, numSymbols)
    303 	nextSymbol := 0
    304 	for i := 0; i < 256; i++ {
    305 		if symbolPresent[i] {
    306 			symbols[nextSymbol] = byte(i)
    307 			nextSymbol++
    308 		}
    309 	}
    310 	mtf := newMTFDecoder(symbols)
    311 
    312 	numSymbols += 2 // to account for RUNA and RUNB symbols
    313 	huffmanTrees := make([]huffmanTree, numHuffmanTrees)
    314 
    315 	// Now we decode the arrays of code-lengths for each tree.
    316 	lengths := make([]uint8, numSymbols)
    317 	for i := range huffmanTrees {
    318 		// The code lengths are delta encoded from a 5-bit base value.
    319 		length := br.ReadBits(5)
    320 		for j := range lengths {
    321 			for {
    322 				if !br.ReadBit() {
    323 					break
    324 				}
    325 				if br.ReadBit() {
    326 					length--
    327 				} else {
    328 					length++
    329 				}
    330 			}
    331 			if length < 0 || length > 20 {
    332 				return StructuralError("Huffman length out of range")
    333 			}
    334 			lengths[j] = uint8(length)
    335 		}
    336 		huffmanTrees[i], err = newHuffmanTree(lengths)
    337 		if err != nil {
    338 			return err
    339 		}
    340 	}
    341 
    342 	selectorIndex := 1 // the next tree index to use
    343 	if len(treeIndexes) == 0 {
    344 		return StructuralError("no tree selectors given")
    345 	}
    346 	if int(treeIndexes[0]) >= len(huffmanTrees) {
    347 		return StructuralError("tree selector out of range")
    348 	}
    349 	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
    350 	bufIndex := 0 // indexes bz2.buf, the output buffer.
    351 	// The output of the move-to-front transform is run-length encoded and
    352 	// we merge the decoding into the Huffman parsing loop. These two
    353 	// variables accumulate the repeat count. See the Wikipedia page for
    354 	// details.
    355 	repeat := 0
    356 	repeatPower := 0
    357 
    358 	// The `C' array (used by the inverse BWT) needs to be zero initialized.
    359 	for i := range bz2.c {
    360 		bz2.c[i] = 0
    361 	}
    362 
    363 	decoded := 0 // counts the number of symbols decoded by the current tree.
    364 	for {
    365 		if decoded == 50 {
    366 			if selectorIndex >= numSelectors {
    367 				return StructuralError("insufficient selector indices for number of symbols")
    368 			}
    369 			if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
    370 				return StructuralError("tree selector out of range")
    371 			}
    372 			currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
    373 			selectorIndex++
    374 			decoded = 0
    375 		}
    376 
    377 		v := currentHuffmanTree.Decode(br)
    378 		decoded++
    379 
    380 		if v < 2 {
    381 			// This is either the RUNA or RUNB symbol.
    382 			if repeat == 0 {
    383 				repeatPower = 1
    384 			}
    385 			repeat += repeatPower << v
    386 			repeatPower <<= 1
    387 
    388 			// This limit of 2 million comes from the bzip2 source
    389 			// code. It prevents repeat from overflowing.
    390 			if repeat > 2*1024*1024 {
    391 				return StructuralError("repeat count too large")
    392 			}
    393 			continue
    394 		}
    395 
    396 		if repeat > 0 {
    397 			// We have decoded a complete run-length so we need to
    398 			// replicate the last output symbol.
    399 			if repeat > bz2.blockSize-bufIndex {
    400 				return StructuralError("repeats past end of block")
    401 			}
    402 			for i := 0; i < repeat; i++ {
    403 				b := byte(mtf.First())
    404 				bz2.tt[bufIndex] = uint32(b)
    405 				bz2.c[b]++
    406 				bufIndex++
    407 			}
    408 			repeat = 0
    409 		}
    410 
    411 		if int(v) == numSymbols-1 {
    412 			// This is the EOF symbol. Because it's always at the
    413 			// end of the move-to-front list, and never gets moved
    414 			// to the front, it has this unique value.
    415 			break
    416 		}
    417 
    418 		// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
    419 		// one would expect |v-2| to be passed to the MTF decoder.
    420 		// However, the front of the MTF list is never referenced as 0,
    421 		// it's always referenced with a run-length of 1. Thus 0
    422 		// doesn't need to be encoded and we have |v-1| in the next
    423 		// line.
    424 		b := byte(mtf.Decode(int(v - 1)))
    425 		if bufIndex >= bz2.blockSize {
    426 			return StructuralError("data exceeds block size")
    427 		}
    428 		bz2.tt[bufIndex] = uint32(b)
    429 		bz2.c[b]++
    430 		bufIndex++
    431 	}
    432 
    433 	if origPtr >= uint(bufIndex) {
    434 		return StructuralError("origPtr out of bounds")
    435 	}
    436 
    437 	// We have completed the entropy decoding. Now we can perform the
    438 	// inverse BWT and setup the RLE buffer.
    439 	bz2.preRLE = bz2.tt[:bufIndex]
    440 	bz2.preRLEUsed = 0
    441 	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
    442 	bz2.lastByte = -1
    443 	bz2.byteRepeats = 0
    444 	bz2.repeats = 0
    445 
    446 	return nil
    447 }
    448 
    449 // inverseBWT implements the inverse Burrows-Wheeler transform as described in
    450 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
    451 // In that document, origPtr is called `I' and c is the `C' array after the
    452 // first pass over the data. It's an argument here because we merge the first
    453 // pass with the Huffman decoding.
    454 //
    455 // This also implements the `single array' method from the bzip2 source code
    456 // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
    457 // index of the next byte in the top 24-bits. The index of the first byte is
    458 // returned.
    459 func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
    460 	sum := uint(0)
    461 	for i := 0; i < 256; i++ {
    462 		sum += c[i]
    463 		c[i] = sum - c[i]
    464 	}
    465 
    466 	for i := range tt {
    467 		b := tt[i] & 0xff
    468 		tt[c[b]] |= uint32(i) << 8
    469 		c[b]++
    470 	}
    471 
    472 	return tt[origPtr] >> 8
    473 }
    474 
    475 // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
    476 // causing the bits in the input to be processed in the reverse of the usual order.
    477 
    478 var crctab [256]uint32
    479 
    480 func init() {
    481 	const poly = 0x04C11DB7
    482 	for i := range crctab {
    483 		crc := uint32(i) << 24
    484 		for j := 0; j < 8; j++ {
    485 			if crc&0x80000000 != 0 {
    486 				crc = (crc << 1) ^ poly
    487 			} else {
    488 				crc <<= 1
    489 			}
    490 		}
    491 		crctab[i] = crc
    492 	}
    493 }
    494 
    495 // updateCRC updates the crc value to incorporate the data in b.
    496 // The initial value is 0.
    497 func updateCRC(val uint32, b []byte) uint32 {
    498 	crc := ^val
    499 	for _, v := range b {
    500 		crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
    501 	}
    502 	return ^crc
    503 }
    504