1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package bzip2 implements bzip2 decompression. 6 package bzip2 7 8 import "io" 9 10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot 11 // of guessing: http://en.wikipedia.org/wiki/Bzip2 12 // The source code to pyflate was useful for debugging: 13 // http://www.paul.sladen.org/projects/pyflate 14 15 // A StructuralError is returned when the bzip2 data is found to be 16 // syntactically invalid. 17 type StructuralError string 18 19 func (s StructuralError) Error() string { 20 return "bzip2 data invalid: " + string(s) 21 } 22 23 // A reader decompresses bzip2 compressed data. 24 type reader struct { 25 br bitReader 26 fileCRC uint32 27 blockCRC uint32 28 wantBlockCRC uint32 29 setupDone bool // true if we have parsed the bzip2 header. 30 blockSize int // blockSize in bytes, i.e. 900 * 1024. 31 eof bool 32 buf []byte // stores Burrows-Wheeler transformed data. 33 c [256]uint // the `C' array for the inverse BWT. 34 tt []uint32 // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits. 35 tPos uint32 // Index of the next output byte in tt. 36 37 preRLE []uint32 // contains the RLE data still to be processed. 38 preRLEUsed int // number of entries of preRLE used. 39 lastByte int // the last byte value seen. 40 byteRepeats uint // the number of repeats of lastByte seen. 41 repeats uint // the number of copies of lastByte to output. 42 } 43 44 // NewReader returns an io.Reader which decompresses bzip2 data from r. 45 // If r does not also implement io.ByteReader, 46 // the decompressor may read more data than necessary from r. 47 func NewReader(r io.Reader) io.Reader { 48 bz2 := new(reader) 49 bz2.br = newBitReader(r) 50 return bz2 51 } 52 53 const bzip2FileMagic = 0x425a // "BZ" 54 const bzip2BlockMagic = 0x314159265359 55 const bzip2FinalMagic = 0x177245385090 56 57 // setup parses the bzip2 header. 58 func (bz2 *reader) setup(needMagic bool) error { 59 br := &bz2.br 60 61 if needMagic { 62 magic := br.ReadBits(16) 63 if magic != bzip2FileMagic { 64 return StructuralError("bad magic value") 65 } 66 } 67 68 t := br.ReadBits(8) 69 if t != 'h' { 70 return StructuralError("non-Huffman entropy encoding") 71 } 72 73 level := br.ReadBits(8) 74 if level < '1' || level > '9' { 75 return StructuralError("invalid compression level") 76 } 77 78 bz2.fileCRC = 0 79 bz2.blockSize = 100 * 1024 * (int(level) - '0') 80 if bz2.blockSize > len(bz2.tt) { 81 bz2.tt = make([]uint32, bz2.blockSize) 82 } 83 return nil 84 } 85 86 func (bz2 *reader) Read(buf []byte) (n int, err error) { 87 if bz2.eof { 88 return 0, io.EOF 89 } 90 91 if !bz2.setupDone { 92 err = bz2.setup(true) 93 brErr := bz2.br.Err() 94 if brErr != nil { 95 err = brErr 96 } 97 if err != nil { 98 return 0, err 99 } 100 bz2.setupDone = true 101 } 102 103 n, err = bz2.read(buf) 104 brErr := bz2.br.Err() 105 if brErr != nil { 106 err = brErr 107 } 108 return 109 } 110 111 func (bz2 *reader) readFromBlock(buf []byte) int { 112 // bzip2 is a block based compressor, except that it has a run-length 113 // preprocessing step. The block based nature means that we can 114 // preallocate fixed-size buffers and reuse them. However, the RLE 115 // preprocessing would require allocating huge buffers to store the 116 // maximum expansion. Thus we process blocks all at once, except for 117 // the RLE which we decompress as required. 118 n := 0 119 for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) { 120 // We have RLE data pending. 121 122 // The run-length encoding works like this: 123 // Any sequence of four equal bytes is followed by a length 124 // byte which contains the number of repeats of that byte to 125 // include. (The number of repeats can be zero.) Because we are 126 // decompressing on-demand our state is kept in the reader 127 // object. 128 129 if bz2.repeats > 0 { 130 buf[n] = byte(bz2.lastByte) 131 n++ 132 bz2.repeats-- 133 if bz2.repeats == 0 { 134 bz2.lastByte = -1 135 } 136 continue 137 } 138 139 bz2.tPos = bz2.preRLE[bz2.tPos] 140 b := byte(bz2.tPos) 141 bz2.tPos >>= 8 142 bz2.preRLEUsed++ 143 144 if bz2.byteRepeats == 3 { 145 bz2.repeats = uint(b) 146 bz2.byteRepeats = 0 147 continue 148 } 149 150 if bz2.lastByte == int(b) { 151 bz2.byteRepeats++ 152 } else { 153 bz2.byteRepeats = 0 154 } 155 bz2.lastByte = int(b) 156 157 buf[n] = b 158 n++ 159 } 160 161 return n 162 } 163 164 func (bz2 *reader) read(buf []byte) (int, error) { 165 for { 166 n := bz2.readFromBlock(buf) 167 if n > 0 { 168 bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n]) 169 return n, nil 170 } 171 172 // End of block. Check CRC. 173 if bz2.blockCRC != bz2.wantBlockCRC { 174 bz2.br.err = StructuralError("block checksum mismatch") 175 return 0, bz2.br.err 176 } 177 178 // Find next block. 179 br := &bz2.br 180 switch br.ReadBits64(48) { 181 default: 182 return 0, StructuralError("bad magic value found") 183 184 case bzip2BlockMagic: 185 // Start of block. 186 err := bz2.readBlock() 187 if err != nil { 188 return 0, err 189 } 190 191 case bzip2FinalMagic: 192 // Check end-of-file CRC. 193 wantFileCRC := uint32(br.ReadBits64(32)) 194 if br.err != nil { 195 return 0, br.err 196 } 197 if bz2.fileCRC != wantFileCRC { 198 br.err = StructuralError("file checksum mismatch") 199 return 0, br.err 200 } 201 202 // Skip ahead to byte boundary. 203 // Is there a file concatenated to this one? 204 // It would start with BZ. 205 if br.bits%8 != 0 { 206 br.ReadBits(br.bits % 8) 207 } 208 b, err := br.r.ReadByte() 209 if err == io.EOF { 210 br.err = io.EOF 211 bz2.eof = true 212 return 0, io.EOF 213 } 214 if err != nil { 215 br.err = err 216 return 0, err 217 } 218 z, err := br.r.ReadByte() 219 if err != nil { 220 if err == io.EOF { 221 err = io.ErrUnexpectedEOF 222 } 223 br.err = err 224 return 0, err 225 } 226 if b != 'B' || z != 'Z' { 227 return 0, StructuralError("bad magic value in continuation file") 228 } 229 if err := bz2.setup(false); err != nil { 230 return 0, err 231 } 232 } 233 } 234 } 235 236 // readBlock reads a bzip2 block. The magic number should already have been consumed. 237 func (bz2 *reader) readBlock() (err error) { 238 br := &bz2.br 239 bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. 240 bz2.blockCRC = 0 241 bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC 242 randomized := br.ReadBits(1) 243 if randomized != 0 { 244 return StructuralError("deprecated randomized files") 245 } 246 origPtr := uint(br.ReadBits(24)) 247 248 // If not every byte value is used in the block (i.e., it's text) then 249 // the symbol set is reduced. The symbols used are stored as a 250 // two-level, 16x16 bitmap. 251 symbolRangeUsedBitmap := br.ReadBits(16) 252 symbolPresent := make([]bool, 256) 253 numSymbols := 0 254 for symRange := uint(0); symRange < 16; symRange++ { 255 if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 { 256 bits := br.ReadBits(16) 257 for symbol := uint(0); symbol < 16; symbol++ { 258 if bits&(1<<(15-symbol)) != 0 { 259 symbolPresent[16*symRange+symbol] = true 260 numSymbols++ 261 } 262 } 263 } 264 } 265 266 if numSymbols == 0 { 267 // There must be an EOF symbol. 268 return StructuralError("no symbols in input") 269 } 270 271 // A block uses between two and six different Huffman trees. 272 numHuffmanTrees := br.ReadBits(3) 273 if numHuffmanTrees < 2 || numHuffmanTrees > 6 { 274 return StructuralError("invalid number of Huffman trees") 275 } 276 277 // The Huffman tree can switch every 50 symbols so there's a list of 278 // tree indexes telling us which tree to use for each 50 symbol block. 279 numSelectors := br.ReadBits(15) 280 treeIndexes := make([]uint8, numSelectors) 281 282 // The tree indexes are move-to-front transformed and stored as unary 283 // numbers. 284 mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees) 285 for i := range treeIndexes { 286 c := 0 287 for { 288 inc := br.ReadBits(1) 289 if inc == 0 { 290 break 291 } 292 c++ 293 } 294 if c >= numHuffmanTrees { 295 return StructuralError("tree index too large") 296 } 297 treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c)) 298 } 299 300 // The list of symbols for the move-to-front transform is taken from 301 // the previously decoded symbol bitmap. 302 symbols := make([]byte, numSymbols) 303 nextSymbol := 0 304 for i := 0; i < 256; i++ { 305 if symbolPresent[i] { 306 symbols[nextSymbol] = byte(i) 307 nextSymbol++ 308 } 309 } 310 mtf := newMTFDecoder(symbols) 311 312 numSymbols += 2 // to account for RUNA and RUNB symbols 313 huffmanTrees := make([]huffmanTree, numHuffmanTrees) 314 315 // Now we decode the arrays of code-lengths for each tree. 316 lengths := make([]uint8, numSymbols) 317 for i := range huffmanTrees { 318 // The code lengths are delta encoded from a 5-bit base value. 319 length := br.ReadBits(5) 320 for j := range lengths { 321 for { 322 if !br.ReadBit() { 323 break 324 } 325 if br.ReadBit() { 326 length-- 327 } else { 328 length++ 329 } 330 } 331 if length < 0 || length > 20 { 332 return StructuralError("Huffman length out of range") 333 } 334 lengths[j] = uint8(length) 335 } 336 huffmanTrees[i], err = newHuffmanTree(lengths) 337 if err != nil { 338 return err 339 } 340 } 341 342 selectorIndex := 1 // the next tree index to use 343 if len(treeIndexes) == 0 { 344 return StructuralError("no tree selectors given") 345 } 346 if int(treeIndexes[0]) >= len(huffmanTrees) { 347 return StructuralError("tree selector out of range") 348 } 349 currentHuffmanTree := huffmanTrees[treeIndexes[0]] 350 bufIndex := 0 // indexes bz2.buf, the output buffer. 351 // The output of the move-to-front transform is run-length encoded and 352 // we merge the decoding into the Huffman parsing loop. These two 353 // variables accumulate the repeat count. See the Wikipedia page for 354 // details. 355 repeat := 0 356 repeatPower := 0 357 358 // The `C' array (used by the inverse BWT) needs to be zero initialized. 359 for i := range bz2.c { 360 bz2.c[i] = 0 361 } 362 363 decoded := 0 // counts the number of symbols decoded by the current tree. 364 for { 365 if decoded == 50 { 366 if selectorIndex >= numSelectors { 367 return StructuralError("insufficient selector indices for number of symbols") 368 } 369 if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) { 370 return StructuralError("tree selector out of range") 371 } 372 currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]] 373 selectorIndex++ 374 decoded = 0 375 } 376 377 v := currentHuffmanTree.Decode(br) 378 decoded++ 379 380 if v < 2 { 381 // This is either the RUNA or RUNB symbol. 382 if repeat == 0 { 383 repeatPower = 1 384 } 385 repeat += repeatPower << v 386 repeatPower <<= 1 387 388 // This limit of 2 million comes from the bzip2 source 389 // code. It prevents repeat from overflowing. 390 if repeat > 2*1024*1024 { 391 return StructuralError("repeat count too large") 392 } 393 continue 394 } 395 396 if repeat > 0 { 397 // We have decoded a complete run-length so we need to 398 // replicate the last output symbol. 399 if repeat > bz2.blockSize-bufIndex { 400 return StructuralError("repeats past end of block") 401 } 402 for i := 0; i < repeat; i++ { 403 b := byte(mtf.First()) 404 bz2.tt[bufIndex] = uint32(b) 405 bz2.c[b]++ 406 bufIndex++ 407 } 408 repeat = 0 409 } 410 411 if int(v) == numSymbols-1 { 412 // This is the EOF symbol. Because it's always at the 413 // end of the move-to-front list, and never gets moved 414 // to the front, it has this unique value. 415 break 416 } 417 418 // Since two metasymbols (RUNA and RUNB) have values 0 and 1, 419 // one would expect |v-2| to be passed to the MTF decoder. 420 // However, the front of the MTF list is never referenced as 0, 421 // it's always referenced with a run-length of 1. Thus 0 422 // doesn't need to be encoded and we have |v-1| in the next 423 // line. 424 b := byte(mtf.Decode(int(v - 1))) 425 if bufIndex >= bz2.blockSize { 426 return StructuralError("data exceeds block size") 427 } 428 bz2.tt[bufIndex] = uint32(b) 429 bz2.c[b]++ 430 bufIndex++ 431 } 432 433 if origPtr >= uint(bufIndex) { 434 return StructuralError("origPtr out of bounds") 435 } 436 437 // We have completed the entropy decoding. Now we can perform the 438 // inverse BWT and setup the RLE buffer. 439 bz2.preRLE = bz2.tt[:bufIndex] 440 bz2.preRLEUsed = 0 441 bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:]) 442 bz2.lastByte = -1 443 bz2.byteRepeats = 0 444 bz2.repeats = 0 445 446 return nil 447 } 448 449 // inverseBWT implements the inverse Burrows-Wheeler transform as described in 450 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. 451 // In that document, origPtr is called `I' and c is the `C' array after the 452 // first pass over the data. It's an argument here because we merge the first 453 // pass with the Huffman decoding. 454 // 455 // This also implements the `single array' method from the bzip2 source code 456 // which leaves the output, still shuffled, in the bottom 8 bits of tt with the 457 // index of the next byte in the top 24-bits. The index of the first byte is 458 // returned. 459 func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 { 460 sum := uint(0) 461 for i := 0; i < 256; i++ { 462 sum += c[i] 463 c[i] = sum - c[i] 464 } 465 466 for i := range tt { 467 b := tt[i] & 0xff 468 tt[c[b]] |= uint32(i) << 8 469 c[b]++ 470 } 471 472 return tt[origPtr] >> 8 473 } 474 475 // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed, 476 // causing the bits in the input to be processed in the reverse of the usual order. 477 478 var crctab [256]uint32 479 480 func init() { 481 const poly = 0x04C11DB7 482 for i := range crctab { 483 crc := uint32(i) << 24 484 for j := 0; j < 8; j++ { 485 if crc&0x80000000 != 0 { 486 crc = (crc << 1) ^ poly 487 } else { 488 crc <<= 1 489 } 490 } 491 crctab[i] = crc 492 } 493 } 494 495 // updateCRC updates the crc value to incorporate the data in b. 496 // The initial value is 0. 497 func updateCRC(val uint32, b []byte) uint32 { 498 crc := ^val 499 for _, v := range b { 500 crc = crctab[byte(crc>>24)^v] ^ (crc << 8) 501 } 502 return ^crc 503 } 504