Home | History | Annotate | Download | only in bzip2
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package bzip2
      6 
      7 import "sort"
      8 
      9 // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
     10 // symbol.
     11 type huffmanTree struct {
     12 	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
     13 	// root of the tree and nextNode contains the index of the next element
     14 	// of nodes to use when the tree is being constructed.
     15 	nodes    []huffmanNode
     16 	nextNode int
     17 }
     18 
     19 // A huffmanNode is a node in the tree. left and right contain indexes into the
     20 // nodes slice of the tree. If left or right is invalidNodeValue then the child
     21 // is a left node and its value is in leftValue/rightValue.
     22 //
     23 // The symbols are uint16s because bzip2 encodes not only MTF indexes in the
     24 // tree, but also two magic values for run-length encoding and an EOF symbol.
     25 // Thus there are more than 256 possible symbols.
     26 type huffmanNode struct {
     27 	left, right           uint16
     28 	leftValue, rightValue uint16
     29 }
     30 
     31 // invalidNodeValue is an invalid index which marks a leaf node in the tree.
     32 const invalidNodeValue = 0xffff
     33 
     34 // Decode reads bits from the given bitReader and navigates the tree until a
     35 // symbol is found.
     36 func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
     37 	nodeIndex := uint16(0) // node 0 is the root of the tree.
     38 
     39 	for {
     40 		node := &t.nodes[nodeIndex]
     41 
     42 		var bit uint16
     43 		if br.bits > 0 {
     44 			// Get next bit - fast path.
     45 			br.bits--
     46 			bit = 0 - (uint16(br.n>>br.bits) & 1)
     47 		} else {
     48 			// Get next bit - slow path.
     49 			// Use ReadBits to retrieve a single bit
     50 			// from the underling io.ByteReader.
     51 			bit = 0 - uint16(br.ReadBits(1))
     52 		}
     53 		// now
     54 		// bit = 0xffff if the next bit was 1
     55 		// bit = 0x0000 if the next bit was 0
     56 
     57 		// 1 means left, 0 means right.
     58 		//
     59 		// if bit == 0xffff {
     60 		//     nodeIndex = node.left
     61 		// } else {
     62 		//     nodeIndex = node.right
     63 		// }
     64 		nodeIndex = (bit & node.left) | (^bit & node.right)
     65 
     66 		if nodeIndex == invalidNodeValue {
     67 			// We found a leaf. Use the value of bit to decide
     68 			// whether is a left or a right value.
     69 			return (bit & node.leftValue) | (^bit & node.rightValue)
     70 		}
     71 	}
     72 }
     73 
     74 // newHuffmanTree builds a Huffman tree from a slice containing the code
     75 // lengths of each symbol. The maximum code length is 32 bits.
     76 func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
     77 	// There are many possible trees that assign the same code length to
     78 	// each symbol (consider reflecting a tree down the middle, for
     79 	// example). Since the code length assignments determine the
     80 	// efficiency of the tree, each of these trees is equally good. In
     81 	// order to minimize the amount of information needed to build a tree
     82 	// bzip2 uses a canonical tree so that it can be reconstructed given
     83 	// only the code length assignments.
     84 
     85 	if len(lengths) < 2 {
     86 		panic("newHuffmanTree: too few symbols")
     87 	}
     88 
     89 	var t huffmanTree
     90 
     91 	// First we sort the code length assignments by ascending code length,
     92 	// using the symbol value to break ties.
     93 	pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths)))
     94 	for i, length := range lengths {
     95 		pairs[i].value = uint16(i)
     96 		pairs[i].length = length
     97 	}
     98 
     99 	sort.Sort(pairs)
    100 
    101 	// Now we assign codes to the symbols, starting with the longest code.
    102 	// We keep the codes packed into a uint32, at the most-significant end.
    103 	// So branches are taken from the MSB downwards. This makes it easy to
    104 	// sort them later.
    105 	code := uint32(0)
    106 	length := uint8(32)
    107 
    108 	codes := huffmanCodes(make([]huffmanCode, len(lengths)))
    109 	for i := len(pairs) - 1; i >= 0; i-- {
    110 		if length > pairs[i].length {
    111 			// If the code length decreases we shift in order to
    112 			// zero any bits beyond the end of the code.
    113 			length >>= 32 - pairs[i].length
    114 			length <<= 32 - pairs[i].length
    115 			length = pairs[i].length
    116 		}
    117 		codes[i].code = code
    118 		codes[i].codeLen = length
    119 		codes[i].value = pairs[i].value
    120 		// We need to 'increment' the code, which means treating |code|
    121 		// like a |length| bit number.
    122 		code += 1 << (32 - length)
    123 	}
    124 
    125 	// Now we can sort by the code so that the left half of each branch are
    126 	// grouped together, recursively.
    127 	sort.Sort(codes)
    128 
    129 	t.nodes = make([]huffmanNode, len(codes))
    130 	_, err := buildHuffmanNode(&t, codes, 0)
    131 	return t, err
    132 }
    133 
    134 // huffmanSymbolLengthPair contains a symbol and its code length.
    135 type huffmanSymbolLengthPair struct {
    136 	value  uint16
    137 	length uint8
    138 }
    139 
    140 // huffmanSymbolLengthPair is used to provide an interface for sorting.
    141 type huffmanSymbolLengthPairs []huffmanSymbolLengthPair
    142 
    143 func (h huffmanSymbolLengthPairs) Len() int {
    144 	return len(h)
    145 }
    146 
    147 func (h huffmanSymbolLengthPairs) Less(i, j int) bool {
    148 	if h[i].length < h[j].length {
    149 		return true
    150 	}
    151 	if h[i].length > h[j].length {
    152 		return false
    153 	}
    154 	if h[i].value < h[j].value {
    155 		return true
    156 	}
    157 	return false
    158 }
    159 
    160 func (h huffmanSymbolLengthPairs) Swap(i, j int) {
    161 	h[i], h[j] = h[j], h[i]
    162 }
    163 
    164 // huffmanCode contains a symbol, its code and code length.
    165 type huffmanCode struct {
    166 	code    uint32
    167 	codeLen uint8
    168 	value   uint16
    169 }
    170 
    171 // huffmanCodes is used to provide an interface for sorting.
    172 type huffmanCodes []huffmanCode
    173 
    174 func (n huffmanCodes) Len() int {
    175 	return len(n)
    176 }
    177 
    178 func (n huffmanCodes) Less(i, j int) bool {
    179 	return n[i].code < n[j].code
    180 }
    181 
    182 func (n huffmanCodes) Swap(i, j int) {
    183 	n[i], n[j] = n[j], n[i]
    184 }
    185 
    186 // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
    187 // the Huffman tree at the given level. It returns the index of the newly
    188 // constructed node.
    189 func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
    190 	test := uint32(1) << (31 - level)
    191 
    192 	// We have to search the list of codes to find the divide between the left and right sides.
    193 	firstRightIndex := len(codes)
    194 	for i, code := range codes {
    195 		if code.code&test != 0 {
    196 			firstRightIndex = i
    197 			break
    198 		}
    199 	}
    200 
    201 	left := codes[:firstRightIndex]
    202 	right := codes[firstRightIndex:]
    203 
    204 	if len(left) == 0 || len(right) == 0 {
    205 		// There is a superfluous level in the Huffman tree indicating
    206 		// a bug in the encoder. However, this bug has been observed in
    207 		// the wild so we handle it.
    208 
    209 		// If this function was called recursively then we know that
    210 		// len(codes) >= 2 because, otherwise, we would have hit the
    211 		// "leaf node" case, below, and not recursed.
    212 		//
    213 		// However, for the initial call it's possible that len(codes)
    214 		// is zero or one. Both cases are invalid because a zero length
    215 		// tree cannot encode anything and a length-1 tree can only
    216 		// encode EOF and so is superfluous. We reject both.
    217 		if len(codes) < 2 {
    218 			return 0, StructuralError("empty Huffman tree")
    219 		}
    220 
    221 		// In this case the recursion doesn't always reduce the length
    222 		// of codes so we need to ensure termination via another
    223 		// mechanism.
    224 		if level == 31 {
    225 			// Since len(codes) >= 2 the only way that the values
    226 			// can match at all 32 bits is if they are equal, which
    227 			// is invalid. This ensures that we never enter
    228 			// infinite recursion.
    229 			return 0, StructuralError("equal symbols in Huffman tree")
    230 		}
    231 
    232 		if len(left) == 0 {
    233 			return buildHuffmanNode(t, right, level+1)
    234 		}
    235 		return buildHuffmanNode(t, left, level+1)
    236 	}
    237 
    238 	nodeIndex = uint16(t.nextNode)
    239 	node := &t.nodes[t.nextNode]
    240 	t.nextNode++
    241 
    242 	if len(left) == 1 {
    243 		// leaf node
    244 		node.left = invalidNodeValue
    245 		node.leftValue = left[0].value
    246 	} else {
    247 		node.left, err = buildHuffmanNode(t, left, level+1)
    248 	}
    249 
    250 	if err != nil {
    251 		return
    252 	}
    253 
    254 	if len(right) == 1 {
    255 		// leaf node
    256 		node.right = invalidNodeValue
    257 		node.rightValue = right[0].value
    258 	} else {
    259 		node.right, err = buildHuffmanNode(t, right, level+1)
    260 	}
    261 
    262 	return
    263 }
    264