Home | History | Annotate | Download | only in bzip2
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package bzip2
      6 
      7 import "sort"
      8 
      9 // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
     10 // symbol.
     11 type huffmanTree struct {
     12 	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
     13 	// root of the tree and nextNode contains the index of the next element
     14 	// of nodes to use when the tree is being constructed.
     15 	nodes    []huffmanNode
     16 	nextNode int
     17 }
     18 
     19 // A huffmanNode is a node in the tree. left and right contain indexes into the
     20 // nodes slice of the tree. If left or right is invalidNodeValue then the child
     21 // is a left node and its value is in leftValue/rightValue.
     22 //
     23 // The symbols are uint16s because bzip2 encodes not only MTF indexes in the
     24 // tree, but also two magic values for run-length encoding and an EOF symbol.
     25 // Thus there are more than 256 possible symbols.
     26 type huffmanNode struct {
     27 	left, right           uint16
     28 	leftValue, rightValue uint16
     29 }
     30 
     31 // invalidNodeValue is an invalid index which marks a leaf node in the tree.
     32 const invalidNodeValue = 0xffff
     33 
     34 // Decode reads bits from the given bitReader and navigates the tree until a
     35 // symbol is found.
     36 func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
     37 	nodeIndex := uint16(0) // node 0 is the root of the tree.
     38 
     39 	for {
     40 		node := &t.nodes[nodeIndex]
     41 		bit, ok := br.TryReadBit()
     42 		if !ok && br.ReadBit() {
     43 			bit = 1
     44 		}
     45 		// bzip2 encodes left as a true bit.
     46 		if bit != 0 {
     47 			// left
     48 			if node.left == invalidNodeValue {
     49 				return node.leftValue
     50 			}
     51 			nodeIndex = node.left
     52 		} else {
     53 			// right
     54 			if node.right == invalidNodeValue {
     55 				return node.rightValue
     56 			}
     57 			nodeIndex = node.right
     58 		}
     59 	}
     60 }
     61 
     62 // newHuffmanTree builds a Huffman tree from a slice containing the code
     63 // lengths of each symbol. The maximum code length is 32 bits.
     64 func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
     65 	// There are many possible trees that assign the same code length to
     66 	// each symbol (consider reflecting a tree down the middle, for
     67 	// example). Since the code length assignments determine the
     68 	// efficiency of the tree, each of these trees is equally good. In
     69 	// order to minimize the amount of information needed to build a tree
     70 	// bzip2 uses a canonical tree so that it can be reconstructed given
     71 	// only the code length assignments.
     72 
     73 	if len(lengths) < 2 {
     74 		panic("newHuffmanTree: too few symbols")
     75 	}
     76 
     77 	var t huffmanTree
     78 
     79 	// First we sort the code length assignments by ascending code length,
     80 	// using the symbol value to break ties.
     81 	pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths)))
     82 	for i, length := range lengths {
     83 		pairs[i].value = uint16(i)
     84 		pairs[i].length = length
     85 	}
     86 
     87 	sort.Sort(pairs)
     88 
     89 	// Now we assign codes to the symbols, starting with the longest code.
     90 	// We keep the codes packed into a uint32, at the most-significant end.
     91 	// So branches are taken from the MSB downwards. This makes it easy to
     92 	// sort them later.
     93 	code := uint32(0)
     94 	length := uint8(32)
     95 
     96 	codes := huffmanCodes(make([]huffmanCode, len(lengths)))
     97 	for i := len(pairs) - 1; i >= 0; i-- {
     98 		if length > pairs[i].length {
     99 			// If the code length decreases we shift in order to
    100 			// zero any bits beyond the end of the code.
    101 			length >>= 32 - pairs[i].length
    102 			length <<= 32 - pairs[i].length
    103 			length = pairs[i].length
    104 		}
    105 		codes[i].code = code
    106 		codes[i].codeLen = length
    107 		codes[i].value = pairs[i].value
    108 		// We need to 'increment' the code, which means treating |code|
    109 		// like a |length| bit number.
    110 		code += 1 << (32 - length)
    111 	}
    112 
    113 	// Now we can sort by the code so that the left half of each branch are
    114 	// grouped together, recursively.
    115 	sort.Sort(codes)
    116 
    117 	t.nodes = make([]huffmanNode, len(codes))
    118 	_, err := buildHuffmanNode(&t, codes, 0)
    119 	return t, err
    120 }
    121 
    122 // huffmanSymbolLengthPair contains a symbol and its code length.
    123 type huffmanSymbolLengthPair struct {
    124 	value  uint16
    125 	length uint8
    126 }
    127 
    128 // huffmanSymbolLengthPair is used to provide an interface for sorting.
    129 type huffmanSymbolLengthPairs []huffmanSymbolLengthPair
    130 
    131 func (h huffmanSymbolLengthPairs) Len() int {
    132 	return len(h)
    133 }
    134 
    135 func (h huffmanSymbolLengthPairs) Less(i, j int) bool {
    136 	if h[i].length < h[j].length {
    137 		return true
    138 	}
    139 	if h[i].length > h[j].length {
    140 		return false
    141 	}
    142 	if h[i].value < h[j].value {
    143 		return true
    144 	}
    145 	return false
    146 }
    147 
    148 func (h huffmanSymbolLengthPairs) Swap(i, j int) {
    149 	h[i], h[j] = h[j], h[i]
    150 }
    151 
    152 // huffmanCode contains a symbol, its code and code length.
    153 type huffmanCode struct {
    154 	code    uint32
    155 	codeLen uint8
    156 	value   uint16
    157 }
    158 
    159 // huffmanCodes is used to provide an interface for sorting.
    160 type huffmanCodes []huffmanCode
    161 
    162 func (n huffmanCodes) Len() int {
    163 	return len(n)
    164 }
    165 
    166 func (n huffmanCodes) Less(i, j int) bool {
    167 	return n[i].code < n[j].code
    168 }
    169 
    170 func (n huffmanCodes) Swap(i, j int) {
    171 	n[i], n[j] = n[j], n[i]
    172 }
    173 
    174 // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
    175 // the Huffman tree at the given level. It returns the index of the newly
    176 // constructed node.
    177 func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
    178 	test := uint32(1) << (31 - level)
    179 
    180 	// We have to search the list of codes to find the divide between the left and right sides.
    181 	firstRightIndex := len(codes)
    182 	for i, code := range codes {
    183 		if code.code&test != 0 {
    184 			firstRightIndex = i
    185 			break
    186 		}
    187 	}
    188 
    189 	left := codes[:firstRightIndex]
    190 	right := codes[firstRightIndex:]
    191 
    192 	if len(left) == 0 || len(right) == 0 {
    193 		// There is a superfluous level in the Huffman tree indicating
    194 		// a bug in the encoder. However, this bug has been observed in
    195 		// the wild so we handle it.
    196 
    197 		// If this function was called recursively then we know that
    198 		// len(codes) >= 2 because, otherwise, we would have hit the
    199 		// "leaf node" case, below, and not recursed.
    200 		//
    201 		// However, for the initial call it's possible that len(codes)
    202 		// is zero or one. Both cases are invalid because a zero length
    203 		// tree cannot encode anything and a length-1 tree can only
    204 		// encode EOF and so is superfluous. We reject both.
    205 		if len(codes) < 2 {
    206 			return 0, StructuralError("empty Huffman tree")
    207 		}
    208 
    209 		// In this case the recursion doesn't always reduce the length
    210 		// of codes so we need to ensure termination via another
    211 		// mechanism.
    212 		if level == 31 {
    213 			// Since len(codes) >= 2 the only way that the values
    214 			// can match at all 32 bits is if they are equal, which
    215 			// is invalid. This ensures that we never enter
    216 			// infinite recursion.
    217 			return 0, StructuralError("equal symbols in Huffman tree")
    218 		}
    219 
    220 		if len(left) == 0 {
    221 			return buildHuffmanNode(t, right, level+1)
    222 		}
    223 		return buildHuffmanNode(t, left, level+1)
    224 	}
    225 
    226 	nodeIndex = uint16(t.nextNode)
    227 	node := &t.nodes[t.nextNode]
    228 	t.nextNode++
    229 
    230 	if len(left) == 1 {
    231 		// leaf node
    232 		node.left = invalidNodeValue
    233 		node.leftValue = left[0].value
    234 	} else {
    235 		node.left, err = buildHuffmanNode(t, left, level+1)
    236 	}
    237 
    238 	if err != nil {
    239 		return
    240 	}
    241 
    242 	if len(right) == 1 {
    243 		// leaf node
    244 		node.right = invalidNodeValue
    245 		node.rightValue = right[0].value
    246 	} else {
    247 		node.right, err = buildHuffmanNode(t, right, level+1)
    248 	}
    249 
    250 	return
    251 }
    252