1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package bzip2 6 7 import "sort" 8 9 // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a 10 // symbol. 11 type huffmanTree struct { 12 // nodes contains all the non-leaf nodes in the tree. nodes[0] is the 13 // root of the tree and nextNode contains the index of the next element 14 // of nodes to use when the tree is being constructed. 15 nodes []huffmanNode 16 nextNode int 17 } 18 19 // A huffmanNode is a node in the tree. left and right contain indexes into the 20 // nodes slice of the tree. If left or right is invalidNodeValue then the child 21 // is a left node and its value is in leftValue/rightValue. 22 // 23 // The symbols are uint16s because bzip2 encodes not only MTF indexes in the 24 // tree, but also two magic values for run-length encoding and an EOF symbol. 25 // Thus there are more than 256 possible symbols. 26 type huffmanNode struct { 27 left, right uint16 28 leftValue, rightValue uint16 29 } 30 31 // invalidNodeValue is an invalid index which marks a leaf node in the tree. 32 const invalidNodeValue = 0xffff 33 34 // Decode reads bits from the given bitReader and navigates the tree until a 35 // symbol is found. 36 func (t *huffmanTree) Decode(br *bitReader) (v uint16) { 37 nodeIndex := uint16(0) // node 0 is the root of the tree. 38 39 for { 40 node := &t.nodes[nodeIndex] 41 bit, ok := br.TryReadBit() 42 if !ok && br.ReadBit() { 43 bit = 1 44 } 45 // bzip2 encodes left as a true bit. 46 if bit != 0 { 47 // left 48 if node.left == invalidNodeValue { 49 return node.leftValue 50 } 51 nodeIndex = node.left 52 } else { 53 // right 54 if node.right == invalidNodeValue { 55 return node.rightValue 56 } 57 nodeIndex = node.right 58 } 59 } 60 } 61 62 // newHuffmanTree builds a Huffman tree from a slice containing the code 63 // lengths of each symbol. The maximum code length is 32 bits. 64 func newHuffmanTree(lengths []uint8) (huffmanTree, error) { 65 // There are many possible trees that assign the same code length to 66 // each symbol (consider reflecting a tree down the middle, for 67 // example). Since the code length assignments determine the 68 // efficiency of the tree, each of these trees is equally good. In 69 // order to minimize the amount of information needed to build a tree 70 // bzip2 uses a canonical tree so that it can be reconstructed given 71 // only the code length assignments. 72 73 if len(lengths) < 2 { 74 panic("newHuffmanTree: too few symbols") 75 } 76 77 var t huffmanTree 78 79 // First we sort the code length assignments by ascending code length, 80 // using the symbol value to break ties. 81 pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths))) 82 for i, length := range lengths { 83 pairs[i].value = uint16(i) 84 pairs[i].length = length 85 } 86 87 sort.Sort(pairs) 88 89 // Now we assign codes to the symbols, starting with the longest code. 90 // We keep the codes packed into a uint32, at the most-significant end. 91 // So branches are taken from the MSB downwards. This makes it easy to 92 // sort them later. 93 code := uint32(0) 94 length := uint8(32) 95 96 codes := huffmanCodes(make([]huffmanCode, len(lengths))) 97 for i := len(pairs) - 1; i >= 0; i-- { 98 if length > pairs[i].length { 99 // If the code length decreases we shift in order to 100 // zero any bits beyond the end of the code. 101 length >>= 32 - pairs[i].length 102 length <<= 32 - pairs[i].length 103 length = pairs[i].length 104 } 105 codes[i].code = code 106 codes[i].codeLen = length 107 codes[i].value = pairs[i].value 108 // We need to 'increment' the code, which means treating |code| 109 // like a |length| bit number. 110 code += 1 << (32 - length) 111 } 112 113 // Now we can sort by the code so that the left half of each branch are 114 // grouped together, recursively. 115 sort.Sort(codes) 116 117 t.nodes = make([]huffmanNode, len(codes)) 118 _, err := buildHuffmanNode(&t, codes, 0) 119 return t, err 120 } 121 122 // huffmanSymbolLengthPair contains a symbol and its code length. 123 type huffmanSymbolLengthPair struct { 124 value uint16 125 length uint8 126 } 127 128 // huffmanSymbolLengthPair is used to provide an interface for sorting. 129 type huffmanSymbolLengthPairs []huffmanSymbolLengthPair 130 131 func (h huffmanSymbolLengthPairs) Len() int { 132 return len(h) 133 } 134 135 func (h huffmanSymbolLengthPairs) Less(i, j int) bool { 136 if h[i].length < h[j].length { 137 return true 138 } 139 if h[i].length > h[j].length { 140 return false 141 } 142 if h[i].value < h[j].value { 143 return true 144 } 145 return false 146 } 147 148 func (h huffmanSymbolLengthPairs) Swap(i, j int) { 149 h[i], h[j] = h[j], h[i] 150 } 151 152 // huffmanCode contains a symbol, its code and code length. 153 type huffmanCode struct { 154 code uint32 155 codeLen uint8 156 value uint16 157 } 158 159 // huffmanCodes is used to provide an interface for sorting. 160 type huffmanCodes []huffmanCode 161 162 func (n huffmanCodes) Len() int { 163 return len(n) 164 } 165 166 func (n huffmanCodes) Less(i, j int) bool { 167 return n[i].code < n[j].code 168 } 169 170 func (n huffmanCodes) Swap(i, j int) { 171 n[i], n[j] = n[j], n[i] 172 } 173 174 // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in 175 // the Huffman tree at the given level. It returns the index of the newly 176 // constructed node. 177 func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) { 178 test := uint32(1) << (31 - level) 179 180 // We have to search the list of codes to find the divide between the left and right sides. 181 firstRightIndex := len(codes) 182 for i, code := range codes { 183 if code.code&test != 0 { 184 firstRightIndex = i 185 break 186 } 187 } 188 189 left := codes[:firstRightIndex] 190 right := codes[firstRightIndex:] 191 192 if len(left) == 0 || len(right) == 0 { 193 // There is a superfluous level in the Huffman tree indicating 194 // a bug in the encoder. However, this bug has been observed in 195 // the wild so we handle it. 196 197 // If this function was called recursively then we know that 198 // len(codes) >= 2 because, otherwise, we would have hit the 199 // "leaf node" case, below, and not recursed. 200 // 201 // However, for the initial call it's possible that len(codes) 202 // is zero or one. Both cases are invalid because a zero length 203 // tree cannot encode anything and a length-1 tree can only 204 // encode EOF and so is superfluous. We reject both. 205 if len(codes) < 2 { 206 return 0, StructuralError("empty Huffman tree") 207 } 208 209 // In this case the recursion doesn't always reduce the length 210 // of codes so we need to ensure termination via another 211 // mechanism. 212 if level == 31 { 213 // Since len(codes) >= 2 the only way that the values 214 // can match at all 32 bits is if they are equal, which 215 // is invalid. This ensures that we never enter 216 // infinite recursion. 217 return 0, StructuralError("equal symbols in Huffman tree") 218 } 219 220 if len(left) == 0 { 221 return buildHuffmanNode(t, right, level+1) 222 } 223 return buildHuffmanNode(t, left, level+1) 224 } 225 226 nodeIndex = uint16(t.nextNode) 227 node := &t.nodes[t.nextNode] 228 t.nextNode++ 229 230 if len(left) == 1 { 231 // leaf node 232 node.left = invalidNodeValue 233 node.leftValue = left[0].value 234 } else { 235 node.left, err = buildHuffmanNode(t, left, level+1) 236 } 237 238 if err != nil { 239 return 240 } 241 242 if len(right) == 1 { 243 // leaf node 244 node.right = invalidNodeValue 245 node.rightValue = right[0].value 246 } else { 247 node.right, err = buildHuffmanNode(t, right, level+1) 248 } 249 250 return 251 } 252