1 // Copyright 2013 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Functions for clustering similar histograms together. 16 17 #ifndef BROTLI_ENC_CLUSTER_H_ 18 #define BROTLI_ENC_CLUSTER_H_ 19 20 #include <math.h> 21 #include <stdint.h> 22 #include <stdio.h> 23 #include <complex> 24 #include <map> 25 #include <set> 26 #include <utility> 27 #include <vector> 28 29 #include "./bit_cost.h" 30 #include "./entropy_encode.h" 31 #include "./fast_log.h" 32 #include "./histogram.h" 33 34 namespace brotli { 35 36 struct HistogramPair { 37 int idx1; 38 int idx2; 39 bool valid; 40 double cost_combo; 41 double cost_diff; 42 }; 43 44 struct HistogramPairComparator { 45 bool operator()(const HistogramPair& p1, const HistogramPair& p2) { 46 if (p1.cost_diff != p2.cost_diff) { 47 return p1.cost_diff > p2.cost_diff; 48 } 49 return abs(p1.idx1 - p1.idx2) > abs(p2.idx1 - p2.idx2); 50 } 51 }; 52 53 // Returns entropy reduction of the context map when we combine two clusters. 54 inline double ClusterCostDiff(int size_a, int size_b) { 55 int size_c = size_a + size_b; 56 return size_a * FastLog2(size_a) + size_b * FastLog2(size_b) - 57 size_c * FastLog2(size_c); 58 } 59 60 // Computes the bit cost reduction by combining out[idx1] and out[idx2] and if 61 // it is below a threshold, stores the pair (idx1, idx2) in the *pairs heap. 62 template<int kSize> 63 void CompareAndPushToHeap(const Histogram<kSize>* out, 64 const int* cluster_size, 65 int idx1, int idx2, 66 std::vector<HistogramPair>* pairs) { 67 if (idx1 == idx2) { 68 return; 69 } 70 if (idx2 < idx1) { 71 int t = idx2; 72 idx2 = idx1; 73 idx1 = t; 74 } 75 bool store_pair = false; 76 HistogramPair p; 77 p.idx1 = idx1; 78 p.idx2 = idx2; 79 p.valid = true; 80 p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]); 81 p.cost_diff -= out[idx1].bit_cost_; 82 p.cost_diff -= out[idx2].bit_cost_; 83 84 if (out[idx1].total_count_ == 0) { 85 p.cost_combo = out[idx2].bit_cost_; 86 store_pair = true; 87 } else if (out[idx2].total_count_ == 0) { 88 p.cost_combo = out[idx1].bit_cost_; 89 store_pair = true; 90 } else { 91 double threshold = pairs->empty() ? 1e99 : 92 std::max(0.0, (*pairs)[0].cost_diff); 93 Histogram<kSize> combo = out[idx1]; 94 combo.AddHistogram(out[idx2]); 95 double cost_combo = PopulationCost(combo); 96 if (cost_combo < threshold - p.cost_diff) { 97 p.cost_combo = cost_combo; 98 store_pair = true; 99 } 100 } 101 if (store_pair) { 102 p.cost_diff += p.cost_combo; 103 pairs->push_back(p); 104 push_heap(pairs->begin(), pairs->end(), HistogramPairComparator()); 105 } 106 } 107 108 template<int kSize> 109 void HistogramCombine(Histogram<kSize>* out, 110 int* cluster_size, 111 int* symbols, 112 int symbols_size, 113 int max_clusters) { 114 double cost_diff_threshold = 0.0; 115 int min_cluster_size = 1; 116 std::set<int> all_symbols; 117 std::vector<int> clusters; 118 for (int i = 0; i < symbols_size; ++i) { 119 if (all_symbols.find(symbols[i]) == all_symbols.end()) { 120 all_symbols.insert(symbols[i]); 121 clusters.push_back(symbols[i]); 122 } 123 } 124 125 // We maintain a heap of histogram pairs, ordered by the bit cost reduction. 126 std::vector<HistogramPair> pairs; 127 for (int idx1 = 0; idx1 < clusters.size(); ++idx1) { 128 for (int idx2 = idx1 + 1; idx2 < clusters.size(); ++idx2) { 129 CompareAndPushToHeap(out, cluster_size, clusters[idx1], clusters[idx2], 130 &pairs); 131 } 132 } 133 134 while (clusters.size() > min_cluster_size) { 135 if (pairs[0].cost_diff >= cost_diff_threshold) { 136 cost_diff_threshold = 1e99; 137 min_cluster_size = max_clusters; 138 continue; 139 } 140 // Take the best pair from the top of heap. 141 int best_idx1 = pairs[0].idx1; 142 int best_idx2 = pairs[0].idx2; 143 out[best_idx1].AddHistogram(out[best_idx2]); 144 out[best_idx1].bit_cost_ = pairs[0].cost_combo; 145 cluster_size[best_idx1] += cluster_size[best_idx2]; 146 for (int i = 0; i < symbols_size; ++i) { 147 if (symbols[i] == best_idx2) { 148 symbols[i] = best_idx1; 149 } 150 } 151 for (int i = 0; i + 1 < clusters.size(); ++i) { 152 if (clusters[i] >= best_idx2) { 153 clusters[i] = clusters[i + 1]; 154 } 155 } 156 clusters.pop_back(); 157 // Invalidate pairs intersecting the just combined best pair. 158 for (int i = 0; i < pairs.size(); ++i) { 159 HistogramPair& p = pairs[i]; 160 if (p.idx1 == best_idx1 || p.idx2 == best_idx1 || 161 p.idx1 == best_idx2 || p.idx2 == best_idx2) { 162 p.valid = false; 163 } 164 } 165 // Pop invalid pairs from the top of the heap. 166 while (!pairs.empty() && !pairs[0].valid) { 167 pop_heap(pairs.begin(), pairs.end(), HistogramPairComparator()); 168 pairs.pop_back(); 169 } 170 // Push new pairs formed with the combined histogram to the heap. 171 for (int i = 0; i < clusters.size(); ++i) { 172 CompareAndPushToHeap(out, cluster_size, best_idx1, clusters[i], &pairs); 173 } 174 } 175 } 176 177 // ----------------------------------------------------------------------------- 178 // Histogram refinement 179 180 // What is the bit cost of moving histogram from cur_symbol to candidate. 181 template<int kSize> 182 double HistogramBitCostDistance(const Histogram<kSize>& histogram, 183 const Histogram<kSize>& candidate) { 184 if (histogram.total_count_ == 0) { 185 return 0.0; 186 } 187 Histogram<kSize> tmp = histogram; 188 tmp.AddHistogram(candidate); 189 return PopulationCost(tmp) - candidate.bit_cost_; 190 } 191 192 // Find the best 'out' histogram for each of the 'in' histograms. 193 // Note: we assume that out[]->bit_cost_ is already up-to-date. 194 template<int kSize> 195 void HistogramRemap(const Histogram<kSize>* in, int in_size, 196 Histogram<kSize>* out, int* symbols) { 197 std::set<int> all_symbols; 198 for (int i = 0; i < in_size; ++i) { 199 all_symbols.insert(symbols[i]); 200 } 201 for (int i = 0; i < in_size; ++i) { 202 int best_out = i == 0 ? symbols[0] : symbols[i - 1]; 203 double best_bits = HistogramBitCostDistance(in[i], out[best_out]); 204 for (std::set<int>::const_iterator k = all_symbols.begin(); 205 k != all_symbols.end(); ++k) { 206 const double cur_bits = HistogramBitCostDistance(in[i], out[*k]); 207 if (cur_bits < best_bits) { 208 best_bits = cur_bits; 209 best_out = *k; 210 } 211 } 212 symbols[i] = best_out; 213 } 214 215 // Recompute each out based on raw and symbols. 216 for (std::set<int>::const_iterator k = all_symbols.begin(); 217 k != all_symbols.end(); ++k) { 218 out[*k].Clear(); 219 } 220 for (int i = 0; i < in_size; ++i) { 221 out[symbols[i]].AddHistogram(in[i]); 222 } 223 } 224 225 // Reorder histograms in *out so that the new symbols in *symbols come in 226 // increasing order. 227 template<int kSize> 228 void HistogramReindex(std::vector<Histogram<kSize> >* out, 229 std::vector<int>* symbols) { 230 std::vector<Histogram<kSize> > tmp(*out); 231 std::map<int, int> new_index; 232 int next_index = 0; 233 for (int i = 0; i < symbols->size(); ++i) { 234 if (new_index.find((*symbols)[i]) == new_index.end()) { 235 new_index[(*symbols)[i]] = next_index; 236 (*out)[next_index] = tmp[(*symbols)[i]]; 237 ++next_index; 238 } 239 } 240 out->resize(next_index); 241 for (int i = 0; i < symbols->size(); ++i) { 242 (*symbols)[i] = new_index[(*symbols)[i]]; 243 } 244 } 245 246 // Clusters similar histograms in 'in' together, the selected histograms are 247 // placed in 'out', and for each index in 'in', *histogram_symbols will 248 // indicate which of the 'out' histograms is the best approximation. 249 template<int kSize> 250 void ClusterHistograms(const std::vector<Histogram<kSize> >& in, 251 int num_contexts, int num_blocks, 252 int max_histograms, 253 std::vector<Histogram<kSize> >* out, 254 std::vector<int>* histogram_symbols) { 255 const int in_size = num_contexts * num_blocks; 256 std::vector<int> cluster_size(in_size, 1); 257 out->resize(in_size); 258 histogram_symbols->resize(in_size); 259 for (int i = 0; i < in_size; ++i) { 260 (*out)[i] = in[i]; 261 (*out)[i].bit_cost_ = PopulationCost(in[i]); 262 (*histogram_symbols)[i] = i; 263 } 264 265 // Collapse similar histograms within a block type. 266 if (num_contexts > 1) { 267 for (int i = 0; i < num_blocks; ++i) { 268 HistogramCombine(&(*out)[0], &cluster_size[0], 269 &(*histogram_symbols)[i * num_contexts], num_contexts, 270 max_histograms); 271 } 272 } 273 274 // Collapse similar histograms. 275 HistogramCombine(&(*out)[0], &cluster_size[0], 276 &(*histogram_symbols)[0], in_size, 277 max_histograms); 278 279 // Find the optimal map from original histograms to the final ones. 280 HistogramRemap(&in[0], in_size, &(*out)[0], &(*histogram_symbols)[0]); 281 282 // Convert the context map to a canonical form. 283 HistogramReindex(out, histogram_symbols); 284 } 285 286 } // namespace brotli 287 288 #endif // BROTLI_ENC_CLUSTER_H_ 289