Home | History | Annotate | Download | only in enc
      1 // Copyright 2013 Google Inc. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 //
     15 // Functions for clustering similar histograms together.
     16 
     17 #ifndef BROTLI_ENC_CLUSTER_H_
     18 #define BROTLI_ENC_CLUSTER_H_
     19 
     20 #include <math.h>
     21 #include <stdint.h>
     22 #include <stdio.h>
     23 #include <complex>
     24 #include <map>
     25 #include <set>
     26 #include <utility>
     27 #include <vector>
     28 
     29 #include "./bit_cost.h"
     30 #include "./entropy_encode.h"
     31 #include "./fast_log.h"
     32 #include "./histogram.h"
     33 
     34 namespace brotli {
     35 
     36 struct HistogramPair {
     37   int idx1;
     38   int idx2;
     39   bool valid;
     40   double cost_combo;
     41   double cost_diff;
     42 };
     43 
     44 struct HistogramPairComparator {
     45   bool operator()(const HistogramPair& p1, const HistogramPair& p2) {
     46     if (p1.cost_diff != p2.cost_diff) {
     47       return p1.cost_diff > p2.cost_diff;
     48     }
     49     return abs(p1.idx1 - p1.idx2) > abs(p2.idx1 - p2.idx2);
     50   }
     51 };
     52 
     53 // Returns entropy reduction of the context map when we combine two clusters.
     54 inline double ClusterCostDiff(int size_a, int size_b) {
     55   int size_c = size_a + size_b;
     56   return size_a * FastLog2(size_a) + size_b * FastLog2(size_b) -
     57       size_c * FastLog2(size_c);
     58 }
     59 
     60 // Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
     61 // it is below a threshold, stores the pair (idx1, idx2) in the *pairs heap.
     62 template<int kSize>
     63 void CompareAndPushToHeap(const Histogram<kSize>* out,
     64                           const int* cluster_size,
     65                           int idx1, int idx2,
     66                           std::vector<HistogramPair>* pairs) {
     67   if (idx1 == idx2) {
     68     return;
     69   }
     70   if (idx2 < idx1) {
     71     int t = idx2;
     72     idx2 = idx1;
     73     idx1 = t;
     74   }
     75   bool store_pair = false;
     76   HistogramPair p;
     77   p.idx1 = idx1;
     78   p.idx2 = idx2;
     79   p.valid = true;
     80   p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
     81   p.cost_diff -= out[idx1].bit_cost_;
     82   p.cost_diff -= out[idx2].bit_cost_;
     83 
     84   if (out[idx1].total_count_ == 0) {
     85     p.cost_combo = out[idx2].bit_cost_;
     86     store_pair = true;
     87   } else if (out[idx2].total_count_ == 0) {
     88     p.cost_combo = out[idx1].bit_cost_;
     89     store_pair = true;
     90   } else {
     91     double threshold = pairs->empty() ? 1e99 :
     92         std::max(0.0, (*pairs)[0].cost_diff);
     93     Histogram<kSize> combo = out[idx1];
     94     combo.AddHistogram(out[idx2]);
     95     double cost_combo = PopulationCost(combo);
     96     if (cost_combo < threshold - p.cost_diff) {
     97       p.cost_combo = cost_combo;
     98       store_pair = true;
     99     }
    100   }
    101   if (store_pair) {
    102     p.cost_diff += p.cost_combo;
    103     pairs->push_back(p);
    104     push_heap(pairs->begin(), pairs->end(), HistogramPairComparator());
    105   }
    106 }
    107 
    108 template<int kSize>
    109 void HistogramCombine(Histogram<kSize>* out,
    110                       int* cluster_size,
    111                       int* symbols,
    112                       int symbols_size,
    113                       int max_clusters) {
    114   double cost_diff_threshold = 0.0;
    115   int min_cluster_size = 1;
    116   std::set<int> all_symbols;
    117   std::vector<int> clusters;
    118   for (int i = 0; i < symbols_size; ++i) {
    119     if (all_symbols.find(symbols[i]) == all_symbols.end()) {
    120       all_symbols.insert(symbols[i]);
    121       clusters.push_back(symbols[i]);
    122     }
    123   }
    124 
    125   // We maintain a heap of histogram pairs, ordered by the bit cost reduction.
    126   std::vector<HistogramPair> pairs;
    127   for (int idx1 = 0; idx1 < clusters.size(); ++idx1) {
    128     for (int idx2 = idx1 + 1; idx2 < clusters.size(); ++idx2) {
    129       CompareAndPushToHeap(out, cluster_size, clusters[idx1], clusters[idx2],
    130                            &pairs);
    131     }
    132   }
    133 
    134   while (clusters.size() > min_cluster_size) {
    135     if (pairs[0].cost_diff >= cost_diff_threshold) {
    136       cost_diff_threshold = 1e99;
    137       min_cluster_size = max_clusters;
    138       continue;
    139     }
    140     // Take the best pair from the top of heap.
    141     int best_idx1 = pairs[0].idx1;
    142     int best_idx2 = pairs[0].idx2;
    143     out[best_idx1].AddHistogram(out[best_idx2]);
    144     out[best_idx1].bit_cost_ = pairs[0].cost_combo;
    145     cluster_size[best_idx1] += cluster_size[best_idx2];
    146     for (int i = 0; i < symbols_size; ++i) {
    147       if (symbols[i] == best_idx2) {
    148         symbols[i] = best_idx1;
    149       }
    150     }
    151     for (int i = 0; i + 1 < clusters.size(); ++i) {
    152       if (clusters[i] >= best_idx2) {
    153         clusters[i] = clusters[i + 1];
    154       }
    155     }
    156     clusters.pop_back();
    157     // Invalidate pairs intersecting the just combined best pair.
    158     for (int i = 0; i < pairs.size(); ++i) {
    159       HistogramPair& p = pairs[i];
    160       if (p.idx1 == best_idx1 || p.idx2 == best_idx1 ||
    161           p.idx1 == best_idx2 || p.idx2 == best_idx2) {
    162         p.valid = false;
    163       }
    164     }
    165     // Pop invalid pairs from the top of the heap.
    166     while (!pairs.empty() && !pairs[0].valid) {
    167       pop_heap(pairs.begin(), pairs.end(), HistogramPairComparator());
    168       pairs.pop_back();
    169     }
    170     // Push new pairs formed with the combined histogram to the heap.
    171     for (int i = 0; i < clusters.size(); ++i) {
    172       CompareAndPushToHeap(out, cluster_size, best_idx1, clusters[i], &pairs);
    173     }
    174   }
    175 }
    176 
    177 // -----------------------------------------------------------------------------
    178 // Histogram refinement
    179 
    180 // What is the bit cost of moving histogram from cur_symbol to candidate.
    181 template<int kSize>
    182 double HistogramBitCostDistance(const Histogram<kSize>& histogram,
    183                                 const Histogram<kSize>& candidate) {
    184   if (histogram.total_count_ == 0) {
    185     return 0.0;
    186   }
    187   Histogram<kSize> tmp = histogram;
    188   tmp.AddHistogram(candidate);
    189   return PopulationCost(tmp) - candidate.bit_cost_;
    190 }
    191 
    192 // Find the best 'out' histogram for each of the 'in' histograms.
    193 // Note: we assume that out[]->bit_cost_ is already up-to-date.
    194 template<int kSize>
    195 void HistogramRemap(const Histogram<kSize>* in, int in_size,
    196                     Histogram<kSize>* out, int* symbols) {
    197   std::set<int> all_symbols;
    198   for (int i = 0; i < in_size; ++i) {
    199     all_symbols.insert(symbols[i]);
    200   }
    201   for (int i = 0; i < in_size; ++i) {
    202     int best_out = i == 0 ? symbols[0] : symbols[i - 1];
    203     double best_bits = HistogramBitCostDistance(in[i], out[best_out]);
    204     for (std::set<int>::const_iterator k = all_symbols.begin();
    205          k != all_symbols.end(); ++k) {
    206       const double cur_bits = HistogramBitCostDistance(in[i], out[*k]);
    207       if (cur_bits < best_bits) {
    208         best_bits = cur_bits;
    209         best_out = *k;
    210       }
    211     }
    212     symbols[i] = best_out;
    213   }
    214 
    215   // Recompute each out based on raw and symbols.
    216   for (std::set<int>::const_iterator k = all_symbols.begin();
    217        k != all_symbols.end(); ++k) {
    218     out[*k].Clear();
    219   }
    220   for (int i = 0; i < in_size; ++i) {
    221     out[symbols[i]].AddHistogram(in[i]);
    222   }
    223 }
    224 
    225 // Reorder histograms in *out so that the new symbols in *symbols come in
    226 // increasing order.
    227 template<int kSize>
    228 void HistogramReindex(std::vector<Histogram<kSize> >* out,
    229                       std::vector<int>* symbols) {
    230   std::vector<Histogram<kSize> > tmp(*out);
    231   std::map<int, int> new_index;
    232   int next_index = 0;
    233   for (int i = 0; i < symbols->size(); ++i) {
    234     if (new_index.find((*symbols)[i]) == new_index.end()) {
    235       new_index[(*symbols)[i]] = next_index;
    236       (*out)[next_index] = tmp[(*symbols)[i]];
    237       ++next_index;
    238     }
    239   }
    240   out->resize(next_index);
    241   for (int i = 0; i < symbols->size(); ++i) {
    242     (*symbols)[i] = new_index[(*symbols)[i]];
    243   }
    244 }
    245 
    246 // Clusters similar histograms in 'in' together, the selected histograms are
    247 // placed in 'out', and for each index in 'in', *histogram_symbols will
    248 // indicate which of the 'out' histograms is the best approximation.
    249 template<int kSize>
    250 void ClusterHistograms(const std::vector<Histogram<kSize> >& in,
    251                        int num_contexts, int num_blocks,
    252                        int max_histograms,
    253                        std::vector<Histogram<kSize> >* out,
    254                        std::vector<int>* histogram_symbols) {
    255   const int in_size = num_contexts * num_blocks;
    256   std::vector<int> cluster_size(in_size, 1);
    257   out->resize(in_size);
    258   histogram_symbols->resize(in_size);
    259   for (int i = 0; i < in_size; ++i) {
    260     (*out)[i] = in[i];
    261     (*out)[i].bit_cost_ = PopulationCost(in[i]);
    262     (*histogram_symbols)[i] = i;
    263   }
    264 
    265   // Collapse similar histograms within a block type.
    266   if (num_contexts > 1) {
    267     for (int i = 0; i < num_blocks; ++i) {
    268       HistogramCombine(&(*out)[0], &cluster_size[0],
    269                        &(*histogram_symbols)[i * num_contexts], num_contexts,
    270                        max_histograms);
    271     }
    272   }
    273 
    274   // Collapse similar histograms.
    275   HistogramCombine(&(*out)[0], &cluster_size[0],
    276                    &(*histogram_symbols)[0], in_size,
    277                    max_histograms);
    278 
    279   // Find the optimal map from original histograms to the final ones.
    280   HistogramRemap(&in[0], in_size, &(*out)[0], &(*histogram_symbols)[0]);
    281 
    282   // Convert the context map to a canonical form.
    283   HistogramReindex(out, histogram_symbols);
    284 }
    285 
    286 }  // namespace brotli
    287 
    288 #endif  // BROTLI_ENC_CLUSTER_H_
    289