Home | History | Annotate | Download | only in enc
      1 // Copyright 2013 Google Inc. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 //
     15 // Block split point selection utilities.
     16 
     17 #include "./block_splitter.h"
     18 
     19 #include <math.h>
     20 #include <stdio.h>
     21 #include <stdlib.h>
     22 #include <string.h>
     23 
     24 #include <algorithm>
     25 #include <map>
     26 
     27 #include "./cluster.h"
     28 #include "./command.h"
     29 #include "./fast_log.h"
     30 #include "./histogram.h"
     31 
     32 namespace brotli {
     33 
     34 static const int kMaxLiteralHistograms = 100;
     35 static const int kMaxCommandHistograms = 50;
     36 static const double kLiteralBlockSwitchCost = 26;
     37 static const double kCommandBlockSwitchCost = 13.5;
     38 static const double kDistanceBlockSwitchCost = 14.6;
     39 static const int kLiteralStrideLength = 70;
     40 static const int kCommandStrideLength = 40;
     41 static const int kSymbolsPerLiteralHistogram = 544;
     42 static const int kSymbolsPerCommandHistogram = 530;
     43 static const int kSymbolsPerDistanceHistogram = 544;
     44 static const int kMinLengthForBlockSplitting = 128;
     45 static const int kIterMulForRefining = 2;
     46 static const int kMinItersForRefining = 100;
     47 
     48 void CopyLiteralsToByteArray(const std::vector<Command>& cmds,
     49                              const uint8_t* data,
     50                              std::vector<uint8_t>* literals) {
     51   // Count how many we have.
     52   size_t total_length = 0;
     53   for (int i = 0; i < cmds.size(); ++i) {
     54     total_length += cmds[i].insert_length_;
     55   }
     56   if (total_length == 0) {
     57     return;
     58   }
     59 
     60   // Allocate.
     61   literals->resize(total_length);
     62 
     63   // Loop again, and copy this time.
     64   size_t pos = 0;
     65   size_t from_pos = 0;
     66   for (int i = 0; i < cmds.size() && pos < total_length; ++i) {
     67     memcpy(&(*literals)[pos], data + from_pos, cmds[i].insert_length_);
     68     pos += cmds[i].insert_length_;
     69     from_pos += cmds[i].insert_length_ + cmds[i].copy_length_;
     70   }
     71 }
     72 
     73 void CopyCommandsToByteArray(const std::vector<Command>& cmds,
     74                              std::vector<uint16_t>* insert_and_copy_codes,
     75                              std::vector<uint8_t>* distance_prefixes) {
     76   for (int i = 0; i < cmds.size(); ++i) {
     77     const Command& cmd = cmds[i];
     78     insert_and_copy_codes->push_back(cmd.command_prefix_);
     79     if (cmd.copy_length_ > 0 && cmd.distance_prefix_ != 0xffff) {
     80       distance_prefixes->push_back(cmd.distance_prefix_);
     81     }
     82   }
     83 }
     84 
     85 template<typename HistogramType, typename DataType>
     86 void InitialEntropyCodes(const DataType* data, size_t length,
     87                          int literals_per_histogram,
     88                          int max_histograms,
     89                          size_t stride,
     90                          std::vector<HistogramType>* vec) {
     91   int total_histograms = length / literals_per_histogram + 1;
     92   if (total_histograms > max_histograms) {
     93     total_histograms = max_histograms;
     94   }
     95   unsigned int seed = 7;
     96   int block_length = length / total_histograms;
     97   for (int i = 0; i < total_histograms; ++i) {
     98     int pos = length * i / total_histograms;
     99     if (i != 0) {
    100       pos += rand_r(&seed) % block_length;
    101     }
    102     if (pos + stride >= length) {
    103       pos = length - stride - 1;
    104     }
    105     HistogramType histo;
    106     histo.Add(data + pos, stride);
    107     vec->push_back(histo);
    108   }
    109 }
    110 
    111 template<typename HistogramType, typename DataType>
    112 void RandomSample(unsigned int* seed,
    113                   const DataType* data,
    114                   size_t length,
    115                   size_t stride,
    116                   HistogramType* sample) {
    117   size_t pos = 0;
    118   if (stride >= length) {
    119     pos = 0;
    120     stride = length;
    121   } else {
    122     pos = rand_r(seed) % (length - stride + 1);
    123   }
    124   sample->Add(data + pos, stride);
    125 }
    126 
    127 template<typename HistogramType, typename DataType>
    128 void RefineEntropyCodes(const DataType* data, size_t length,
    129                         size_t stride,
    130                         std::vector<HistogramType>* vec) {
    131   int iters =
    132       kIterMulForRefining * length / stride + kMinItersForRefining;
    133   unsigned int seed = 7;
    134   iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
    135   for (int iter = 0; iter < iters; ++iter) {
    136     HistogramType sample;
    137     RandomSample(&seed, data, length, stride, &sample);
    138     int ix = iter % vec->size();
    139     (*vec)[ix].AddHistogram(sample);
    140   }
    141 }
    142 
    143 inline static float BitCost(int total, int count) {
    144   return count == 0 ? FastLog2(total) + 2 : FastLog2(total) - FastLog2(count);
    145 }
    146 
    147 template<typename DataType, int kSize>
    148 void FindBlocks(const DataType* data, const size_t length,
    149                 const double block_switch_bitcost,
    150                 const std::vector<Histogram<kSize> > &vec,
    151                 uint8_t *block_id) {
    152   if (vec.size() <= 1) {
    153     for (int i = 0; i < length; ++i) {
    154       block_id[i] = 0;
    155     }
    156     return;
    157   }
    158   int vecsize = vec.size();
    159   double* insert_cost = new double[kSize * vecsize];
    160   memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
    161   for (int i = 0; i < kSize; ++i) {
    162     for (int j = 0; j < vecsize; ++j) {
    163       insert_cost[i * vecsize + j] =
    164           BitCost(vec[j].total_count_, vec[j].data_[i]);
    165     }
    166   }
    167   double *cost = new double[vecsize];
    168   memset(cost, 0, sizeof(cost[0]) * vecsize);
    169   bool* switch_signal = new bool[length * vecsize];
    170   memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
    171   // After each iteration of this loop, cost[k] will contain the difference
    172   // between the minimum cost of arriving at the current byte position using
    173   // entropy code k, and the minimum cost of arriving at the current byte
    174   // position. This difference is capped at the block switch cost, and if it
    175   // reaches block switch cost, it means that when we trace back from the last
    176   // position, we need to switch here.
    177   for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
    178     int ix = byte_ix * vecsize;
    179     int insert_cost_ix = data[byte_ix] * vecsize;
    180     double min_cost = 1e99;
    181     for (int k = 0; k < vecsize; ++k) {
    182       // We are coding the symbol in data[byte_ix] with entropy code k.
    183       cost[k] += insert_cost[insert_cost_ix + k];
    184       if (cost[k] < min_cost) {
    185         min_cost = cost[k];
    186         block_id[byte_ix] = k;
    187       }
    188     }
    189     double block_switch_cost = block_switch_bitcost;
    190     // More blocks for the beginning.
    191     if (byte_ix < 2000) {
    192       block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
    193     }
    194     for (int k = 0; k < vecsize; ++k) {
    195       cost[k] -= min_cost;
    196       if (cost[k] >= block_switch_cost) {
    197         cost[k] = block_switch_cost;
    198         switch_signal[ix + k] = true;
    199       }
    200     }
    201   }
    202   // Now trace back from the last position and switch at the marked places.
    203   int byte_ix = length - 1;
    204   int ix = byte_ix * vecsize;
    205   int cur_id = block_id[byte_ix];
    206   while (byte_ix > 0) {
    207     --byte_ix;
    208     ix -= vecsize;
    209     if (switch_signal[ix + cur_id]) {
    210       cur_id = block_id[byte_ix];
    211     }
    212     block_id[byte_ix] = cur_id;
    213   }
    214   delete[] insert_cost;
    215   delete[] cost;
    216   delete[] switch_signal;
    217 }
    218 
    219 int RemapBlockIds(uint8_t* block_ids, const size_t length) {
    220   std::map<uint8_t, uint8_t> new_id;
    221   int next_id = 0;
    222   for (int i = 0; i < length; ++i) {
    223     if (new_id.find(block_ids[i]) == new_id.end()) {
    224       new_id[block_ids[i]] = next_id;
    225       ++next_id;
    226     }
    227   }
    228   for (int i = 0; i < length; ++i) {
    229     block_ids[i] = new_id[block_ids[i]];
    230   }
    231   return next_id;
    232 }
    233 
    234 template<typename HistogramType, typename DataType>
    235 void BuildBlockHistograms(const DataType* data, const size_t length,
    236                           uint8_t* block_ids,
    237                           std::vector<HistogramType>* histograms) {
    238   int num_types = RemapBlockIds(block_ids, length);
    239   histograms->clear();
    240   histograms->resize(num_types);
    241   for (int i = 0; i < length; ++i) {
    242     (*histograms)[block_ids[i]].Add(data[i]);
    243   }
    244 }
    245 
    246 template<typename HistogramType, typename DataType>
    247 void ClusterBlocks(const DataType* data, const size_t length,
    248                    uint8_t* block_ids) {
    249   std::vector<HistogramType> histograms;
    250   std::vector<int> block_index(length);
    251   int cur_idx = 0;
    252   HistogramType cur_histogram;
    253   for (int i = 0; i < length; ++i) {
    254     bool block_boundary = (i + 1 == length || block_ids[i] != block_ids[i + 1]);
    255     block_index[i] = cur_idx;
    256     cur_histogram.Add(data[i]);
    257     if (block_boundary) {
    258       histograms.push_back(cur_histogram);
    259       cur_histogram.Clear();
    260       ++cur_idx;
    261     }
    262   }
    263   std::vector<HistogramType> clustered_histograms;
    264   std::vector<int> histogram_symbols;
    265   // Block ids need to fit in one byte.
    266   static const int kMaxNumberOfBlockTypes = 256;
    267   ClusterHistograms(histograms, 1, histograms.size(),
    268                     kMaxNumberOfBlockTypes,
    269                     &clustered_histograms,
    270                     &histogram_symbols);
    271   for (int i = 0; i < length; ++i) {
    272     block_ids[i] = histogram_symbols[block_index[i]];
    273   }
    274 }
    275 
    276 void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
    277   int cur_id = block_ids[0];
    278   int cur_length = 1;
    279   split->num_types_ = -1;
    280   for (int i = 1; i < block_ids.size(); ++i) {
    281     if (block_ids[i] != cur_id) {
    282       split->types_.push_back(cur_id);
    283       split->lengths_.push_back(cur_length);
    284       split->num_types_ = std::max(split->num_types_, cur_id);
    285       cur_id = block_ids[i];
    286       cur_length = 0;
    287     }
    288     ++cur_length;
    289   }
    290   split->types_.push_back(cur_id);
    291   split->lengths_.push_back(cur_length);
    292   split->num_types_ = std::max(split->num_types_, cur_id);
    293   ++split->num_types_;
    294 }
    295 
    296 template<typename HistogramType, typename DataType>
    297 void SplitByteVector(const std::vector<DataType>& data,
    298                      const int literals_per_histogram,
    299                      const int max_histograms,
    300                      const int sampling_stride_length,
    301                      const double block_switch_cost,
    302                      BlockSplit* split) {
    303   if (data.empty()) {
    304     split->num_types_ = 0;
    305     return;
    306   } else if (data.size() < kMinLengthForBlockSplitting) {
    307     split->num_types_ = 1;
    308     split->types_.push_back(0);
    309     split->lengths_.push_back(data.size());
    310     return;
    311   }
    312   std::vector<HistogramType> histograms;
    313   // Find good entropy codes.
    314   InitialEntropyCodes(data.data(), data.size(),
    315                       literals_per_histogram,
    316                       max_histograms,
    317                       sampling_stride_length,
    318                       &histograms);
    319   RefineEntropyCodes(data.data(), data.size(),
    320                      sampling_stride_length,
    321                      &histograms);
    322   // Find a good path through literals with the good entropy codes.
    323   std::vector<uint8_t> block_ids(data.size());
    324   for (int i = 0; i < 10; ++i) {
    325     FindBlocks(data.data(), data.size(),
    326                block_switch_cost,
    327                histograms,
    328                &block_ids[0]);
    329     BuildBlockHistograms(data.data(), data.size(), &block_ids[0], &histograms);
    330   }
    331   ClusterBlocks<HistogramType>(data.data(), data.size(), &block_ids[0]);
    332   BuildBlockSplit(block_ids, split);
    333 }
    334 
    335 void SplitBlock(const std::vector<Command>& cmds,
    336                 const uint8_t* data,
    337                 BlockSplit* literal_split,
    338                 BlockSplit* insert_and_copy_split,
    339                 BlockSplit* dist_split) {
    340   // Create a continuous array of literals.
    341   std::vector<uint8_t> literals;
    342   CopyLiteralsToByteArray(cmds, data, &literals);
    343 
    344   // Compute prefix codes for commands.
    345   std::vector<uint16_t> insert_and_copy_codes;
    346   std::vector<uint8_t> distance_prefixes;
    347   CopyCommandsToByteArray(cmds,
    348                           &insert_and_copy_codes,
    349                           &distance_prefixes);
    350 
    351 
    352   SplitByteVector<HistogramLiteral>(
    353       literals,
    354       kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
    355       kLiteralStrideLength, kLiteralBlockSwitchCost,
    356       literal_split);
    357   SplitByteVector<HistogramCommand>(
    358       insert_and_copy_codes,
    359       kSymbolsPerCommandHistogram, kMaxCommandHistograms,
    360       kCommandStrideLength, kCommandBlockSwitchCost,
    361       insert_and_copy_split);
    362   SplitByteVector<HistogramDistance>(
    363       distance_prefixes,
    364       kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
    365       kCommandStrideLength, kDistanceBlockSwitchCost,
    366       dist_split);
    367 }
    368 
    369 void SplitBlockByTotalLength(const std::vector<Command>& all_commands,
    370                              int input_size,
    371                              int target_length,
    372                              std::vector<std::vector<Command> >* blocks) {
    373   int num_blocks = input_size / target_length + 1;
    374   int length_limit = input_size / num_blocks + 1;
    375   int total_length = 0;
    376   std::vector<Command> cur_block;
    377   for (int i = 0; i < all_commands.size(); ++i) {
    378     const Command& cmd = all_commands[i];
    379     int cmd_length = cmd.insert_length_ + cmd.copy_length_;
    380     if (total_length > length_limit) {
    381       blocks->push_back(cur_block);
    382       cur_block.clear();
    383       total_length = 0;
    384     }
    385     cur_block.push_back(cmd);
    386     total_length += cmd_length;
    387   }
    388   blocks->push_back(cur_block);
    389 }
    390 
    391 }  // namespace brotli
    392