// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// A (forgetful) hash table of the data seen by the compressor, used to
// help create backward references to previous data.

#ifndef BROTLI_ENC_HASH_H_
#define BROTLI_ENC_HASH_H_

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <algorithm>
#include <cstdlib>
#include <memory>
#include <string>

#include "./transform.h"
#include "./fast_log.h"
#include "./find_match_length.h"
#include "./port.h"
#include "./static_dict.h"

namespace brotli {

// kHashMul32 multiplier has these properties:
// * The multiplier must be odd. Otherwise we may lose the highest bit.
// * No long streaks of 1s or 0s.
// * There is no effort to ensure that it is prime; oddness is enough
//   for this use.
// * The number has been tuned heuristically against compression benchmarks.
static const uint32_t kHashMul32 = 0x1e35a7bd;

template<int kShiftBits, int kMinLength>
inline uint32_t Hash(const uint8_t *data) {
  if (kMinLength <= 3) {
    // If kMinLength is 2 or 3, we hash the first 3 bytes of data.
    uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32;
    // The higher bits contain more mixture from the multiplication,
    // so we take our results from there.
    return h >> (32 - kShiftBits);
  } else {
    // If kMinLength is at least 4, we hash the first 4 bytes of data.
    uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
    // The higher bits contain more mixture from the multiplication,
    // so we take our results from there.
    return h >> (32 - kShiftBits);
  }
}
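
// For illustration only (not part of the encoder API): with kShiftBits = 15
// and kMinLength = 4 the hash reads 4 bytes and keeps the top 15 bits of the
// 32-bit product, i.e. a bucket index in [0, 1 << 15). A minimal sketch,
// with arbitrary example bytes:
//
//   const uint8_t data[4] = { 'a', 'b', 'c', 'd' };
//   const uint32_t bucket = Hash<15, 4>(data);  // bucket < (1 << 15)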

// Usually we choose the longest backward reference; this function allows
// for exceptions to that rule.
//
// If we choose a backward reference that is further away, it will usually
// be coded with more bits. We approximate this cost by log2(distance). If
// the distance can be expressed in terms of the last four distances, we use
// some heuristic constants to estimate the bit cost. For up to the first
// four literals we use the bit cost of the literals from the literal cost
// model; after that we use the average bit cost of the model.
//
// This function is used to sometimes discard a longer backward reference
// when it is not much longer and the bit cost of encoding it exceeds the
// bit cost of the literals it saves.
inline double BackwardReferenceScore(double average_cost,
                                     double start_cost4,
                                     double start_cost3,
                                     double start_cost2,
                                     int copy_length,
                                     int backward_reference_offset) {
  double retval = 0;
  switch (copy_length) {
    case 2: retval = start_cost2; break;
    case 3: retval = start_cost3; break;
    default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
  }
  retval -= 1.20 * Log2Floor(backward_reference_offset);
  return retval;
}
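
// Worked example (illustrative arithmetic only, using the default
// average_cost of 5.4 from HashLongestMatch below and the start_cost4
// fallback of 20 used when no literal cost model is available): a copy of
// length 6 at a backward distance of 1000 scores
//
//   20 + (6 - 4) * 5.4 - 1.20 * Log2Floor(1000)
//     = 30.8 - 1.20 * 9 = 20.0
//
// so a slightly shorter copy at a much smaller distance can still win.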

inline double BackwardReferenceScoreUsingLastDistance(double average_cost,
                                                      double start_cost4,
                                                      double start_cost3,
                                                      double start_cost2,
                                                      int copy_length,
                                                      int distance_short_code) {
  double retval = 0;
  switch (copy_length) {
    case 2: retval = start_cost2; break;
    case 3: retval = start_cost3; break;
    default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
  }
  static const double kDistanceShortCodeBitCost[16] = {
    -0.6, 0.95, 1.17, 1.27,
    0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
    1.05, 1.05, 1.15, 1.15, 1.25, 1.25
  };
  retval -= kDistanceShortCodeBitCost[distance_short_code];
  return retval;
}

// A (forgetful) hash table of the data seen by the compressor, used to
// help create backward references to previous data.
//
// This is a hash map of fixed size (kBucketSize) to a ring buffer of
// fixed size (kBlockSize). The ring buffer contains the last kBlockSize
// index positions of the given hash key in the compressed data.
template <int kBucketBits, int kBlockBits, int kMinLength>
class HashLongestMatch {
 public:
  HashLongestMatch()
      : last_distance1_(4),
        last_distance2_(11),
        last_distance3_(15),
        last_distance4_(16),
        insert_length_(0),
        average_cost_(5.4),
        static_dict_(NULL) {
    Reset();
  }
  void Reset() {
    std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
  }
  void SetStaticDictionary(const StaticDictionary *dict) {
    static_dict_ = dict;
  }
  bool HasStaticDictionary() const {
    return static_dict_ != NULL;
  }

  // Look at 3 or 4 bytes at data (depending on kMinLength).
  // Compute a hash from these, and store the value of ix at that position.
  inline void Store(const uint8_t *data, const int ix) {
    const uint32_t key = Hash<kBucketBits, kMinLength>(data);
    const int minor_ix = num_[key] & kBlockMask;
    buckets_[key][minor_ix] = ix;
    ++num_[key];
  }

  // Store hashes for a range of data.
  void StoreHashes(const uint8_t *data, size_t len, int startix, int mask) {
    for (int p = 0; p < len; ++p) {
      Store(&data[p & mask], startix + p);
    }
  }

  // Finds the longest backward match of &data[cur_ix] up to a length of
  // max_length.
  //
  // Does not look for matches longer than max_length.
  // Does not look for matches further away than max_backward.
  // Writes the best found match length into best_len_out.
  // Writes the backward distance of the best match into best_distance_out.
  // Writes the score of the best match into best_score_out.
  bool FindLongestMatch(const uint8_t * __restrict data,
                        const float * __restrict literal_cost,
                        const size_t ring_buffer_mask,
                        const uint32_t cur_ix,
                        uint32_t max_length,
                        const uint32_t max_backward,
                        size_t * __restrict best_len_out,
                        size_t * __restrict best_len_code_out,
                        size_t * __restrict best_distance_out,
                        double * __restrict best_score_out,
                        bool * __restrict in_dictionary) {
    *in_dictionary = true;
    *best_len_code_out = 0;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    const double start_cost4 = literal_cost == NULL ? 20 :
        literal_cost[cur_ix_masked] +
        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
        literal_cost[(cur_ix + 2) & ring_buffer_mask] +
        literal_cost[(cur_ix + 3) & ring_buffer_mask];
    const double start_cost3 = literal_cost == NULL ? 15 :
        literal_cost[cur_ix_masked] +
        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
        literal_cost[(cur_ix + 2) & ring_buffer_mask] + 0.3;
    double start_cost2 = literal_cost == NULL ? 10 :
        literal_cost[cur_ix_masked] +
        literal_cost[(cur_ix + 1) & ring_buffer_mask] + 1.2;
    bool match_found = false;
    // Don't accept a short copy from far away.
    double best_score = 8.115;
    if (insert_length_ < 4) {
      double cost_diff[4] = { 0.10, 0.04, 0.02, 0.01 };
      best_score += cost_diff[insert_length_];
    }
    size_t best_len = *best_len_out;
    *best_len_out = 0;
    size_t best_ix = 1;
    // Try last distance first.
    for (int i = 0; i < 16; ++i) {
      size_t prev_ix = cur_ix;
      switch(i) {
        case 0: prev_ix -= last_distance1_; break;
        case 1: prev_ix -= last_distance2_; break;
        case 2: prev_ix -= last_distance3_; break;
        case 3: prev_ix -= last_distance4_; break;

        case 4: prev_ix -= last_distance1_ - 1; break;
        case 5: prev_ix -= last_distance1_ + 1; break;
        case 6: prev_ix -= last_distance1_ - 2; break;
        case 7: prev_ix -= last_distance1_ + 2; break;
        case 8: prev_ix -= last_distance1_ - 3; break;
        case 9: prev_ix -= last_distance1_ + 3; break;

        case 10: prev_ix -= last_distance2_ - 1; break;
        case 11: prev_ix -= last_distance2_ + 1; break;
        case 12: prev_ix -= last_distance2_ - 2; break;
        case 13: prev_ix -= last_distance2_ + 2; break;
        case 14: prev_ix -= last_distance2_ - 3; break;
        case 15: prev_ix -= last_distance2_ + 3; break;
      }
      if (prev_ix >= cur_ix) {
        continue;
      }
      const size_t backward = cur_ix - prev_ix;
      if (PREDICT_FALSE(backward > max_backward)) {
        continue;
      }
      prev_ix &= ring_buffer_mask;
      if (cur_ix_masked + best_len > ring_buffer_mask ||
          prev_ix + best_len > ring_buffer_mask ||
          data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len =
          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len >= std::max(kMinLength, 3) ||
          (kMinLength == 2 && len == 2 && i < 2)) {
        // Comparing for >= 2 does not change the semantics, but just saves
        // a few unnecessary binary logarithms in the backward reference
        // score, since we are not interested in such short matches.
        const double score = BackwardReferenceScoreUsingLastDistance(
            average_cost_,
            start_cost4,
            start_cost3,
            start_cost2,
            len, i);
        if (best_score < score) {
          best_score = score;
          best_len = len;
          best_ix = backward;
          *best_len_out = best_len;
          *best_len_code_out = best_len;
          *best_distance_out = best_ix;
          *best_score_out = best_score;
          match_found = true;
          *in_dictionary = backward > max_backward;
        }
      }
    }
    if (kMinLength == 2) {
      int stop = int(cur_ix) - 64;
      if (stop < 0) { stop = 0; }
      start_cost2 -= 1.0;
      for (int i = cur_ix - 1; i > stop; --i) {
        size_t prev_ix = i;
        const size_t backward = cur_ix - prev_ix;
        if (PREDICT_FALSE(backward > max_backward)) {
          break;
        }
        prev_ix &= ring_buffer_mask;
        if (data[cur_ix_masked] != data[prev_ix] ||
            data[cur_ix_masked + 1] != data[prev_ix + 1]) {
          continue;
        }
        int len = 2;
        const double score = start_cost2 - 2.3 * Log2Floor(backward);

        if (best_score < score) {
          best_score = score;
          best_len = len;
          best_ix = backward;
          *best_len_out = best_len;
          *best_len_code_out = best_len;
          *best_distance_out = best_ix;
          match_found = true;
        }
      }
    }
    const uint32_t key = Hash<kBucketBits, kMinLength>(&data[cur_ix_masked]);
    const int * __restrict const bucket = &buckets_[key][0];
    const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
    for (int i = num_[key] - 1; i >= down; --i) {
      int prev_ix = bucket[i & kBlockMask];
      if (prev_ix >= 0) {
        const size_t backward = cur_ix - prev_ix;
        if (PREDICT_FALSE(backward > max_backward)) {
          break;
        }
        prev_ix &= ring_buffer_mask;
        if (cur_ix_masked + best_len > ring_buffer_mask ||
            prev_ix + best_len > ring_buffer_mask ||
            data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
          continue;
        }
        const size_t len =
            FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                     max_length);
        if (len >= std::max(kMinLength, 3)) {
          // Comparing for >= 3 does not change the semantics, but just saves
          // a few unnecessary binary logarithms in the backward reference
          // score, since we are not interested in such short matches.
          const double score = BackwardReferenceScore(average_cost_,
                                                      start_cost4,
                                                      start_cost3,
                                                      start_cost2,
                                                      len, backward);
          if (best_score < score) {
            best_score = score;
            best_len = len;
            best_ix = backward;
            *best_len_out = best_len;
            *best_len_code_out = best_len;
            *best_distance_out = best_ix;
            *best_score_out = best_score;
            match_found = true;
            *in_dictionary = false;
          }
        }
      }
    }
    if (static_dict_ != NULL) {
      // Based on the first 4 bytes, we decide how many bytes to test for.
      int prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]);
      int maxlen = static_dict_->GetLength(prefix);
      for (int len = std::min<size_t>(maxlen, max_length);
           len > best_len && len >= 4; --len) {
        std::string snippet((const char *)&data[cur_ix_masked], len);
        int copy_len_code;
        int word_id;
        if (static_dict_->Get(snippet, &copy_len_code, &word_id)) {
          const size_t backward = max_backward + word_id + 1;
          const double score = BackwardReferenceScore(average_cost_,
                                                      start_cost4,
                                                      start_cost3,
                                                      start_cost2,
                                                      len, backward);
          if (best_score < score) {
            best_score = score;
            best_len = len;
            best_ix = backward;
            *best_len_out = best_len;
            *best_len_code_out = copy_len_code;
            *best_distance_out = best_ix;
            *best_score_out = best_score;
            match_found = true;
            *in_dictionary = true;
          }
        }
      }
    }
    return match_found;
  }

  void set_last_distance(int v) {
    if (last_distance1_ != v) {
      last_distance4_ = last_distance3_;
      last_distance3_ = last_distance2_;
      last_distance2_ = last_distance1_;
      last_distance1_ = v;
    }
  }

  int last_distance() const { return last_distance1_; }

  void set_insert_length(int v) { insert_length_ = v; }

  void set_average_cost(double v) { average_cost_ = v; }

 private:
  // Number of hash buckets.
  static const uint32_t kBucketSize = 1 << kBucketBits;

  // Only the kBlockSize newest backward references are kept;
  // older ones are forgotten.
  static const uint32_t kBlockSize = 1 << kBlockBits;

  // Mask for accessing entries in a block (in a ringbuffer manner).
  static const uint32_t kBlockMask = (1 << kBlockBits) - 1;

  // Number of entries in a particular bucket.
  uint16_t num_[kBucketSize];

  // Buckets containing kBlockSize backward references each.
  int buckets_[kBucketSize][kBlockSize];

  int last_distance1_;
  int last_distance2_;
  int last_distance3_;
  int last_distance4_;

  // Cost adjustment for how many literals we are planning to insert
  // anyway.
  int insert_length_;

  double average_cost_;

  const StaticDictionary *static_dict_;
};
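
// Example usage (an illustrative sketch only; the template parameters and
// the `data`/`data_len` names below are assumptions, not values mandated by
// this header):
//
//   HashLongestMatch<15, 8, 4> hasher;
//   // Index positions of `data` so that later positions can refer back.
//   for (size_t i = 0; i + 4 <= data_len; ++i) {
//     hasher.Store(&data[i], static_cast<int>(i));
//   }
//   // FindLongestMatch() can then be queried at the current position to get
//   // the best backward reference length, distance and score.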

struct Hashers {
  enum Type {
    HASH_15_8_4 = 0,
    HASH_15_8_2 = 1,
  };

  void Init(Type type) {
    switch (type) {
      case HASH_15_8_4:
        hash_15_8_4.reset(new HashLongestMatch<15, 8, 4>());
        break;
      case HASH_15_8_2:
        hash_15_8_2.reset(new HashLongestMatch<15, 8, 2>());
        break;
      default:
        break;
    }
  }

  void SetStaticDictionary(const StaticDictionary *dict) {
    if (hash_15_8_4.get() != NULL) hash_15_8_4->SetStaticDictionary(dict);
    if (hash_15_8_2.get() != NULL) hash_15_8_2->SetStaticDictionary(dict);
  }

  std::unique_ptr<HashLongestMatch<15, 8, 4> > hash_15_8_4;
  std::unique_ptr<HashLongestMatch<15, 8, 2> > hash_15_8_2;
};
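
// Illustrative only (a minimal sketch of selecting and initializing a
// hasher; the surrounding setup and `dict` are assumed, not defined here):
//
//   Hashers hashers;
//   hashers.Init(Hashers::HASH_15_8_4);  // bucket bits 15, block bits 8,
//                                        // minimum match length 4
//   // Optionally attach a static dictionary to the active hasher:
//   // hashers.SetStaticDictionary(&dict);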

}  // namespace brotli

#endif  // BROTLI_ENC_HASH_H_