Home | History | Annotate | Download | only in enc
      1 /* Copyright 2010 Google Inc. All Rights Reserved.
      2 
      3    Distributed under MIT license.
      4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      5 */
      6 
      7 /* A (forgetful) hash table to the data seen by the compressor, to
      8    help create backward references to previous data. */
      9 
     10 #ifndef BROTLI_ENC_HASH_H_
     11 #define BROTLI_ENC_HASH_H_
     12 
     13 #include <string.h>  /* memcmp, memset */
     14 
     15 #include "../common/constants.h"
     16 #include "../common/dictionary.h"
     17 #include <brotli/types.h>
     18 #include "./fast_log.h"
     19 #include "./find_match_length.h"
     20 #include "./memory.h"
     21 #include "./port.h"
     22 #include "./quality.h"
     23 #include "./static_dict.h"
     24 
     25 #if defined(__cplusplus) || defined(c_plusplus)
     26 extern "C" {
     27 #endif
     28 
     29 /* Pointer to hasher data.
     30  *
     31  * Excluding initialization and destruction, hasher can be passed as
     32  * HasherHandle by value.
     33  *
     34  * Typically hasher data consists of 3 sections:
     35  * * HasherCommon structure
     36  * * private structured hasher data, depending on hasher type
     37  * * private dynamic hasher data, depending on hasher type and parameters
     38  */
     39 typedef uint8_t* HasherHandle;
     40 
     41 typedef struct {
     42   BrotliHasherParams params;
     43 
     44   /* False if hasher needs to be "prepared" before use. */
     45   BROTLI_BOOL is_prepared_;
     46 
     47   size_t dict_num_lookups;
     48   size_t dict_num_matches;
     49 } HasherCommon;
     50 
     51 static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) {
     52   return (HasherCommon*)handle;
     53 }
     54 
     55 #define score_t size_t
     56 
     57 static const uint32_t kCutoffTransformsCount = 10;
     58 /*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
     59 /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
     60 static const uint64_t kCutoffTransforms =
     61     BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
     62 
     63 typedef struct HasherSearchResult {
     64   size_t len;
     65   size_t distance;
     66   score_t score;
     67   int len_code_delta; /* == len_code - len */
     68 } HasherSearchResult;
     69 
     70 /* kHashMul32 multiplier has these properties:
     71    * The multiplier must be odd. Otherwise we may lose the highest bit.
     72    * No long streaks of ones or zeros.
     73    * There is no effort to ensure that it is a prime, the oddity is enough
     74      for this use.
     75    * The number has been tuned heuristically against compression benchmarks. */
     76 static const uint32_t kHashMul32 = 0x1e35a7bd;
     77 static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1e35a7bd, 0x1e35a7bd);
     78 static const uint64_t kHashMul64Long =
     79     BROTLI_MAKE_UINT64_T(0x1fe35a7bU, 0xd3579bd3U);
     80 
     81 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
     82   uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
     83   /* The higher bits contain more mixture from the multiplication,
     84      so we take our results from there. */
     85   return h >> (32 - 14);
     86 }
     87 
     88 static BROTLI_INLINE void PrepareDistanceCache(
     89     int* BROTLI_RESTRICT distance_cache, const int num_distances) {
     90   if (num_distances > 4) {
     91     int last_distance = distance_cache[0];
     92     distance_cache[4] = last_distance - 1;
     93     distance_cache[5] = last_distance + 1;
     94     distance_cache[6] = last_distance - 2;
     95     distance_cache[7] = last_distance + 2;
     96     distance_cache[8] = last_distance - 3;
     97     distance_cache[9] = last_distance + 3;
     98     if (num_distances > 10) {
     99       int next_last_distance = distance_cache[1];
    100       distance_cache[10] = next_last_distance - 1;
    101       distance_cache[11] = next_last_distance + 1;
    102       distance_cache[12] = next_last_distance - 2;
    103       distance_cache[13] = next_last_distance + 2;
    104       distance_cache[14] = next_last_distance - 3;
    105       distance_cache[15] = next_last_distance + 3;
    106     }
    107   }
    108 }
    109 
    110 #define BROTLI_LITERAL_BYTE_SCORE 135
    111 #define BROTLI_DISTANCE_BIT_PENALTY 30
    112 /* Score must be positive after applying maximal penalty. */
    113 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
    114 
    115 /* Usually, we always choose the longest backward reference. This function
    116    allows for the exception of that rule.
    117 
    118    If we choose a backward reference that is further away, it will
    119    usually be coded with more bits. We approximate this by assuming
    120    log2(distance). If the distance can be expressed in terms of the
    121    last four distances, we use some heuristic constants to estimate
    122    the bits cost. For the first up to four literals we use the bit
    123    cost of the literals from the literal cost model, after that we
    124    use the average bit cost of the cost model.
    125 
    126    This function is used to sometimes discard a longer backward reference
    127    when it is not much longer and the bit cost for encoding it is more
    128    than the saved literals.
    129 
    130    backward_reference_offset MUST be positive. */
    131 static BROTLI_INLINE score_t BackwardReferenceScore(
    132     size_t copy_length, size_t backward_reference_offset) {
    133   return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
    134       BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
    135 }
    136 
    137 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
    138     size_t copy_length) {
    139   return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
    140       BROTLI_SCORE_BASE + 15;
    141 }
    142 
    143 static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
    144     size_t distance_short_code) {
    145   return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
    146 }
    147 
    148 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
    149     const BrotliDictionary* dictionary, size_t item, const uint8_t* data,
    150     size_t max_length, size_t max_backward, HasherSearchResult* out) {
    151   size_t len;
    152   size_t dist;
    153   size_t offset;
    154   size_t matchlen;
    155   size_t backward;
    156   score_t score;
    157   len = item & 0x1F;
    158   dist = item >> 5;
    159   offset = dictionary->offsets_by_length[len] + len * dist;
    160   if (len > max_length) {
    161     return BROTLI_FALSE;
    162   }
    163 
    164   matchlen =
    165       FindMatchLengthWithLimit(data, &dictionary->data[offset], len);
    166   if (matchlen + kCutoffTransformsCount <= len || matchlen == 0) {
    167     return BROTLI_FALSE;
    168   }
    169   {
    170     size_t cut = len - matchlen;
    171     size_t transform_id =
    172         (cut << 2) + (size_t)((kCutoffTransforms >> (cut * 6)) & 0x3F);
    173     backward = max_backward + dist + 1 +
    174         (transform_id << dictionary->size_bits_by_length[len]);
    175   }
    176   if (backward >= BROTLI_MAX_DISTANCE) {
    177     return BROTLI_FALSE;
    178   }
    179   score = BackwardReferenceScore(matchlen, backward);
    180   if (score < out->score) {
    181     return BROTLI_FALSE;
    182   }
    183   out->len = matchlen;
    184   out->len_code_delta = (int)len - (int)matchlen;
    185   out->distance = backward;
    186   out->score = score;
    187   return BROTLI_TRUE;
    188 }
    189 
    190 static BROTLI_INLINE void SearchInStaticDictionary(
    191     const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
    192     HasherHandle handle, const uint8_t* data, size_t max_length,
    193     size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) {
    194   size_t key;
    195   size_t i;
    196   HasherCommon* self = GetHasherCommon(handle);
    197   if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
    198     return;
    199   }
    200   key = Hash14(data) << 1;
    201   for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
    202     size_t item = dictionary_hash[key];
    203     self->dict_num_lookups++;
    204     if (item != 0) {
    205       BROTLI_BOOL item_matches = TestStaticDictionaryItem(
    206           dictionary, item, data, max_length, max_backward, out);
    207       if (item_matches) {
    208         self->dict_num_matches++;
    209       }
    210     }
    211   }
    212 }
    213 
    214 typedef struct BackwardMatch {
    215   uint32_t distance;
    216   uint32_t length_and_code;
    217 } BackwardMatch;
    218 
    219 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
    220     size_t dist, size_t len) {
    221   self->distance = (uint32_t)dist;
    222   self->length_and_code = (uint32_t)(len << 5);
    223 }
    224 
    225 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
    226     size_t dist, size_t len, size_t len_code) {
    227   self->distance = (uint32_t)dist;
    228   self->length_and_code =
    229       (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
    230 }
    231 
    232 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
    233   return self->length_and_code >> 5;
    234 }
    235 
    236 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
    237   size_t code = self->length_and_code & 31;
    238   return code ? code : BackwardMatchLength(self);
    239 }
    240 
    241 #define EXPAND_CAT(a, b) CAT(a, b)
    242 #define CAT(a, b) a ## b
    243 #define FN(X) EXPAND_CAT(X, HASHER())
    244 
    245 #define HASHER() H10
    246 #define BUCKET_BITS 17
    247 #define MAX_TREE_SEARCH_DEPTH 64
    248 #define MAX_TREE_COMP_LENGTH 128
    249 #include "./hash_to_binary_tree_inc.h"  /* NOLINT(build/include) */
    250 #undef MAX_TREE_SEARCH_DEPTH
    251 #undef MAX_TREE_COMP_LENGTH
    252 #undef BUCKET_BITS
    253 #undef HASHER
    254 /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
    255 #define MAX_NUM_MATCHES_H10 128
    256 
    257 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
    258    a little faster (0.5% - 1%) and it compresses 0.15% better on small text
    259    and HTML inputs. */
    260 
    261 #define HASHER() H2
    262 #define BUCKET_BITS 16
    263 #define BUCKET_SWEEP 1
    264 #define HASH_LEN 5
    265 #define USE_DICTIONARY 1
    266 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    267 #undef BUCKET_SWEEP
    268 #undef USE_DICTIONARY
    269 #undef HASHER
    270 
    271 #define HASHER() H3
    272 #define BUCKET_SWEEP 2
    273 #define USE_DICTIONARY 0
    274 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    275 #undef USE_DICTIONARY
    276 #undef BUCKET_SWEEP
    277 #undef BUCKET_BITS
    278 #undef HASHER
    279 
    280 #define HASHER() H4
    281 #define BUCKET_BITS 17
    282 #define BUCKET_SWEEP 4
    283 #define USE_DICTIONARY 1
    284 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    285 #undef USE_DICTIONARY
    286 #undef HASH_LEN
    287 #undef BUCKET_SWEEP
    288 #undef BUCKET_BITS
    289 #undef HASHER
    290 
    291 #define HASHER() H5
    292 #include "./hash_longest_match_inc.h"  /* NOLINT(build/include) */
    293 #undef HASHER
    294 
    295 #define HASHER() H6
    296 #include "./hash_longest_match64_inc.h"  /* NOLINT(build/include) */
    297 #undef HASHER
    298 
    299 #define BUCKET_BITS 15
    300 
    301 #define NUM_LAST_DISTANCES_TO_CHECK 4
    302 #define NUM_BANKS 1
    303 #define BANK_BITS 16
    304 #define HASHER() H40
    305 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
    306 #undef HASHER
    307 #undef NUM_LAST_DISTANCES_TO_CHECK
    308 
    309 #define NUM_LAST_DISTANCES_TO_CHECK 10
    310 #define HASHER() H41
    311 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
    312 #undef HASHER
    313 #undef NUM_LAST_DISTANCES_TO_CHECK
    314 #undef NUM_BANKS
    315 #undef BANK_BITS
    316 
    317 #define NUM_LAST_DISTANCES_TO_CHECK 16
    318 #define NUM_BANKS 512
    319 #define BANK_BITS 9
    320 #define HASHER() H42
    321 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
    322 #undef HASHER
    323 #undef NUM_LAST_DISTANCES_TO_CHECK
    324 #undef NUM_BANKS
    325 #undef BANK_BITS
    326 
    327 #undef BUCKET_BITS
    328 
    329 #define HASHER() H54
    330 #define BUCKET_BITS 20
    331 #define BUCKET_SWEEP 4
    332 #define HASH_LEN 7
    333 #define USE_DICTIONARY 0
    334 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    335 #undef USE_DICTIONARY
    336 #undef HASH_LEN
    337 #undef BUCKET_SWEEP
    338 #undef BUCKET_BITS
    339 #undef HASHER
    340 
    341 #undef FN
    342 #undef CAT
    343 #undef EXPAND_CAT
    344 
    345 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)
    346 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
    347 
    348 static BROTLI_INLINE void DestroyHasher(
    349     MemoryManager* m, HasherHandle* handle) {
    350   if (*handle == NULL) return;
    351   BROTLI_FREE(m, *handle);
    352 }
    353 
    354 static BROTLI_INLINE void HasherReset(HasherHandle handle) {
    355   if (handle == NULL) return;
    356   GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE;
    357 }
    358 
    359 static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
    360     BROTLI_BOOL one_shot, const size_t input_size) {
    361   size_t result = sizeof(HasherCommon);
    362   switch (params->hasher.type) {
    363 #define SIZE_(N)                                                         \
    364     case N:                                                              \
    365       result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \
    366       break;
    367     FOR_ALL_HASHERS(SIZE_)
    368 #undef SIZE_
    369     default:
    370       break;
    371   }
    372   return result;
    373 }
    374 
    375 static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle,
    376     BrotliEncoderParams* params, const uint8_t* data, size_t position,
    377     size_t input_size, BROTLI_BOOL is_last) {
    378   HasherHandle self = NULL;
    379   HasherCommon* common = NULL;
    380   BROTLI_BOOL one_shot = (position == 0 && is_last);
    381   if (*handle == NULL) {
    382     size_t alloc_size;
    383     ChooseHasher(params, &params->hasher);
    384     alloc_size = HasherSize(params, one_shot, input_size);
    385     self = BROTLI_ALLOC(m, uint8_t, alloc_size);
    386     if (BROTLI_IS_OOM(m)) return;
    387     *handle = self;
    388     common = GetHasherCommon(self);
    389     common->params = params->hasher;
    390     switch (common->params.type) {
    391 #define INITIALIZE_(N)                     \
    392       case N:                              \
    393         InitializeH ## N(*handle, params); \
    394         break;
    395       FOR_ALL_HASHERS(INITIALIZE_);
    396 #undef INITIALIZE_
    397       default:
    398         break;
    399     }
    400     HasherReset(*handle);
    401   }
    402 
    403   self = *handle;
    404   common = GetHasherCommon(self);
    405   if (!common->is_prepared_) {
    406     switch (common->params.type) {
    407 #define PREPARE_(N)                                      \
    408       case N:                                            \
    409         PrepareH ## N(self, one_shot, input_size, data); \
    410         break;
    411       FOR_ALL_HASHERS(PREPARE_)
    412 #undef PREPARE_
    413       default: break;
    414     }
    415     if (position == 0) {
    416         common->dict_num_lookups = 0;
    417         common->dict_num_matches = 0;
    418     }
    419     common->is_prepared_ = BROTLI_TRUE;
    420   }
    421 }
    422 
    423 static BROTLI_INLINE void InitOrStitchToPreviousBlock(
    424     MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask,
    425     BrotliEncoderParams* params, size_t position, size_t input_size,
    426     BROTLI_BOOL is_last) {
    427   HasherHandle self;
    428   HasherSetup(m, handle, params, data, position, input_size, is_last);
    429   if (BROTLI_IS_OOM(m)) return;
    430   self = *handle;
    431   switch (GetHasherCommon(self)->params.type) {
    432 #define INIT_(N)                                                           \
    433     case N:                                                                \
    434       StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \
    435     break;
    436     FOR_ALL_HASHERS(INIT_)
    437 #undef INIT_
    438     default: break;
    439   }
    440 }
    441 
    442 #if defined(__cplusplus) || defined(c_plusplus)
    443 }  /* extern "C" */
    444 #endif
    445 
    446 #endif  /* BROTLI_ENC_HASH_H_ */
    447