Home | History | Annotate | Download | only in enc
      1 /* Copyright 2010 Google Inc. All Rights Reserved.
      2 
      3    Distributed under MIT license.
      4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      5 */
      6 
      7 /* A (forgetful) hash table to the data seen by the compressor, to
      8    help create backward references to previous data. */
      9 
     10 #ifndef BROTLI_ENC_HASH_H_
     11 #define BROTLI_ENC_HASH_H_
     12 
     13 #include <string.h>  /* memcmp, memset */
     14 
     15 #include "../common/constants.h"
     16 #include "../common/dictionary.h"
     17 #include "../common/platform.h"
     18 #include <brotli/types.h>
     19 #include "./encoder_dict.h"
     20 #include "./fast_log.h"
     21 #include "./find_match_length.h"
     22 #include "./memory.h"
     23 #include "./quality.h"
     24 #include "./static_dict.h"
     25 
     26 #if defined(__cplusplus) || defined(c_plusplus)
     27 extern "C" {
     28 #endif
     29 
     30 /* Pointer to hasher data.
     31  *
     32  * Excluding initialization and destruction, hasher can be passed as
     33  * HasherHandle by value.
     34  *
     35  * Typically hasher data consists of 3 sections:
     36  * * HasherCommon structure
     37  * * private structured hasher data, depending on hasher type
     38  * * private dynamic hasher data, depending on hasher type and parameters
     39  *
     40  * Using "define" instead of "typedef", because on MSVC __restrict does not work
     41  * on typedef pointer types. */
     42 #define HasherHandle uint8_t*
     43 
     44 typedef struct {
     45   BrotliHasherParams params;
     46 
     47   /* False if hasher needs to be "prepared" before use. */
     48   BROTLI_BOOL is_prepared_;
     49 
     50   size_t dict_num_lookups;
     51   size_t dict_num_matches;
     52 } HasherCommon;
     53 
     54 static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) {
     55   return (HasherCommon*)handle;
     56 }
     57 
     58 #define score_t size_t
     59 
     60 static const uint32_t kCutoffTransformsCount = 10;
     61 /*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
     62 /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
     63 static const uint64_t kCutoffTransforms =
     64     BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
     65 
     66 typedef struct HasherSearchResult {
     67   size_t len;
     68   size_t distance;
     69   score_t score;
     70   int len_code_delta; /* == len_code - len */
     71 } HasherSearchResult;
     72 
     73 /* kHashMul32 multiplier has these properties:
     74    * The multiplier must be odd. Otherwise we may lose the highest bit.
     75    * No long streaks of ones or zeros.
     76    * There is no effort to ensure that it is a prime, the oddity is enough
     77      for this use.
     78    * The number has been tuned heuristically against compression benchmarks. */
     79 static const uint32_t kHashMul32 = 0x1E35A7BD;
     80 static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1E35A7BD, 0x1E35A7BD);
     81 static const uint64_t kHashMul64Long =
     82     BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u);
     83 
     84 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
     85   uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
     86   /* The higher bits contain more mixture from the multiplication,
     87      so we take our results from there. */
     88   return h >> (32 - 14);
     89 }
     90 
     91 static BROTLI_INLINE void PrepareDistanceCache(
     92     int* BROTLI_RESTRICT distance_cache, const int num_distances) {
     93   if (num_distances > 4) {
     94     int last_distance = distance_cache[0];
     95     distance_cache[4] = last_distance - 1;
     96     distance_cache[5] = last_distance + 1;
     97     distance_cache[6] = last_distance - 2;
     98     distance_cache[7] = last_distance + 2;
     99     distance_cache[8] = last_distance - 3;
    100     distance_cache[9] = last_distance + 3;
    101     if (num_distances > 10) {
    102       int next_last_distance = distance_cache[1];
    103       distance_cache[10] = next_last_distance - 1;
    104       distance_cache[11] = next_last_distance + 1;
    105       distance_cache[12] = next_last_distance - 2;
    106       distance_cache[13] = next_last_distance + 2;
    107       distance_cache[14] = next_last_distance - 3;
    108       distance_cache[15] = next_last_distance + 3;
    109     }
    110   }
    111 }
    112 
    113 #define BROTLI_LITERAL_BYTE_SCORE 135
    114 #define BROTLI_DISTANCE_BIT_PENALTY 30
    115 /* Score must be positive after applying maximal penalty. */
    116 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
    117 
    118 /* Usually, we always choose the longest backward reference. This function
    119    allows for the exception of that rule.
    120 
    121    If we choose a backward reference that is further away, it will
    122    usually be coded with more bits. We approximate this by assuming
    123    log2(distance). If the distance can be expressed in terms of the
    124    last four distances, we use some heuristic constants to estimate
    125    the bits cost. For the first up to four literals we use the bit
    126    cost of the literals from the literal cost model, after that we
    127    use the average bit cost of the cost model.
    128 
    129    This function is used to sometimes discard a longer backward reference
    130    when it is not much longer and the bit cost for encoding it is more
    131    than the saved literals.
    132 
    133    backward_reference_offset MUST be positive. */
    134 static BROTLI_INLINE score_t BackwardReferenceScore(
    135     size_t copy_length, size_t backward_reference_offset) {
    136   return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
    137       BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
    138 }
    139 
    140 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
    141     size_t copy_length) {
    142   return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
    143       BROTLI_SCORE_BASE + 15;
    144 }
    145 
    146 static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
    147     size_t distance_short_code) {
    148   return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
    149 }
    150 
    151 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
    152     const BrotliEncoderDictionary* dictionary, size_t item,
    153     const uint8_t* data, size_t max_length, size_t max_backward,
    154     size_t max_distance, HasherSearchResult* out) {
    155   size_t len;
    156   size_t word_idx;
    157   size_t offset;
    158   size_t matchlen;
    159   size_t backward;
    160   score_t score;
    161   len = item & 0x1F;
    162   word_idx = item >> 5;
    163   offset = dictionary->words->offsets_by_length[len] + len * word_idx;
    164   if (len > max_length) {
    165     return BROTLI_FALSE;
    166   }
    167 
    168   matchlen =
    169       FindMatchLengthWithLimit(data, &dictionary->words->data[offset], len);
    170   if (matchlen + dictionary->cutoffTransformsCount <= len || matchlen == 0) {
    171     return BROTLI_FALSE;
    172   }
    173   {
    174     size_t cut = len - matchlen;
    175     size_t transform_id = (cut << 2) +
    176         (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F);
    177     backward = max_backward + 1 + word_idx +
    178         (transform_id << dictionary->words->size_bits_by_length[len]);
    179   }
    180   if (backward > max_distance) {
    181     return BROTLI_FALSE;
    182   }
    183   score = BackwardReferenceScore(matchlen, backward);
    184   if (score < out->score) {
    185     return BROTLI_FALSE;
    186   }
    187   out->len = matchlen;
    188   out->len_code_delta = (int)len - (int)matchlen;
    189   out->distance = backward;
    190   out->score = score;
    191   return BROTLI_TRUE;
    192 }
    193 
    194 static BROTLI_INLINE void SearchInStaticDictionary(
    195     const BrotliEncoderDictionary* dictionary,
    196     HasherHandle handle, const uint8_t* data, size_t max_length,
    197     size_t max_backward, size_t max_distance,
    198     HasherSearchResult* out, BROTLI_BOOL shallow) {
    199   size_t key;
    200   size_t i;
    201   HasherCommon* self = GetHasherCommon(handle);
    202   if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
    203     return;
    204   }
    205   key = Hash14(data) << 1;
    206   for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
    207     size_t item = dictionary->hash_table[key];
    208     self->dict_num_lookups++;
    209     if (item != 0) {
    210       BROTLI_BOOL item_matches = TestStaticDictionaryItem(
    211           dictionary, item, data,
    212           max_length, max_backward, max_distance, out);
    213       if (item_matches) {
    214         self->dict_num_matches++;
    215       }
    216     }
    217   }
    218 }
    219 
    220 typedef struct BackwardMatch {
    221   uint32_t distance;
    222   uint32_t length_and_code;
    223 } BackwardMatch;
    224 
    225 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
    226     size_t dist, size_t len) {
    227   self->distance = (uint32_t)dist;
    228   self->length_and_code = (uint32_t)(len << 5);
    229 }
    230 
    231 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
    232     size_t dist, size_t len, size_t len_code) {
    233   self->distance = (uint32_t)dist;
    234   self->length_and_code =
    235       (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
    236 }
    237 
    238 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
    239   return self->length_and_code >> 5;
    240 }
    241 
    242 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
    243   size_t code = self->length_and_code & 31;
    244   return code ? code : BackwardMatchLength(self);
    245 }
    246 
    247 #define EXPAND_CAT(a, b) CAT(a, b)
    248 #define CAT(a, b) a ## b
    249 #define FN(X) EXPAND_CAT(X, HASHER())
    250 
    251 #define HASHER() H10
    252 #define BUCKET_BITS 17
    253 #define MAX_TREE_SEARCH_DEPTH 64
    254 #define MAX_TREE_COMP_LENGTH 128
    255 #include "./hash_to_binary_tree_inc.h"  /* NOLINT(build/include) */
    256 #undef MAX_TREE_SEARCH_DEPTH
    257 #undef MAX_TREE_COMP_LENGTH
    258 #undef BUCKET_BITS
    259 #undef HASHER
    260 /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
    261 #define MAX_NUM_MATCHES_H10 128
    262 
    263 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
    264    a little faster (0.5% - 1%) and it compresses 0.15% better on small text
    265    and HTML inputs. */
    266 
    267 #define HASHER() H2
    268 #define BUCKET_BITS 16
    269 #define BUCKET_SWEEP 1
    270 #define HASH_LEN 5
    271 #define USE_DICTIONARY 1
    272 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    273 #undef BUCKET_SWEEP
    274 #undef USE_DICTIONARY
    275 #undef HASHER
    276 
    277 #define HASHER() H3
    278 #define BUCKET_SWEEP 2
    279 #define USE_DICTIONARY 0
    280 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    281 #undef USE_DICTIONARY
    282 #undef BUCKET_SWEEP
    283 #undef BUCKET_BITS
    284 #undef HASHER
    285 
    286 #define HASHER() H4
    287 #define BUCKET_BITS 17
    288 #define BUCKET_SWEEP 4
    289 #define USE_DICTIONARY 1
    290 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    291 #undef USE_DICTIONARY
    292 #undef HASH_LEN
    293 #undef BUCKET_SWEEP
    294 #undef BUCKET_BITS
    295 #undef HASHER
    296 
    297 #define HASHER() H5
    298 #include "./hash_longest_match_inc.h"  /* NOLINT(build/include) */
    299 #undef HASHER
    300 
    301 #define HASHER() H6
    302 #include "./hash_longest_match64_inc.h"  /* NOLINT(build/include) */
    303 #undef HASHER
    304 
    305 #define BUCKET_BITS 15
    306 
    307 #define NUM_LAST_DISTANCES_TO_CHECK 4
    308 #define NUM_BANKS 1
    309 #define BANK_BITS 16
    310 #define HASHER() H40
    311 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
    312 #undef HASHER
    313 #undef NUM_LAST_DISTANCES_TO_CHECK
    314 
    315 #define NUM_LAST_DISTANCES_TO_CHECK 10
    316 #define HASHER() H41
    317 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
    318 #undef HASHER
    319 #undef NUM_LAST_DISTANCES_TO_CHECK
    320 #undef NUM_BANKS
    321 #undef BANK_BITS
    322 
    323 #define NUM_LAST_DISTANCES_TO_CHECK 16
    324 #define NUM_BANKS 512
    325 #define BANK_BITS 9
    326 #define HASHER() H42
    327 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
    328 #undef HASHER
    329 #undef NUM_LAST_DISTANCES_TO_CHECK
    330 #undef NUM_BANKS
    331 #undef BANK_BITS
    332 
    333 #undef BUCKET_BITS
    334 
    335 #define HASHER() H54
    336 #define BUCKET_BITS 20
    337 #define BUCKET_SWEEP 4
    338 #define HASH_LEN 7
    339 #define USE_DICTIONARY 0
    340 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    341 #undef USE_DICTIONARY
    342 #undef HASH_LEN
    343 #undef BUCKET_SWEEP
    344 #undef BUCKET_BITS
    345 #undef HASHER
    346 
    347 /* fast large window hashers */
    348 
    349 #define HASHER() HROLLING_FAST
    350 #define CHUNKLEN 32
    351 #define JUMP 4
    352 #define NUMBUCKETS 16777216
    353 #define MASK ((NUMBUCKETS * 64) - 1)
    354 #include "./hash_rolling_inc.h"  /* NOLINT(build/include) */
    355 #undef JUMP
    356 #undef HASHER
    357 
    358 
    359 #define HASHER() HROLLING
    360 #define JUMP 1
    361 #include "./hash_rolling_inc.h"  /* NOLINT(build/include) */
    362 #undef MASK
    363 #undef NUMBUCKETS
    364 #undef JUMP
    365 #undef CHUNKLEN
    366 #undef HASHER
    367 
    368 #define HASHER() H35
    369 #define HASHER_A H3
    370 #define HASHER_B HROLLING_FAST
    371 #include "./hash_composite_inc.h"  /* NOLINT(build/include) */
    372 #undef HASHER_A
    373 #undef HASHER_B
    374 #undef HASHER
    375 
    376 #define HASHER() H55
    377 #define HASHER_A H54
    378 #define HASHER_B HROLLING_FAST
    379 #include "./hash_composite_inc.h"  /* NOLINT(build/include) */
    380 #undef HASHER_A
    381 #undef HASHER_B
    382 #undef HASHER
    383 
    384 #define HASHER() H65
    385 #define HASHER_A H6
    386 #define HASHER_B HROLLING
    387 #include "./hash_composite_inc.h"  /* NOLINT(build/include) */
    388 #undef HASHER_A
    389 #undef HASHER_B
    390 #undef HASHER
    391 
    392 #undef FN
    393 #undef CAT
    394 #undef EXPAND_CAT
    395 
    396 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)\
    397                                H(35) H(55) H(65)
    398 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
    399 
    400 static BROTLI_INLINE void DestroyHasher(
    401     MemoryManager* m, HasherHandle* handle) {
    402   if (*handle == NULL) return;
    403   BROTLI_FREE(m, *handle);
    404 }
    405 
    406 static BROTLI_INLINE void HasherReset(HasherHandle handle) {
    407   if (handle == NULL) return;
    408   GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE;
    409 }
    410 
    411 static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
    412     BROTLI_BOOL one_shot, const size_t input_size) {
    413   size_t result = sizeof(HasherCommon);
    414   switch (params->hasher.type) {
    415 #define SIZE_(N)                                                         \
    416     case N:                                                              \
    417       result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \
    418       break;
    419     FOR_ALL_HASHERS(SIZE_)
    420 #undef SIZE_
    421     default:
    422       break;
    423   }
    424   return result;
    425 }
    426 
    427 static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle,
    428     BrotliEncoderParams* params, const uint8_t* data, size_t position,
    429     size_t input_size, BROTLI_BOOL is_last) {
    430   HasherHandle self = NULL;
    431   HasherCommon* common = NULL;
    432   BROTLI_BOOL one_shot = (position == 0 && is_last);
    433   if (*handle == NULL) {
    434     size_t alloc_size;
    435     ChooseHasher(params, &params->hasher);
    436     alloc_size = HasherSize(params, one_shot, input_size);
    437     self = BROTLI_ALLOC(m, uint8_t, alloc_size);
    438     if (BROTLI_IS_OOM(m)) return;
    439     *handle = self;
    440     common = GetHasherCommon(self);
    441     common->params = params->hasher;
    442     switch (common->params.type) {
    443 #define INITIALIZE_(N)                     \
    444       case N:                              \
    445         InitializeH ## N(*handle, params); \
    446         break;
    447       FOR_ALL_HASHERS(INITIALIZE_);
    448 #undef INITIALIZE_
    449       default:
    450         break;
    451     }
    452     HasherReset(*handle);
    453   }
    454 
    455   self = *handle;
    456   common = GetHasherCommon(self);
    457   if (!common->is_prepared_) {
    458     switch (common->params.type) {
    459 #define PREPARE_(N)                                      \
    460       case N:                                            \
    461         PrepareH ## N(self, one_shot, input_size, data); \
    462         break;
    463       FOR_ALL_HASHERS(PREPARE_)
    464 #undef PREPARE_
    465       default: break;
    466     }
    467     if (position == 0) {
    468         common->dict_num_lookups = 0;
    469         common->dict_num_matches = 0;
    470     }
    471     common->is_prepared_ = BROTLI_TRUE;
    472   }
    473 }
    474 
    475 static BROTLI_INLINE void InitOrStitchToPreviousBlock(
    476     MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask,
    477     BrotliEncoderParams* params, size_t position, size_t input_size,
    478     BROTLI_BOOL is_last) {
    479   HasherHandle self;
    480   HasherSetup(m, handle, params, data, position, input_size, is_last);
    481   if (BROTLI_IS_OOM(m)) return;
    482   self = *handle;
    483   switch (GetHasherCommon(self)->params.type) {
    484 #define INIT_(N)                                                           \
    485     case N:                                                                \
    486       StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \
    487     break;
    488     FOR_ALL_HASHERS(INIT_)
    489 #undef INIT_
    490     default: break;
    491   }
    492 }
    493 
    494 #if defined(__cplusplus) || defined(c_plusplus)
    495 }  /* extern "C" */
    496 #endif
    497 
    498 #endif  /* BROTLI_ENC_HASH_H_ */
    499