Home | History | Annotate | Download | only in enc
      1 /* NOLINT(build/header_guard) */
      2 /* Copyright 2016 Google Inc. All Rights Reserved.
      3 
      4    Distributed under MIT license.
      5    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      6 */
      7 
      8 /* template parameters: FN, BUCKET_BITS, MAX_TREE_COMP_LENGTH,
      9                         MAX_TREE_SEARCH_DEPTH */
     10 
     11 /* A (forgetful) hash table where each hash bucket contains a binary tree of
     12    sequences whose first 4 bytes share the same hash code.
     13    Each sequence is MAX_TREE_COMP_LENGTH long and is identified by its starting
     14    position in the input data. The binary tree is sorted by the lexicographic
     15    order of the sequences, and it is also a max-heap with respect to the
     16    starting positions. */
     17 
     18 #define HashToBinaryTree HASHER()
     19 
     20 #define BUCKET_SIZE (1 << BUCKET_BITS)
     21 
     22 static BROTLI_INLINE size_t FN(HashTypeLength)(void) { return 4; }
     23 static BROTLI_INLINE size_t FN(StoreLookahead)(void) {
     24   return MAX_TREE_COMP_LENGTH;
     25 }
     26 
     27 static uint32_t FN(HashBytes)(const uint8_t *data) {
     28   uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
     29   /* The higher bits contain more mixture from the multiplication,
     30      so we take our results from there. */
     31   return h >> (32 - BUCKET_BITS);
     32 }
     33 
     34 typedef struct HashToBinaryTree {
     35   /* The window size minus 1 */
     36   size_t window_mask_;
     37 
     38   /* Hash table that maps the 4-byte hashes of the sequence to the last
     39      position where this hash was found, which is the root of the binary
     40      tree of sequences that share this hash bucket. */
     41   uint32_t buckets_[BUCKET_SIZE];
     42 
     43   /* A position used to mark a non-existent sequence, i.e. a tree is empty if
     44      its root is at invalid_pos_ and a node is a leaf if both its children
     45      are at invalid_pos_. */
     46   uint32_t invalid_pos_;
     47 
     48   /* --- Dynamic size members --- */
     49 
     50   /* The union of the binary trees of each hash bucket. The root of the tree
     51      corresponding to a hash is a sequence starting at buckets_[hash] and
     52      the left and right children of a sequence starting at pos are
     53      forest_[2 * pos] and forest_[2 * pos + 1]. */
     54   /* uint32_t forest[2 * num_nodes] */
     55 } HashToBinaryTree;
     56 
     57 static BROTLI_INLINE HashToBinaryTree* FN(Self)(HasherHandle handle) {
     58   return (HashToBinaryTree*)&(GetHasherCommon(handle)[1]);
     59 }
     60 
     61 static BROTLI_INLINE uint32_t* FN(Forest)(HashToBinaryTree* self) {
     62   return (uint32_t*)(&self[1]);
     63 }
     64 
     65 static void FN(Initialize)(
     66     HasherHandle handle, const BrotliEncoderParams* params) {
     67   HashToBinaryTree* self = FN(Self)(handle);
     68   self->window_mask_ = (1u << params->lgwin) - 1u;
     69   self->invalid_pos_ = (uint32_t)(0 - self->window_mask_);
     70 }
     71 
     72 static void FN(Prepare)(HasherHandle handle, BROTLI_BOOL one_shot,
     73     size_t input_size, const uint8_t* data) {
     74   HashToBinaryTree* self = FN(Self)(handle);
     75   uint32_t invalid_pos = self->invalid_pos_;
     76   uint32_t i;
     77   BROTLI_UNUSED(data);
     78   BROTLI_UNUSED(one_shot);
     79   BROTLI_UNUSED(input_size);
     80   for (i = 0; i < BUCKET_SIZE; i++) {
     81     self->buckets_[i] = invalid_pos;
     82   }
     83 }
     84 
     85 static BROTLI_INLINE size_t FN(HashMemAllocInBytes)(
     86     const BrotliEncoderParams* params, BROTLI_BOOL one_shot,
     87     size_t input_size) {
     88   size_t num_nodes = (size_t)1 << params->lgwin;
     89   if (one_shot && input_size < num_nodes) {
     90     num_nodes = input_size;
     91   }
     92   return sizeof(HashToBinaryTree) + 2 * sizeof(uint32_t) * num_nodes;
     93 }
     94 
     95 static BROTLI_INLINE size_t FN(LeftChildIndex)(HashToBinaryTree* self,
     96     const size_t pos) {
     97   return 2 * (pos & self->window_mask_);
     98 }
     99 
    100 static BROTLI_INLINE size_t FN(RightChildIndex)(HashToBinaryTree* self,
    101     const size_t pos) {
    102   return 2 * (pos & self->window_mask_) + 1;
    103 }
    104 
    105 /* Stores the hash of the next 4 bytes and in a single tree-traversal, the
    106    hash bucket's binary tree is searched for matches and is re-rooted at the
    107    current position.
    108 
    109    If less than MAX_TREE_COMP_LENGTH data is available, the hash bucket of the
    110    current position is searched for matches, but the state of the hash table
    111    is not changed, since we can not know the final sorting order of the
    112    current (incomplete) sequence.
    113 
    114    This function must be called with increasing cur_ix positions. */
    115 static BROTLI_INLINE BackwardMatch* FN(StoreAndFindMatches)(
    116     HashToBinaryTree* self, const uint8_t* const BROTLI_RESTRICT data,
    117     const size_t cur_ix, const size_t ring_buffer_mask, const size_t max_length,
    118     const size_t max_backward, size_t* const BROTLI_RESTRICT best_len,
    119     BackwardMatch* BROTLI_RESTRICT matches) {
    120   const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    121   const size_t max_comp_len =
    122       BROTLI_MIN(size_t, max_length, MAX_TREE_COMP_LENGTH);
    123   const BROTLI_BOOL should_reroot_tree =
    124       TO_BROTLI_BOOL(max_length >= MAX_TREE_COMP_LENGTH);
    125   const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]);
    126   uint32_t* forest = FN(Forest)(self);
    127   size_t prev_ix = self->buckets_[key];
    128   /* The forest index of the rightmost node of the left subtree of the new
    129      root, updated as we traverse and re-root the tree of the hash bucket. */
    130   size_t node_left = FN(LeftChildIndex)(self, cur_ix);
    131   /* The forest index of the leftmost node of the right subtree of the new
    132      root, updated as we traverse and re-root the tree of the hash bucket. */
    133   size_t node_right = FN(RightChildIndex)(self, cur_ix);
    134   /* The match length of the rightmost node of the left subtree of the new
    135      root, updated as we traverse and re-root the tree of the hash bucket. */
    136   size_t best_len_left = 0;
    137   /* The match length of the leftmost node of the right subtree of the new
    138      root, updated as we traverse and re-root the tree of the hash bucket. */
    139   size_t best_len_right = 0;
    140   size_t depth_remaining;
    141   if (should_reroot_tree) {
    142     self->buckets_[key] = (uint32_t)cur_ix;
    143   }
    144   for (depth_remaining = MAX_TREE_SEARCH_DEPTH; ; --depth_remaining) {
    145     const size_t backward = cur_ix - prev_ix;
    146     const size_t prev_ix_masked = prev_ix & ring_buffer_mask;
    147     if (backward == 0 || backward > max_backward || depth_remaining == 0) {
    148       if (should_reroot_tree) {
    149         forest[node_left] = self->invalid_pos_;
    150         forest[node_right] = self->invalid_pos_;
    151       }
    152       break;
    153     }
    154     {
    155       const size_t cur_len = BROTLI_MIN(size_t, best_len_left, best_len_right);
    156       size_t len;
    157       assert(cur_len <= MAX_TREE_COMP_LENGTH);
    158       len = cur_len +
    159           FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len],
    160                                    &data[prev_ix_masked + cur_len],
    161                                    max_length - cur_len);
    162       assert(0 == memcmp(&data[cur_ix_masked], &data[prev_ix_masked], len));
    163       if (matches && len > *best_len) {
    164         *best_len = len;
    165         InitBackwardMatch(matches++, backward, len);
    166       }
    167       if (len >= max_comp_len) {
    168         if (should_reroot_tree) {
    169           forest[node_left] = forest[FN(LeftChildIndex)(self, prev_ix)];
    170           forest[node_right] = forest[FN(RightChildIndex)(self, prev_ix)];
    171         }
    172         break;
    173       }
    174       if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) {
    175         best_len_left = len;
    176         if (should_reroot_tree) {
    177           forest[node_left] = (uint32_t)prev_ix;
    178         }
    179         node_left = FN(RightChildIndex)(self, prev_ix);
    180         prev_ix = forest[node_left];
    181       } else {
    182         best_len_right = len;
    183         if (should_reroot_tree) {
    184           forest[node_right] = (uint32_t)prev_ix;
    185         }
    186         node_right = FN(LeftChildIndex)(self, prev_ix);
    187         prev_ix = forest[node_right];
    188       }
    189     }
    190   }
    191   return matches;
    192 }
    193 
    194 /* Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
    195    length of max_length and stores the position cur_ix in the hash table.
    196 
    197    Sets *num_matches to the number of matches found, and stores the found
    198    matches in matches[0] to matches[*num_matches - 1]. The matches will be
    199    sorted by strictly increasing length and (non-strictly) increasing
    200    distance. */
    201 static BROTLI_INLINE size_t FN(FindAllMatches)(HasherHandle handle,
    202     const BrotliDictionary* dictionary, const uint8_t* data,
    203     const size_t ring_buffer_mask, const size_t cur_ix,
    204     const size_t max_length, const size_t max_backward, const size_t gap,
    205     const BrotliEncoderParams* params, BackwardMatch* matches) {
    206   BackwardMatch* const orig_matches = matches;
    207   const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    208   size_t best_len = 1;
    209   const size_t short_match_max_backward =
    210       params->quality != HQ_ZOPFLIFICATION_QUALITY ? 16 : 64;
    211   size_t stop = cur_ix - short_match_max_backward;
    212   uint32_t dict_matches[BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1];
    213   size_t i;
    214   if (cur_ix < short_match_max_backward) { stop = 0; }
    215   for (i = cur_ix - 1; i > stop && best_len <= 2; --i) {
    216     size_t prev_ix = i;
    217     const size_t backward = cur_ix - prev_ix;
    218     if (BROTLI_PREDICT_FALSE(backward > max_backward)) {
    219       break;
    220     }
    221     prev_ix &= ring_buffer_mask;
    222     if (data[cur_ix_masked] != data[prev_ix] ||
    223         data[cur_ix_masked + 1] != data[prev_ix + 1]) {
    224       continue;
    225     }
    226     {
    227       const size_t len =
    228           FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
    229                                    max_length);
    230       if (len > best_len) {
    231         best_len = len;
    232         InitBackwardMatch(matches++, backward, len);
    233       }
    234     }
    235   }
    236   if (best_len < max_length) {
    237     matches = FN(StoreAndFindMatches)(FN(Self)(handle), data, cur_ix,
    238         ring_buffer_mask, max_length, max_backward, &best_len, matches);
    239   }
    240   for (i = 0; i <= BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN; ++i) {
    241     dict_matches[i] = kInvalidMatch;
    242   }
    243   {
    244     size_t minlen = BROTLI_MAX(size_t, 4, best_len + 1);
    245     if (BrotliFindAllStaticDictionaryMatches(dictionary,
    246         &data[cur_ix_masked], minlen, max_length, &dict_matches[0])) {
    247       size_t maxlen = BROTLI_MIN(
    248           size_t, BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN, max_length);
    249       size_t l;
    250       for (l = minlen; l <= maxlen; ++l) {
    251         uint32_t dict_id = dict_matches[l];
    252         if (dict_id < kInvalidMatch) {
    253           size_t distance = max_backward + gap + (dict_id >> 5) + 1;
    254           if (distance < BROTLI_MAX_DISTANCE) {
    255             InitDictionaryBackwardMatch(matches++, distance, l, dict_id & 31);
    256           }
    257         }
    258       }
    259     }
    260   }
    261   return (size_t)(matches - orig_matches);
    262 }
    263 
    264 /* Stores the hash of the next 4 bytes and re-roots the binary tree at the
    265    current sequence, without returning any matches.
    266    REQUIRES: ix + MAX_TREE_COMP_LENGTH <= end-of-current-block */
    267 static BROTLI_INLINE void FN(Store)(HasherHandle handle, const uint8_t *data,
    268     const size_t mask, const size_t ix) {
    269   HashToBinaryTree* self = FN(Self)(handle);
    270   /* Maximum distance is window size - 16, see section 9.1. of the spec. */
    271   const size_t max_backward = self->window_mask_ - BROTLI_WINDOW_GAP + 1;
    272   FN(StoreAndFindMatches)(self, data, ix, mask, MAX_TREE_COMP_LENGTH,
    273       max_backward, NULL, NULL);
    274 }
    275 
    276 static BROTLI_INLINE void FN(StoreRange)(HasherHandle handle,
    277     const uint8_t *data, const size_t mask, const size_t ix_start,
    278     const size_t ix_end) {
    279   size_t i = ix_start;
    280   size_t j = ix_start;
    281   if (ix_start + 63 <= ix_end) {
    282     i = ix_end - 63;
    283   }
    284   if (ix_start + 512 <= i) {
    285     for (; j < i; j += 8) {
    286       FN(Store)(handle, data, mask, j);
    287     }
    288   }
    289   for (; i < ix_end; ++i) {
    290     FN(Store)(handle, data, mask, i);
    291   }
    292 }
    293 
    294 static BROTLI_INLINE void FN(StitchToPreviousBlock)(HasherHandle handle,
    295     size_t num_bytes, size_t position, const uint8_t* ringbuffer,
    296     size_t ringbuffer_mask) {
    297   HashToBinaryTree* self = FN(Self)(handle);
    298   if (num_bytes >= FN(HashTypeLength)() - 1 &&
    299       position >= MAX_TREE_COMP_LENGTH) {
    300     /* Store the last `MAX_TREE_COMP_LENGTH - 1` positions in the hasher.
    301        These could not be calculated before, since they require knowledge
    302        of both the previous and the current block. */
    303     const size_t i_start = position - MAX_TREE_COMP_LENGTH + 1;
    304     const size_t i_end = BROTLI_MIN(size_t, position, i_start + num_bytes);
    305     size_t i;
    306     for (i = i_start; i < i_end; ++i) {
    307       /* Maximum distance is window size - 16, see section 9.1. of the spec.
    308          Furthermore, we have to make sure that we don't look further back
    309          from the start of the next block than the window size, otherwise we
    310          could access already overwritten areas of the ring-buffer. */
    311       const size_t max_backward =
    312           self->window_mask_ - BROTLI_MAX(size_t,
    313                                           BROTLI_WINDOW_GAP - 1,
    314                                           position - i);
    315       /* We know that i + MAX_TREE_COMP_LENGTH <= position + num_bytes, i.e. the
    316          end of the current block and that we have at least
    317          MAX_TREE_COMP_LENGTH tail in the ring-buffer. */
    318       FN(StoreAndFindMatches)(self, ringbuffer, i, ringbuffer_mask,
    319           MAX_TREE_COMP_LENGTH, max_backward, NULL, NULL);
    320     }
    321   }
    322 }
    323 
    324 #undef BUCKET_SIZE
    325 
    326 #undef HashToBinaryTree
    327