1 /* Copyright 2010 Google Inc. All Rights Reserved. 2 3 Distributed under MIT license. 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 5 */ 6 7 /* A (forgetful) hash table to the data seen by the compressor, to 8 help create backward references to previous data. */ 9 10 #ifndef BROTLI_ENC_HASH_H_ 11 #define BROTLI_ENC_HASH_H_ 12 13 #include <string.h> /* memcmp, memset */ 14 15 #include "../common/constants.h" 16 #include "../common/dictionary.h" 17 #include <brotli/types.h> 18 #include "./fast_log.h" 19 #include "./find_match_length.h" 20 #include "./memory.h" 21 #include "./port.h" 22 #include "./quality.h" 23 #include "./static_dict.h" 24 25 #if defined(__cplusplus) || defined(c_plusplus) 26 extern "C" { 27 #endif 28 29 /* Pointer to hasher data. 30 * 31 * Excluding initialization and destruction, hasher can be passed as 32 * HasherHandle by value. 33 * 34 * Typically hasher data consists of 3 sections: 35 * * HasherCommon structure 36 * * private structured hasher data, depending on hasher type 37 * * private dynamic hasher data, depending on hasher type and parameters 38 */ 39 typedef uint8_t* HasherHandle; 40 41 typedef struct { 42 BrotliHasherParams params; 43 44 /* False if hasher needs to be "prepared" before use. */ 45 BROTLI_BOOL is_prepared_; 46 47 size_t dict_num_lookups; 48 size_t dict_num_matches; 49 } HasherCommon; 50 51 static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) { 52 return (HasherCommon*)handle; 53 } 54 55 #define score_t size_t 56 57 static const uint32_t kCutoffTransformsCount = 10; 58 /* 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 */ 59 /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */ 60 static const uint64_t kCutoffTransforms = 61 BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200); 62 63 typedef struct HasherSearchResult { 64 size_t len; 65 size_t distance; 66 score_t score; 67 int len_code_delta; /* == len_code - len */ 68 } HasherSearchResult; 69 70 /* kHashMul32 multiplier has these properties: 71 * The multiplier must be odd. Otherwise we may lose the highest bit. 72 * No long streaks of ones or zeros. 73 * There is no effort to ensure that it is a prime, the oddity is enough 74 for this use. 75 * The number has been tuned heuristically against compression benchmarks. */ 76 static const uint32_t kHashMul32 = 0x1e35a7bd; 77 static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1e35a7bd, 0x1e35a7bd); 78 static const uint64_t kHashMul64Long = 79 BROTLI_MAKE_UINT64_T(0x1fe35a7bU, 0xd3579bd3U); 80 81 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) { 82 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; 83 /* The higher bits contain more mixture from the multiplication, 84 so we take our results from there. */ 85 return h >> (32 - 14); 86 } 87 88 static BROTLI_INLINE void PrepareDistanceCache( 89 int* BROTLI_RESTRICT distance_cache, const int num_distances) { 90 if (num_distances > 4) { 91 int last_distance = distance_cache[0]; 92 distance_cache[4] = last_distance - 1; 93 distance_cache[5] = last_distance + 1; 94 distance_cache[6] = last_distance - 2; 95 distance_cache[7] = last_distance + 2; 96 distance_cache[8] = last_distance - 3; 97 distance_cache[9] = last_distance + 3; 98 if (num_distances > 10) { 99 int next_last_distance = distance_cache[1]; 100 distance_cache[10] = next_last_distance - 1; 101 distance_cache[11] = next_last_distance + 1; 102 distance_cache[12] = next_last_distance - 2; 103 distance_cache[13] = next_last_distance + 2; 104 distance_cache[14] = next_last_distance - 3; 105 distance_cache[15] = next_last_distance + 3; 106 } 107 } 108 } 109 110 #define BROTLI_LITERAL_BYTE_SCORE 135 111 #define BROTLI_DISTANCE_BIT_PENALTY 30 112 /* Score must be positive after applying maximal penalty. */ 113 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t)) 114 115 /* Usually, we always choose the longest backward reference. This function 116 allows for the exception of that rule. 117 118 If we choose a backward reference that is further away, it will 119 usually be coded with more bits. We approximate this by assuming 120 log2(distance). If the distance can be expressed in terms of the 121 last four distances, we use some heuristic constants to estimate 122 the bits cost. For the first up to four literals we use the bit 123 cost of the literals from the literal cost model, after that we 124 use the average bit cost of the cost model. 125 126 This function is used to sometimes discard a longer backward reference 127 when it is not much longer and the bit cost for encoding it is more 128 than the saved literals. 129 130 backward_reference_offset MUST be positive. */ 131 static BROTLI_INLINE score_t BackwardReferenceScore( 132 size_t copy_length, size_t backward_reference_offset) { 133 return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length - 134 BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset); 135 } 136 137 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance( 138 size_t copy_length) { 139 return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length + 140 BROTLI_SCORE_BASE + 15; 141 } 142 143 static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance( 144 size_t distance_short_code) { 145 return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE); 146 } 147 148 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem( 149 const BrotliDictionary* dictionary, size_t item, const uint8_t* data, 150 size_t max_length, size_t max_backward, HasherSearchResult* out) { 151 size_t len; 152 size_t dist; 153 size_t offset; 154 size_t matchlen; 155 size_t backward; 156 score_t score; 157 len = item & 0x1F; 158 dist = item >> 5; 159 offset = dictionary->offsets_by_length[len] + len * dist; 160 if (len > max_length) { 161 return BROTLI_FALSE; 162 } 163 164 matchlen = 165 FindMatchLengthWithLimit(data, &dictionary->data[offset], len); 166 if (matchlen + kCutoffTransformsCount <= len || matchlen == 0) { 167 return BROTLI_FALSE; 168 } 169 { 170 size_t cut = len - matchlen; 171 size_t transform_id = 172 (cut << 2) + (size_t)((kCutoffTransforms >> (cut * 6)) & 0x3F); 173 backward = max_backward + dist + 1 + 174 (transform_id << dictionary->size_bits_by_length[len]); 175 } 176 if (backward >= BROTLI_MAX_DISTANCE) { 177 return BROTLI_FALSE; 178 } 179 score = BackwardReferenceScore(matchlen, backward); 180 if (score < out->score) { 181 return BROTLI_FALSE; 182 } 183 out->len = matchlen; 184 out->len_code_delta = (int)len - (int)matchlen; 185 out->distance = backward; 186 out->score = score; 187 return BROTLI_TRUE; 188 } 189 190 static BROTLI_INLINE void SearchInStaticDictionary( 191 const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, 192 HasherHandle handle, const uint8_t* data, size_t max_length, 193 size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) { 194 size_t key; 195 size_t i; 196 HasherCommon* self = GetHasherCommon(handle); 197 if (self->dict_num_matches < (self->dict_num_lookups >> 7)) { 198 return; 199 } 200 key = Hash14(data) << 1; 201 for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) { 202 size_t item = dictionary_hash[key]; 203 self->dict_num_lookups++; 204 if (item != 0) { 205 BROTLI_BOOL item_matches = TestStaticDictionaryItem( 206 dictionary, item, data, max_length, max_backward, out); 207 if (item_matches) { 208 self->dict_num_matches++; 209 } 210 } 211 } 212 } 213 214 typedef struct BackwardMatch { 215 uint32_t distance; 216 uint32_t length_and_code; 217 } BackwardMatch; 218 219 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self, 220 size_t dist, size_t len) { 221 self->distance = (uint32_t)dist; 222 self->length_and_code = (uint32_t)(len << 5); 223 } 224 225 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self, 226 size_t dist, size_t len, size_t len_code) { 227 self->distance = (uint32_t)dist; 228 self->length_and_code = 229 (uint32_t)((len << 5) | (len == len_code ? 0 : len_code)); 230 } 231 232 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) { 233 return self->length_and_code >> 5; 234 } 235 236 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) { 237 size_t code = self->length_and_code & 31; 238 return code ? code : BackwardMatchLength(self); 239 } 240 241 #define EXPAND_CAT(a, b) CAT(a, b) 242 #define CAT(a, b) a ## b 243 #define FN(X) EXPAND_CAT(X, HASHER()) 244 245 #define HASHER() H10 246 #define BUCKET_BITS 17 247 #define MAX_TREE_SEARCH_DEPTH 64 248 #define MAX_TREE_COMP_LENGTH 128 249 #include "./hash_to_binary_tree_inc.h" /* NOLINT(build/include) */ 250 #undef MAX_TREE_SEARCH_DEPTH 251 #undef MAX_TREE_COMP_LENGTH 252 #undef BUCKET_BITS 253 #undef HASHER 254 /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */ 255 #define MAX_NUM_MATCHES_H10 128 256 257 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression 258 a little faster (0.5% - 1%) and it compresses 0.15% better on small text 259 and HTML inputs. */ 260 261 #define HASHER() H2 262 #define BUCKET_BITS 16 263 #define BUCKET_SWEEP 1 264 #define HASH_LEN 5 265 #define USE_DICTIONARY 1 266 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ 267 #undef BUCKET_SWEEP 268 #undef USE_DICTIONARY 269 #undef HASHER 270 271 #define HASHER() H3 272 #define BUCKET_SWEEP 2 273 #define USE_DICTIONARY 0 274 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ 275 #undef USE_DICTIONARY 276 #undef BUCKET_SWEEP 277 #undef BUCKET_BITS 278 #undef HASHER 279 280 #define HASHER() H4 281 #define BUCKET_BITS 17 282 #define BUCKET_SWEEP 4 283 #define USE_DICTIONARY 1 284 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ 285 #undef USE_DICTIONARY 286 #undef HASH_LEN 287 #undef BUCKET_SWEEP 288 #undef BUCKET_BITS 289 #undef HASHER 290 291 #define HASHER() H5 292 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */ 293 #undef HASHER 294 295 #define HASHER() H6 296 #include "./hash_longest_match64_inc.h" /* NOLINT(build/include) */ 297 #undef HASHER 298 299 #define BUCKET_BITS 15 300 301 #define NUM_LAST_DISTANCES_TO_CHECK 4 302 #define NUM_BANKS 1 303 #define BANK_BITS 16 304 #define HASHER() H40 305 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ 306 #undef HASHER 307 #undef NUM_LAST_DISTANCES_TO_CHECK 308 309 #define NUM_LAST_DISTANCES_TO_CHECK 10 310 #define HASHER() H41 311 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ 312 #undef HASHER 313 #undef NUM_LAST_DISTANCES_TO_CHECK 314 #undef NUM_BANKS 315 #undef BANK_BITS 316 317 #define NUM_LAST_DISTANCES_TO_CHECK 16 318 #define NUM_BANKS 512 319 #define BANK_BITS 9 320 #define HASHER() H42 321 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ 322 #undef HASHER 323 #undef NUM_LAST_DISTANCES_TO_CHECK 324 #undef NUM_BANKS 325 #undef BANK_BITS 326 327 #undef BUCKET_BITS 328 329 #define HASHER() H54 330 #define BUCKET_BITS 20 331 #define BUCKET_SWEEP 4 332 #define HASH_LEN 7 333 #define USE_DICTIONARY 0 334 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ 335 #undef USE_DICTIONARY 336 #undef HASH_LEN 337 #undef BUCKET_SWEEP 338 #undef BUCKET_BITS 339 #undef HASHER 340 341 #undef FN 342 #undef CAT 343 #undef EXPAND_CAT 344 345 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54) 346 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10) 347 348 static BROTLI_INLINE void DestroyHasher( 349 MemoryManager* m, HasherHandle* handle) { 350 if (*handle == NULL) return; 351 BROTLI_FREE(m, *handle); 352 } 353 354 static BROTLI_INLINE void HasherReset(HasherHandle handle) { 355 if (handle == NULL) return; 356 GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE; 357 } 358 359 static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params, 360 BROTLI_BOOL one_shot, const size_t input_size) { 361 size_t result = sizeof(HasherCommon); 362 switch (params->hasher.type) { 363 #define SIZE_(N) \ 364 case N: \ 365 result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \ 366 break; 367 FOR_ALL_HASHERS(SIZE_) 368 #undef SIZE_ 369 default: 370 break; 371 } 372 return result; 373 } 374 375 static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle, 376 BrotliEncoderParams* params, const uint8_t* data, size_t position, 377 size_t input_size, BROTLI_BOOL is_last) { 378 HasherHandle self = NULL; 379 HasherCommon* common = NULL; 380 BROTLI_BOOL one_shot = (position == 0 && is_last); 381 if (*handle == NULL) { 382 size_t alloc_size; 383 ChooseHasher(params, ¶ms->hasher); 384 alloc_size = HasherSize(params, one_shot, input_size); 385 self = BROTLI_ALLOC(m, uint8_t, alloc_size); 386 if (BROTLI_IS_OOM(m)) return; 387 *handle = self; 388 common = GetHasherCommon(self); 389 common->params = params->hasher; 390 switch (common->params.type) { 391 #define INITIALIZE_(N) \ 392 case N: \ 393 InitializeH ## N(*handle, params); \ 394 break; 395 FOR_ALL_HASHERS(INITIALIZE_); 396 #undef INITIALIZE_ 397 default: 398 break; 399 } 400 HasherReset(*handle); 401 } 402 403 self = *handle; 404 common = GetHasherCommon(self); 405 if (!common->is_prepared_) { 406 switch (common->params.type) { 407 #define PREPARE_(N) \ 408 case N: \ 409 PrepareH ## N(self, one_shot, input_size, data); \ 410 break; 411 FOR_ALL_HASHERS(PREPARE_) 412 #undef PREPARE_ 413 default: break; 414 } 415 if (position == 0) { 416 common->dict_num_lookups = 0; 417 common->dict_num_matches = 0; 418 } 419 common->is_prepared_ = BROTLI_TRUE; 420 } 421 } 422 423 static BROTLI_INLINE void InitOrStitchToPreviousBlock( 424 MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask, 425 BrotliEncoderParams* params, size_t position, size_t input_size, 426 BROTLI_BOOL is_last) { 427 HasherHandle self; 428 HasherSetup(m, handle, params, data, position, input_size, is_last); 429 if (BROTLI_IS_OOM(m)) return; 430 self = *handle; 431 switch (GetHasherCommon(self)->params.type) { 432 #define INIT_(N) \ 433 case N: \ 434 StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \ 435 break; 436 FOR_ALL_HASHERS(INIT_) 437 #undef INIT_ 438 default: break; 439 } 440 } 441 442 #if defined(__cplusplus) || defined(c_plusplus) 443 } /* extern "C" */ 444 #endif 445 446 #endif /* BROTLI_ENC_HASH_H_ */ 447