1 // Copyright 2013 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Implementation of Brotli compressor. 16 17 #include "./encode.h" 18 19 #include <algorithm> 20 #include <limits> 21 22 #include "./backward_references.h" 23 #include "./bit_cost.h" 24 #include "./block_splitter.h" 25 #include "./cluster.h" 26 #include "./context.h" 27 #include "./transform.h" 28 #include "./entropy_encode.h" 29 #include "./fast_log.h" 30 #include "./hash.h" 31 #include "./histogram.h" 32 #include "./literal_cost.h" 33 #include "./prefix.h" 34 #include "./write_bits.h" 35 36 namespace brotli { 37 38 static const int kWindowBits = 22; 39 // To make decoding faster, we allow the decoder to write 16 bytes ahead in 40 // its ringbuffer, therefore the encoder has to decrease max distance by this 41 // amount. 42 static const int kDecoderRingBufferWriteAheadSlack = 16; 43 static const int kMaxBackwardDistance = 44 (1 << kWindowBits) - kDecoderRingBufferWriteAheadSlack; 45 46 static const int kMetaBlockSizeBits = 21; 47 static const int kRingBufferBits = 23; 48 static const int kRingBufferMask = (1 << kRingBufferBits) - 1; 49 50 template<int kSize> 51 double Entropy(const std::vector<Histogram<kSize> >& histograms) { 52 double retval = 0; 53 for (int i = 0; i < histograms.size(); ++i) { 54 retval += histograms[i].EntropyBitCost(); 55 } 56 return retval; 57 } 58 59 template<int kSize> 60 double TotalBitCost(const std::vector<Histogram<kSize> >& histograms) { 61 double retval = 0; 62 for (int i = 0; i < histograms.size(); ++i) { 63 retval += PopulationCost(histograms[i]); 64 } 65 return retval; 66 } 67 68 void EncodeVarLenUint8(int n, int* storage_ix, uint8_t* storage) { 69 if (n == 0) { 70 WriteBits(1, 0, storage_ix, storage); 71 } else { 72 WriteBits(1, 1, storage_ix, storage); 73 int nbits = Log2Floor(n); 74 WriteBits(3, nbits, storage_ix, storage); 75 if (nbits > 0) { 76 WriteBits(nbits, n - (1 << nbits), storage_ix, storage); 77 } 78 } 79 } 80 81 int ParseAsUTF8(int* symbol, const uint8_t* input, int size) { 82 // ASCII 83 if ((input[0] & 0x80) == 0) { 84 *symbol = input[0]; 85 if (*symbol > 0) { 86 return 1; 87 } 88 } 89 // 2-byte UTF8 90 if (size > 1 && 91 (input[0] & 0xe0) == 0xc0 && 92 (input[1] & 0xc0) == 0x80) { 93 *symbol = (((input[0] & 0x1f) << 6) | 94 (input[1] & 0x3f)); 95 if (*symbol > 0x7f) { 96 return 2; 97 } 98 } 99 // 3-byte UFT8 100 if (size > 2 && 101 (input[0] & 0xf0) == 0xe0 && 102 (input[1] & 0xc0) == 0x80 && 103 (input[2] & 0xc0) == 0x80) { 104 *symbol = (((input[0] & 0x0f) << 12) | 105 ((input[1] & 0x3f) << 6) | 106 (input[2] & 0x3f)); 107 if (*symbol > 0x7ff) { 108 return 3; 109 } 110 } 111 // 4-byte UFT8 112 if (size > 3 && 113 (input[0] & 0xf8) == 0xf0 && 114 (input[1] & 0xc0) == 0x80 && 115 (input[2] & 0xc0) == 0x80 && 116 (input[3] & 0xc0) == 0x80) { 117 *symbol = (((input[0] & 0x07) << 18) | 118 ((input[1] & 0x3f) << 12) | 119 ((input[2] & 0x3f) << 6) | 120 (input[3] & 0x3f)); 121 if (*symbol > 0xffff && *symbol <= 0x10ffff) { 122 return 4; 123 } 124 } 125 // Not UTF8, emit a special symbol above the UTF8-code space 126 *symbol = 0x110000 | input[0]; 127 return 1; 128 } 129 130 // Returns true if at least min_fraction of the data is UTF8-encoded. 131 bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) { 132 size_t size_utf8 = 0; 133 size_t pos = 0; 134 while (pos < length) { 135 int symbol; 136 int bytes_read = ParseAsUTF8(&symbol, data + pos, length - pos); 137 pos += bytes_read; 138 if (symbol < 0x110000) size_utf8 += bytes_read; 139 } 140 return size_utf8 > min_fraction * length; 141 } 142 143 void EncodeMetaBlockLength(size_t meta_block_size, 144 bool is_last, 145 bool is_uncompressed, 146 int* storage_ix, uint8_t* storage) { 147 WriteBits(1, is_last, storage_ix, storage); 148 if (is_last) { 149 if (meta_block_size == 0) { 150 WriteBits(1, 1, storage_ix, storage); 151 return; 152 } 153 WriteBits(1, 0, storage_ix, storage); 154 } 155 --meta_block_size; 156 int num_bits = Log2Floor(meta_block_size) + 1; 157 if (num_bits < 16) { 158 num_bits = 16; 159 } 160 WriteBits(2, (num_bits - 13) >> 2, storage_ix, storage); 161 while (num_bits > 0) { 162 WriteBits(4, meta_block_size & 0xf, storage_ix, storage); 163 meta_block_size >>= 4; 164 num_bits -= 4; 165 } 166 if (!is_last) { 167 WriteBits(1, is_uncompressed, storage_ix, storage); 168 } 169 } 170 171 void StoreHuffmanTreeOfHuffmanTreeToBitMask( 172 const uint8_t* code_length_bitdepth, 173 int* storage_ix, uint8_t* storage) { 174 static const uint8_t kStorageOrder[kCodeLengthCodes] = { 175 1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15, 176 }; 177 // Throw away trailing zeros: 178 int codes_to_store = kCodeLengthCodes; 179 for (; codes_to_store > 0; --codes_to_store) { 180 if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) { 181 break; 182 } 183 } 184 int num_codes = 0; 185 for (int i = 0; i < codes_to_store; ++i) { 186 if (code_length_bitdepth[kStorageOrder[i]] != 0) { 187 ++num_codes; 188 } 189 } 190 if (num_codes == 1) { 191 codes_to_store = kCodeLengthCodes; 192 } 193 int skip_some = 0; // skips none. 194 if (code_length_bitdepth[kStorageOrder[0]] == 0 && 195 code_length_bitdepth[kStorageOrder[1]] == 0) { 196 skip_some = 2; // skips two. 197 if (code_length_bitdepth[kStorageOrder[2]] == 0) { 198 skip_some = 3; // skips three. 199 } 200 } 201 WriteBits(2, skip_some, storage_ix, storage); 202 for (int i = skip_some; i < codes_to_store; ++i) { 203 uint8_t len[] = { 2, 4, 3, 2, 2, 4 }; 204 uint8_t bits[] = { 0, 7, 3, 2, 1, 15 }; 205 int v = code_length_bitdepth[kStorageOrder[i]]; 206 WriteBits(len[v], bits[v], storage_ix, storage); 207 } 208 } 209 210 void StoreHuffmanTreeToBitMask( 211 const uint8_t* huffman_tree, 212 const uint8_t* huffman_tree_extra_bits, 213 const int huffman_tree_size, 214 const EntropyCode<kCodeLengthCodes>& entropy, 215 int* storage_ix, uint8_t* storage) { 216 for (int i = 0; i < huffman_tree_size; ++i) { 217 const int ix = huffman_tree[i]; 218 const int extra_bits = huffman_tree_extra_bits[i]; 219 if (entropy.count_ > 1) { 220 WriteBits(entropy.depth_[ix], entropy.bits_[ix], storage_ix, storage); 221 } 222 switch (ix) { 223 case 16: 224 WriteBits(2, extra_bits, storage_ix, storage); 225 break; 226 case 17: 227 WriteBits(3, extra_bits, storage_ix, storage); 228 break; 229 } 230 } 231 } 232 233 template<int kSize> 234 void StoreHuffmanCodeSimple( 235 const EntropyCode<kSize>& code, int alphabet_size, 236 int max_bits, int* storage_ix, uint8_t* storage) { 237 const uint8_t *depth = &code.depth_[0]; 238 int symbols[4]; 239 // Quadratic sort. 240 int k, j; 241 for (k = 0; k < code.count_; ++k) { 242 symbols[k] = code.symbols_[k]; 243 } 244 for (k = 0; k < code.count_; ++k) { 245 for (j = k + 1; j < code.count_; ++j) { 246 if (depth[symbols[j]] < depth[symbols[k]]) { 247 int t = symbols[k]; 248 symbols[k] = symbols[j]; 249 symbols[j] = t; 250 } 251 } 252 } 253 // Small tree marker to encode 1-4 symbols. 254 WriteBits(2, 1, storage_ix, storage); 255 WriteBits(2, code.count_ - 1, storage_ix, storage); 256 for (int i = 0; i < code.count_; ++i) { 257 WriteBits(max_bits, symbols[i], storage_ix, storage); 258 } 259 if (code.count_ == 4) { 260 if (depth[symbols[0]] == 2 && 261 depth[symbols[1]] == 2 && 262 depth[symbols[2]] == 2 && 263 depth[symbols[3]] == 2) { 264 WriteBits(1, 0, storage_ix, storage); 265 } else { 266 WriteBits(1, 1, storage_ix, storage); 267 } 268 } 269 } 270 271 template<int kSize> 272 void StoreHuffmanCodeComplex( 273 const EntropyCode<kSize>& code, int alphabet_size, 274 int* storage_ix, uint8_t* storage) { 275 const uint8_t *depth = &code.depth_[0]; 276 uint8_t huffman_tree[kSize]; 277 uint8_t huffman_tree_extra_bits[kSize]; 278 int huffman_tree_size = 0; 279 WriteHuffmanTree(depth, 280 alphabet_size, 281 &huffman_tree[0], 282 &huffman_tree_extra_bits[0], 283 &huffman_tree_size); 284 Histogram<kCodeLengthCodes> huffman_tree_histogram; 285 memset(huffman_tree_histogram.data_, 0, sizeof(huffman_tree_histogram.data_)); 286 for (int i = 0; i < huffman_tree_size; ++i) { 287 huffman_tree_histogram.Add(huffman_tree[i]); 288 } 289 EntropyCode<kCodeLengthCodes> huffman_tree_entropy; 290 BuildEntropyCode(huffman_tree_histogram, 5, kCodeLengthCodes, 291 &huffman_tree_entropy); 292 StoreHuffmanTreeOfHuffmanTreeToBitMask( 293 &huffman_tree_entropy.depth_[0], storage_ix, storage); 294 StoreHuffmanTreeToBitMask(&huffman_tree[0], &huffman_tree_extra_bits[0], 295 huffman_tree_size, huffman_tree_entropy, 296 storage_ix, storage); 297 } 298 299 template<int kSize> 300 void BuildAndStoreEntropyCode(const Histogram<kSize>& histogram, 301 const int tree_limit, 302 const int alphabet_size, 303 EntropyCode<kSize>* code, 304 int* storage_ix, uint8_t* storage) { 305 memset(code->depth_, 0, sizeof(code->depth_)); 306 memset(code->bits_, 0, sizeof(code->bits_)); 307 memset(code->symbols_, 0, sizeof(code->symbols_)); 308 code->count_ = 0; 309 310 int max_bits_counter = alphabet_size - 1; 311 int max_bits = 0; 312 while (max_bits_counter) { 313 max_bits_counter >>= 1; 314 ++max_bits; 315 } 316 317 for (size_t i = 0; i < alphabet_size; i++) { 318 if (histogram.data_[i] > 0) { 319 if (code->count_ < 4) code->symbols_[code->count_] = i; 320 ++code->count_; 321 } 322 } 323 324 if (code->count_ <= 1) { 325 WriteBits(2, 1, storage_ix, storage); 326 WriteBits(2, 0, storage_ix, storage); 327 WriteBits(max_bits, code->symbols_[0], storage_ix, storage); 328 return; 329 } 330 331 if (alphabet_size >= 50 && code->count_ >= 16) { 332 std::vector<int> counts(alphabet_size); 333 memcpy(&counts[0], histogram.data_, sizeof(counts[0]) * alphabet_size); 334 OptimizeHuffmanCountsForRle(alphabet_size, &counts[0]); 335 CreateHuffmanTree(&counts[0], alphabet_size, tree_limit, code->depth_); 336 } else { 337 CreateHuffmanTree(histogram.data_, alphabet_size, tree_limit, code->depth_); 338 } 339 ConvertBitDepthsToSymbols(code->depth_, alphabet_size, code->bits_); 340 341 if (code->count_ <= 4) { 342 StoreHuffmanCodeSimple(*code, alphabet_size, max_bits, storage_ix, storage); 343 } else { 344 StoreHuffmanCodeComplex(*code, alphabet_size, storage_ix, storage); 345 } 346 } 347 348 template<int kSize> 349 void BuildAndStoreEntropyCodes( 350 const std::vector<Histogram<kSize> >& histograms, 351 int alphabet_size, 352 std::vector<EntropyCode<kSize> >* entropy_codes, 353 int* storage_ix, uint8_t* storage) { 354 entropy_codes->resize(histograms.size()); 355 for (int i = 0; i < histograms.size(); ++i) { 356 BuildAndStoreEntropyCode(histograms[i], 15, alphabet_size, 357 &(*entropy_codes)[i], 358 storage_ix, storage); 359 } 360 } 361 362 void EncodeCommand(const Command& cmd, 363 const EntropyCodeCommand& entropy, 364 int* storage_ix, uint8_t* storage) { 365 int code = cmd.command_prefix_; 366 WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage); 367 if (code >= 128) { 368 code -= 128; 369 } 370 int insert_extra_bits = InsertLengthExtraBits(code); 371 uint64_t insert_extra_bits_val = 372 cmd.insert_length_ - InsertLengthOffset(code); 373 int copy_extra_bits = CopyLengthExtraBits(code); 374 uint64_t copy_extra_bits_val = cmd.copy_length_code_ - CopyLengthOffset(code); 375 if (insert_extra_bits > 0) { 376 WriteBits(insert_extra_bits, insert_extra_bits_val, storage_ix, storage); 377 } 378 if (copy_extra_bits > 0) { 379 WriteBits(copy_extra_bits, copy_extra_bits_val, storage_ix, storage); 380 } 381 } 382 383 void EncodeCopyDistance(const Command& cmd, const EntropyCodeDistance& entropy, 384 int* storage_ix, uint8_t* storage) { 385 int code = cmd.distance_prefix_; 386 int extra_bits = cmd.distance_extra_bits_; 387 uint64_t extra_bits_val = cmd.distance_extra_bits_value_; 388 WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage); 389 if (extra_bits > 0) { 390 WriteBits(extra_bits, extra_bits_val, storage_ix, storage); 391 } 392 } 393 394 void ComputeDistanceShortCodes(std::vector<Command>* cmds, 395 size_t pos, 396 const size_t max_backward, 397 int* dist_ringbuffer, 398 size_t* ringbuffer_idx) { 399 static const int kIndexOffset[16] = { 400 3, 2, 1, 0, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2 401 }; 402 static const int kValueOffset[16] = { 403 0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3 404 }; 405 for (int i = 0; i < cmds->size(); ++i) { 406 pos += (*cmds)[i].insert_length_; 407 size_t max_distance = std::min(pos, max_backward); 408 int cur_dist = (*cmds)[i].copy_distance_; 409 int dist_code = cur_dist + 16; 410 if (cur_dist <= max_distance) { 411 if (cur_dist == 0) break; 412 int limits[16] = { 0, 0, 0, 0, 413 6, 6, 11, 11, 414 11, 11, 11, 11, 415 12, 12, 12, 12 }; 416 for (int k = 0; k < 16; ++k) { 417 // Only accept more popular choices. 418 if (cur_dist < limits[k]) { 419 // Typically unpopular ranges, don't replace a short distance 420 // with them. 421 continue; 422 } 423 int comp = (dist_ringbuffer[(*ringbuffer_idx + kIndexOffset[k]) & 3] + 424 kValueOffset[k]); 425 if (cur_dist == comp) { 426 dist_code = k + 1; 427 break; 428 } 429 } 430 if (dist_code > 1) { 431 dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist; 432 ++(*ringbuffer_idx); 433 } 434 pos += (*cmds)[i].copy_length_; 435 } else { 436 int word_idx = cur_dist - max_distance - 1; 437 const std::string word = 438 GetTransformedDictionaryWord((*cmds)[i].copy_length_code_, word_idx); 439 pos += word.size(); 440 } 441 (*cmds)[i].distance_code_ = dist_code; 442 } 443 } 444 445 void ComputeCommandPrefixes(std::vector<Command>* cmds, 446 int num_direct_distance_codes, 447 int distance_postfix_bits) { 448 for (int i = 0; i < cmds->size(); ++i) { 449 Command* cmd = &(*cmds)[i]; 450 cmd->command_prefix_ = CommandPrefix(cmd->insert_length_, 451 cmd->copy_length_code_); 452 if (cmd->copy_length_code_ > 0) { 453 PrefixEncodeCopyDistance(cmd->distance_code_, 454 num_direct_distance_codes, 455 distance_postfix_bits, 456 &cmd->distance_prefix_, 457 &cmd->distance_extra_bits_, 458 &cmd->distance_extra_bits_value_); 459 } 460 if (cmd->command_prefix_ < 128 && cmd->distance_prefix_ == 0) { 461 cmd->distance_prefix_ = 0xffff; 462 } else { 463 cmd->command_prefix_ += 128; 464 } 465 } 466 } 467 468 int IndexOf(const std::vector<int>& v, int value) { 469 for (int i = 0; i < v.size(); ++i) { 470 if (v[i] == value) return i; 471 } 472 return -1; 473 } 474 475 void MoveToFront(std::vector<int>* v, int index) { 476 int value = (*v)[index]; 477 for (int i = index; i > 0; --i) { 478 (*v)[i] = (*v)[i - 1]; 479 } 480 (*v)[0] = value; 481 } 482 483 std::vector<int> MoveToFrontTransform(const std::vector<int>& v) { 484 if (v.empty()) return v; 485 std::vector<int> mtf(*max_element(v.begin(), v.end()) + 1); 486 for (int i = 0; i < mtf.size(); ++i) mtf[i] = i; 487 std::vector<int> result(v.size()); 488 for (int i = 0; i < v.size(); ++i) { 489 int index = IndexOf(mtf, v[i]); 490 result[i] = index; 491 MoveToFront(&mtf, index); 492 } 493 return result; 494 } 495 496 // Finds runs of zeros in v_in and replaces them with a prefix code of the run 497 // length plus extra bits in *v_out and *extra_bits. Non-zero values in v_in are 498 // shifted by *max_length_prefix. Will not create prefix codes bigger than the 499 // initial value of *max_run_length_prefix. The prefix code of run length L is 500 // simply Log2Floor(L) and the number of extra bits is the same as the prefix 501 // code. 502 void RunLengthCodeZeros(const std::vector<int>& v_in, 503 int* max_run_length_prefix, 504 std::vector<int>* v_out, 505 std::vector<int>* extra_bits) { 506 int max_reps = 0; 507 for (int i = 0; i < v_in.size();) { 508 for (; i < v_in.size() && v_in[i] != 0; ++i) ; 509 int reps = 0; 510 for (; i < v_in.size() && v_in[i] == 0; ++i) { 511 ++reps; 512 } 513 max_reps = std::max(reps, max_reps); 514 } 515 int max_prefix = max_reps > 0 ? Log2Floor(max_reps) : 0; 516 *max_run_length_prefix = std::min(max_prefix, *max_run_length_prefix); 517 for (int i = 0; i < v_in.size();) { 518 if (v_in[i] != 0) { 519 v_out->push_back(v_in[i] + *max_run_length_prefix); 520 extra_bits->push_back(0); 521 ++i; 522 } else { 523 int reps = 1; 524 for (uint32_t k = i + 1; k < v_in.size() && v_in[k] == 0; ++k) { 525 ++reps; 526 } 527 i += reps; 528 while (reps) { 529 if (reps < (2 << *max_run_length_prefix)) { 530 int run_length_prefix = Log2Floor(reps); 531 v_out->push_back(run_length_prefix); 532 extra_bits->push_back(reps - (1 << run_length_prefix)); 533 break; 534 } else { 535 v_out->push_back(*max_run_length_prefix); 536 extra_bits->push_back((1 << *max_run_length_prefix) - 1); 537 reps -= (2 << *max_run_length_prefix) - 1; 538 } 539 } 540 } 541 } 542 } 543 544 // Returns a maximum zero-run-length-prefix value such that run-length coding 545 // zeros in v with this maximum prefix value and then encoding the resulting 546 // histogram and entropy-coding v produces the least amount of bits. 547 int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) { 548 int min_cost = std::numeric_limits<int>::max(); 549 int best_max_prefix = 0; 550 for (int max_prefix = 0; max_prefix <= 16; ++max_prefix) { 551 std::vector<int> rle_symbols; 552 std::vector<int> extra_bits; 553 int max_run_length_prefix = max_prefix; 554 RunLengthCodeZeros(v, &max_run_length_prefix, &rle_symbols, &extra_bits); 555 if (max_run_length_prefix < max_prefix) break; 556 HistogramContextMap histogram; 557 for (int i = 0; i < rle_symbols.size(); ++i) { 558 histogram.Add(rle_symbols[i]); 559 } 560 int bit_cost = PopulationCost(histogram); 561 if (max_prefix > 0) { 562 bit_cost += 4; 563 } 564 for (int i = 1; i <= max_prefix; ++i) { 565 bit_cost += histogram.data_[i] * i; // extra bits 566 } 567 if (bit_cost < min_cost) { 568 min_cost = bit_cost; 569 best_max_prefix = max_prefix; 570 } 571 } 572 return best_max_prefix; 573 } 574 575 void EncodeContextMap(const std::vector<int>& context_map, 576 int num_clusters, 577 int* storage_ix, uint8_t* storage) { 578 EncodeVarLenUint8(num_clusters - 1, storage_ix, storage); 579 580 if (num_clusters == 1) { 581 return; 582 } 583 584 std::vector<int> transformed_symbols = MoveToFrontTransform(context_map); 585 std::vector<int> rle_symbols; 586 std::vector<int> extra_bits; 587 int max_run_length_prefix = BestMaxZeroRunLengthPrefix(transformed_symbols); 588 RunLengthCodeZeros(transformed_symbols, &max_run_length_prefix, 589 &rle_symbols, &extra_bits); 590 HistogramContextMap symbol_histogram; 591 for (int i = 0; i < rle_symbols.size(); ++i) { 592 symbol_histogram.Add(rle_symbols[i]); 593 } 594 bool use_rle = max_run_length_prefix > 0; 595 WriteBits(1, use_rle, storage_ix, storage); 596 if (use_rle) { 597 WriteBits(4, max_run_length_prefix - 1, storage_ix, storage); 598 } 599 EntropyCodeContextMap symbol_code; 600 BuildAndStoreEntropyCode(symbol_histogram, 15, 601 num_clusters + max_run_length_prefix, 602 &symbol_code, 603 storage_ix, storage); 604 for (int i = 0; i < rle_symbols.size(); ++i) { 605 WriteBits(symbol_code.depth_[rle_symbols[i]], 606 symbol_code.bits_[rle_symbols[i]], 607 storage_ix, storage); 608 if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) { 609 WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage); 610 } 611 } 612 WriteBits(1, 1, storage_ix, storage); // use move-to-front 613 } 614 615 struct BlockSplitCode { 616 EntropyCodeBlockType block_type_code; 617 EntropyCodeBlockLength block_len_code; 618 }; 619 620 void EncodeBlockLength(const EntropyCodeBlockLength& entropy, 621 int length, 622 int* storage_ix, uint8_t* storage) { 623 int len_code = BlockLengthPrefix(length); 624 int extra_bits = BlockLengthExtraBits(len_code); 625 int extra_bits_value = length - BlockLengthOffset(len_code); 626 WriteBits(entropy.depth_[len_code], entropy.bits_[len_code], 627 storage_ix, storage); 628 if (extra_bits > 0) { 629 WriteBits(extra_bits, extra_bits_value, storage_ix, storage); 630 } 631 } 632 633 void ComputeBlockTypeShortCodes(BlockSplit* split) { 634 if (split->num_types_ <= 1) { 635 split->num_types_ = 1; 636 return; 637 } 638 int ringbuffer[2] = { 0, 1 }; 639 size_t index = 0; 640 for (int i = 0; i < split->types_.size(); ++i) { 641 int type = split->types_[i]; 642 int type_code; 643 if (type == ringbuffer[index & 1]) { 644 type_code = 0; 645 } else if (type == ringbuffer[(index - 1) & 1] + 1) { 646 type_code = 1; 647 } else { 648 type_code = type + 2; 649 } 650 ringbuffer[index & 1] = type; 651 ++index; 652 split->type_codes_.push_back(type_code); 653 } 654 } 655 656 void BuildAndEncodeBlockSplitCode(const BlockSplit& split, 657 BlockSplitCode* code, 658 int* storage_ix, uint8_t* storage) { 659 EncodeVarLenUint8(split.num_types_ - 1, storage_ix, storage); 660 661 if (split.num_types_ == 1) { 662 return; 663 } 664 665 HistogramBlockType type_histo; 666 for (int i = 1; i < split.type_codes_.size(); ++i) { 667 type_histo.Add(split.type_codes_[i]); 668 } 669 HistogramBlockLength length_histo; 670 for (int i = 0; i < split.lengths_.size(); ++i) { 671 length_histo.Add(BlockLengthPrefix(split.lengths_[i])); 672 } 673 BuildAndStoreEntropyCode(type_histo, 15, split.num_types_ + 2, 674 &code->block_type_code, 675 storage_ix, storage); 676 BuildAndStoreEntropyCode(length_histo, 15, kNumBlockLenPrefixes, 677 &code->block_len_code, 678 storage_ix, storage); 679 EncodeBlockLength(code->block_len_code, split.lengths_[0], 680 storage_ix, storage); 681 } 682 683 void MoveAndEncode(const BlockSplitCode& code, 684 BlockSplitIterator* it, 685 int* storage_ix, uint8_t* storage) { 686 if (it->length_ == 0) { 687 ++it->idx_; 688 it->type_ = it->split_.types_[it->idx_]; 689 it->length_ = it->split_.lengths_[it->idx_]; 690 int type_code = it->split_.type_codes_[it->idx_]; 691 WriteBits(code.block_type_code.depth_[type_code], 692 code.block_type_code.bits_[type_code], 693 storage_ix, storage); 694 EncodeBlockLength(code.block_len_code, it->length_, storage_ix, storage); 695 } 696 --it->length_; 697 } 698 699 struct EncodingParams { 700 int num_direct_distance_codes; 701 int distance_postfix_bits; 702 int literal_context_mode; 703 }; 704 705 struct MetaBlock { 706 std::vector<Command> cmds; 707 EncodingParams params; 708 BlockSplit literal_split; 709 BlockSplit command_split; 710 BlockSplit distance_split; 711 std::vector<int> literal_context_modes; 712 std::vector<int> literal_context_map; 713 std::vector<int> distance_context_map; 714 std::vector<HistogramLiteral> literal_histograms; 715 std::vector<HistogramCommand> command_histograms; 716 std::vector<HistogramDistance> distance_histograms; 717 }; 718 719 void BuildMetaBlock(const EncodingParams& params, 720 const std::vector<Command>& cmds, 721 const uint8_t* ringbuffer, 722 const size_t pos, 723 const size_t mask, 724 MetaBlock* mb) { 725 mb->cmds = cmds; 726 mb->params = params; 727 if (cmds.empty()) { 728 return; 729 } 730 ComputeCommandPrefixes(&mb->cmds, 731 mb->params.num_direct_distance_codes, 732 mb->params.distance_postfix_bits); 733 SplitBlock(mb->cmds, 734 &ringbuffer[pos & mask], 735 &mb->literal_split, 736 &mb->command_split, 737 &mb->distance_split); 738 ComputeBlockTypeShortCodes(&mb->literal_split); 739 ComputeBlockTypeShortCodes(&mb->command_split); 740 ComputeBlockTypeShortCodes(&mb->distance_split); 741 742 mb->literal_context_modes.resize(mb->literal_split.num_types_, 743 mb->params.literal_context_mode); 744 745 746 int num_literal_contexts = 747 mb->literal_split.num_types_ << kLiteralContextBits; 748 int num_distance_contexts = 749 mb->distance_split.num_types_ << kDistanceContextBits; 750 std::vector<HistogramLiteral> literal_histograms(num_literal_contexts); 751 mb->command_histograms.resize(mb->command_split.num_types_); 752 std::vector<HistogramDistance> distance_histograms(num_distance_contexts); 753 BuildHistograms(mb->cmds, 754 mb->literal_split, 755 mb->command_split, 756 mb->distance_split, 757 ringbuffer, 758 pos, 759 mask, 760 mb->literal_context_modes, 761 &literal_histograms, 762 &mb->command_histograms, 763 &distance_histograms); 764 765 // Histogram ids need to fit in one byte. 766 static const int kMaxNumberOfHistograms = 256; 767 768 mb->literal_histograms = literal_histograms; 769 ClusterHistograms(literal_histograms, 770 1 << kLiteralContextBits, 771 mb->literal_split.num_types_, 772 kMaxNumberOfHistograms, 773 &mb->literal_histograms, 774 &mb->literal_context_map); 775 776 mb->distance_histograms = distance_histograms; 777 ClusterHistograms(distance_histograms, 778 1 << kDistanceContextBits, 779 mb->distance_split.num_types_, 780 kMaxNumberOfHistograms, 781 &mb->distance_histograms, 782 &mb->distance_context_map); 783 } 784 785 size_t MetaBlockLength(const std::vector<Command>& cmds) { 786 size_t length = 0; 787 for (int i = 0; i < cmds.size(); ++i) { 788 const Command& cmd = cmds[i]; 789 length += cmd.insert_length_ + cmd.copy_length_; 790 } 791 return length; 792 } 793 794 void StoreMetaBlock(const MetaBlock& mb, 795 const bool is_last, 796 const uint8_t* ringbuffer, 797 const size_t mask, 798 size_t* pos, 799 int* storage_ix, uint8_t* storage) { 800 size_t length = MetaBlockLength(mb.cmds); 801 const size_t end_pos = *pos + length; 802 EncodeMetaBlockLength(length, is_last, false, storage_ix, storage); 803 804 if (length == 0) { 805 return; 806 } 807 BlockSplitCode literal_split_code; 808 BlockSplitCode command_split_code; 809 BlockSplitCode distance_split_code; 810 BuildAndEncodeBlockSplitCode(mb.literal_split, &literal_split_code, 811 storage_ix, storage); 812 BuildAndEncodeBlockSplitCode(mb.command_split, &command_split_code, 813 storage_ix, storage); 814 BuildAndEncodeBlockSplitCode(mb.distance_split, &distance_split_code, 815 storage_ix, storage); 816 WriteBits(2, mb.params.distance_postfix_bits, storage_ix, storage); 817 WriteBits(4, 818 mb.params.num_direct_distance_codes >> 819 mb.params.distance_postfix_bits, 820 storage_ix, storage); 821 int num_distance_codes = 822 kNumDistanceShortCodes + mb.params.num_direct_distance_codes + 823 (48 << mb.params.distance_postfix_bits); 824 for (int i = 0; i < mb.literal_split.num_types_; ++i) { 825 WriteBits(2, mb.literal_context_modes[i], storage_ix, storage); 826 } 827 EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(), 828 storage_ix, storage); 829 EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(), 830 storage_ix, storage); 831 std::vector<EntropyCodeLiteral> literal_codes; 832 std::vector<EntropyCodeCommand> command_codes; 833 std::vector<EntropyCodeDistance> distance_codes; 834 BuildAndStoreEntropyCodes(mb.literal_histograms, 256, &literal_codes, 835 storage_ix, storage); 836 BuildAndStoreEntropyCodes(mb.command_histograms, kNumCommandPrefixes, 837 &command_codes, storage_ix, storage); 838 BuildAndStoreEntropyCodes(mb.distance_histograms, num_distance_codes, 839 &distance_codes, storage_ix, storage); 840 BlockSplitIterator literal_it(mb.literal_split); 841 BlockSplitIterator command_it(mb.command_split); 842 BlockSplitIterator distance_it(mb.distance_split); 843 for (int i = 0; i < mb.cmds.size(); ++i) { 844 const Command& cmd = mb.cmds[i]; 845 MoveAndEncode(command_split_code, &command_it, storage_ix, storage); 846 EncodeCommand(cmd, command_codes[command_it.type_], storage_ix, storage); 847 for (int j = 0; j < cmd.insert_length_; ++j) { 848 MoveAndEncode(literal_split_code, &literal_it, storage_ix, storage); 849 int histogram_idx = literal_it.type_; 850 uint8_t prev_byte = *pos > 0 ? ringbuffer[(*pos - 1) & mask] : 0; 851 uint8_t prev_byte2 = *pos > 1 ? ringbuffer[(*pos - 2) & mask] : 0; 852 int context = ((literal_it.type_ << kLiteralContextBits) + 853 Context(prev_byte, prev_byte2, 854 mb.literal_context_modes[literal_it.type_])); 855 histogram_idx = mb.literal_context_map[context]; 856 int literal = ringbuffer[*pos & mask]; 857 WriteBits(literal_codes[histogram_idx].depth_[literal], 858 literal_codes[histogram_idx].bits_[literal], 859 storage_ix, storage); 860 ++(*pos); 861 } 862 if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) { 863 MoveAndEncode(distance_split_code, &distance_it, storage_ix, storage); 864 int context = (distance_it.type_ << 2) + 865 ((cmd.copy_length_code_ > 4) ? 3 : cmd.copy_length_code_ - 2); 866 int histogram_index = mb.distance_context_map[context]; 867 size_t max_distance = std::min(*pos, (size_t)kMaxBackwardDistance); 868 EncodeCopyDistance(cmd, distance_codes[histogram_index], 869 storage_ix, storage); 870 } 871 *pos += cmd.copy_length_; 872 } 873 } 874 875 BrotliCompressor::BrotliCompressor(BrotliParams params) 876 : params_(params), 877 window_bits_(kWindowBits), 878 hashers_(new Hashers()), 879 dist_ringbuffer_idx_(0), 880 input_pos_(0), 881 ringbuffer_(kRingBufferBits, kMetaBlockSizeBits), 882 literal_cost_(1 << kRingBufferBits), 883 storage_ix_(0), 884 storage_(new uint8_t[2 << kMetaBlockSizeBits]) { 885 dist_ringbuffer_[0] = 16; 886 dist_ringbuffer_[1] = 15; 887 dist_ringbuffer_[2] = 11; 888 dist_ringbuffer_[3] = 4; 889 storage_[0] = 0; 890 switch (params.mode) { 891 case BrotliParams::MODE_TEXT: hash_type_ = Hashers::HASH_15_8_4; break; 892 case BrotliParams::MODE_FONT: hash_type_ = Hashers::HASH_15_8_2; break; 893 default: break; 894 } 895 hashers_->Init(hash_type_); 896 if (params.mode == BrotliParams::MODE_TEXT) { 897 StoreDictionaryWordHashes(); 898 } 899 } 900 901 BrotliCompressor::~BrotliCompressor() { 902 delete[] storage_; 903 } 904 905 StaticDictionary *BrotliCompressor::static_dictionary_ = NULL; 906 907 void BrotliCompressor::StoreDictionaryWordHashes() { 908 const int num_transforms = kNumTransforms; 909 if (static_dictionary_ == NULL) { 910 static_dictionary_ = new StaticDictionary; 911 for (int t = num_transforms - 1; t >= 0; --t) { 912 for (int i = kMaxDictionaryWordLength; 913 i >= kMinDictionaryWordLength; --i) { 914 const int num_words = 1 << kBrotliDictionarySizeBitsByLength[i]; 915 for (int j = num_words - 1; j >= 0; --j) { 916 int word_id = t * num_words + j; 917 std::string word = GetTransformedDictionaryWord(i, word_id); 918 if (word.size() >= 4) { 919 static_dictionary_->Insert(word, i, word_id); 920 } 921 } 922 } 923 } 924 } 925 hashers_->SetStaticDictionary(static_dictionary_); 926 } 927 928 void BrotliCompressor::WriteStreamHeader() { 929 // Encode window size. 930 if (window_bits_ == 16) { 931 WriteBits(1, 0, &storage_ix_, storage_); 932 } else { 933 WriteBits(1, 1, &storage_ix_, storage_); 934 WriteBits(3, window_bits_ - 17, &storage_ix_, storage_); 935 } 936 } 937 938 void BrotliCompressor::WriteMetaBlock(const size_t input_size, 939 const uint8_t* input_buffer, 940 const bool is_last, 941 size_t* encoded_size, 942 uint8_t* encoded_buffer) { 943 static const double kMinUTF8Ratio = 0.75; 944 bool utf8_mode = false; 945 std::vector<Command> commands; 946 if (input_size > 0) { 947 ringbuffer_.Write(input_buffer, input_size); 948 utf8_mode = IsMostlyUTF8( 949 &ringbuffer_.start()[input_pos_ & kRingBufferMask], 950 input_size, kMinUTF8Ratio); 951 if (utf8_mode) { 952 EstimateBitCostsForLiteralsUTF8(input_pos_, input_size, 953 kRingBufferMask, kRingBufferMask, 954 ringbuffer_.start(), &literal_cost_[0]); 955 } else { 956 EstimateBitCostsForLiterals(input_pos_, input_size, 957 kRingBufferMask, kRingBufferMask, 958 ringbuffer_.start(), &literal_cost_[0]); 959 } 960 CreateBackwardReferences( 961 input_size, input_pos_, 962 ringbuffer_.start(), 963 &literal_cost_[0], 964 kRingBufferMask, kMaxBackwardDistance, 965 hashers_.get(), 966 hash_type_, 967 &commands); 968 ComputeDistanceShortCodes(&commands, input_pos_, kMaxBackwardDistance, 969 dist_ringbuffer_, 970 &dist_ringbuffer_idx_); 971 } 972 EncodingParams params; 973 params.num_direct_distance_codes = 974 params_.mode == BrotliParams::MODE_FONT ? 12 : 0; 975 params.distance_postfix_bits = 976 params_.mode == BrotliParams::MODE_FONT ? 1 : 0; 977 params.literal_context_mode = CONTEXT_SIGNED; 978 const int storage_ix0 = storage_ix_; 979 MetaBlock mb; 980 BuildMetaBlock(params, commands, ringbuffer_.start(), input_pos_, 981 kRingBufferMask, &mb); 982 StoreMetaBlock(mb, is_last, ringbuffer_.start(), kRingBufferMask, 983 &input_pos_, &storage_ix_, storage_); 984 size_t output_size = is_last ? ((storage_ix_ + 7) >> 3) : (storage_ix_ >> 3); 985 output_size -= (storage_ix0 >> 3); 986 if (input_size + 4 < output_size) { 987 storage_ix_ = storage_ix0; 988 storage_[storage_ix_ >> 3] &= (1 << (storage_ix_ & 7)) - 1; 989 EncodeMetaBlockLength(input_size, false, true, &storage_ix_, storage_); 990 size_t hdr_size = (storage_ix_ + 7) >> 3; 991 memcpy(encoded_buffer, storage_, hdr_size); 992 memcpy(encoded_buffer + hdr_size, input_buffer, input_size); 993 *encoded_size = hdr_size + input_size; 994 if (is_last) { 995 encoded_buffer[*encoded_size] = 0x3; // ISLAST, ISEMPTY 996 ++(*encoded_size); 997 } 998 storage_ix_ = 0; 999 storage_[0] = 0; 1000 } else { 1001 memcpy(encoded_buffer, storage_, output_size); 1002 *encoded_size = output_size; 1003 if (is_last) { 1004 storage_ix_ = 0; 1005 storage_[0] = 0; 1006 } else { 1007 storage_ix_ -= output_size << 3; 1008 storage_[storage_ix_ >> 3] = storage_[output_size]; 1009 } 1010 } 1011 } 1012 1013 void BrotliCompressor::FinishStream( 1014 size_t* encoded_size, uint8_t* encoded_buffer) { 1015 WriteMetaBlock(0, NULL, true, encoded_size, encoded_buffer); 1016 } 1017 1018 1019 int BrotliCompressBuffer(BrotliParams params, 1020 size_t input_size, 1021 const uint8_t* input_buffer, 1022 size_t* encoded_size, 1023 uint8_t* encoded_buffer) { 1024 if (input_size == 0) { 1025 encoded_buffer[0] = 6; 1026 *encoded_size = 1; 1027 return 1; 1028 } 1029 1030 BrotliCompressor compressor(params); 1031 compressor.WriteStreamHeader(); 1032 1033 const int max_block_size = 1 << kMetaBlockSizeBits; 1034 size_t max_output_size = *encoded_size; 1035 const uint8_t* input_end = input_buffer + input_size; 1036 *encoded_size = 0; 1037 1038 while (input_buffer < input_end) { 1039 int block_size = max_block_size; 1040 bool is_last = false; 1041 if (block_size >= input_end - input_buffer) { 1042 block_size = input_end - input_buffer; 1043 is_last = true; 1044 } 1045 size_t output_size = max_output_size; 1046 compressor.WriteMetaBlock(block_size, input_buffer, is_last, 1047 &output_size, &encoded_buffer[*encoded_size]); 1048 input_buffer += block_size; 1049 *encoded_size += output_size; 1050 max_output_size -= output_size; 1051 } 1052 1053 return 1; 1054 } 1055 1056 } // namespace brotli 1057