1 // Copyright 2013 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Literal cost model to allow backward reference replacement to be efficient. 16 17 #include "./literal_cost.h" 18 19 #include <math.h> 20 #include <stdint.h> 21 #include <algorithm> 22 23 namespace brotli { 24 25 static int UTF8Position(int last, int c, int clamp) { 26 if (c < 128) { 27 return 0; // Next one is the 'Byte 1' again. 28 } else if (c >= 192) { 29 return std::min(1, clamp); // Next one is the 'Byte 2' of utf-8 encoding. 30 } else { 31 // Let's decide over the last byte if this ends the sequence. 32 if (last < 0xe0) { 33 return 0; // Completed two or three byte coding. 34 } else { 35 return std::min(2, clamp); // Next one is the 'Byte 3' of utf-8 encoding. 36 } 37 } 38 } 39 40 static int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask, 41 const uint8_t *data) { 42 int counts[3] = { 0 }; 43 int max_utf8 = 1; // should be 2, but 1 compresses better. 44 int last_c = 0; 45 int utf8_pos = 0; 46 for (int i = 0; i < len; ++i) { 47 int c = data[(pos + i) & mask]; 48 utf8_pos = UTF8Position(last_c, c, 2); 49 ++counts[utf8_pos]; 50 last_c = c; 51 } 52 if (counts[2] < 500) { 53 max_utf8 = 1; 54 } 55 if (counts[1] + counts[2] < 25) { 56 max_utf8 = 0; 57 } 58 return max_utf8; 59 } 60 61 void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, 62 size_t cost_mask, const uint8_t *data, 63 float *cost) { 64 65 // max_utf8 is 0 (normal ascii single byte modeling), 66 // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling). 67 const int max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data); 68 int histogram[3][256] = { { 0 } }; 69 int window_half = 495; 70 int in_window = std::min(static_cast<size_t>(window_half), len); 71 int in_window_utf8[3] = { 0 }; 72 73 // Bootstrap histograms. 74 int last_c = 0; 75 int utf8_pos = 0; 76 for (int i = 0; i < in_window; ++i) { 77 int c = data[(pos + i) & mask]; 78 ++histogram[utf8_pos][c]; 79 ++in_window_utf8[utf8_pos]; 80 utf8_pos = UTF8Position(last_c, c, max_utf8); 81 last_c = c; 82 } 83 84 // Compute bit costs with sliding window. 85 for (int i = 0; i < len; ++i) { 86 if (i - window_half >= 0) { 87 // Remove a byte in the past. 88 int c = (i - window_half - 1) < 0 ? 89 0 : data[(pos + i - window_half - 1) & mask]; 90 int last_c = (i - window_half - 2) < 0 ? 91 0 : data[(pos + i - window_half - 2) & mask]; 92 int utf8_pos2 = UTF8Position(last_c, c, max_utf8); 93 --histogram[utf8_pos2][data[(pos + i - window_half) & mask]]; 94 --in_window_utf8[utf8_pos2]; 95 } 96 if (i + window_half < len) { 97 // Add a byte in the future. 98 int c = (i + window_half - 1) < 0 ? 99 0 : data[(pos + i + window_half - 1) & mask]; 100 int last_c = (i + window_half - 2) < 0 ? 101 0 : data[(pos + i + window_half - 2) & mask]; 102 int utf8_pos2 = UTF8Position(last_c, c, max_utf8); 103 ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]]; 104 ++in_window_utf8[utf8_pos2]; 105 } 106 int c = i < 1 ? 0 : data[(pos + i - 1) & mask]; 107 int last_c = i < 2 ? 0 : data[(pos + i - 2) & mask]; 108 int utf8_pos = UTF8Position(last_c, c, max_utf8); 109 int masked_pos = (pos + i) & mask; 110 int histo = histogram[utf8_pos][data[masked_pos]]; 111 if (histo == 0) { 112 histo = 1; 113 } 114 float lit_cost = log2(static_cast<double>(in_window_utf8[utf8_pos]) 115 / histo); 116 lit_cost += 0.02905; 117 if (lit_cost < 1.0) { 118 lit_cost *= 0.5; 119 lit_cost += 0.5; 120 } 121 // Make the first bytes more expensive -- seems to help, not sure why. 122 // Perhaps because the entropy source is changing its properties 123 // rapidly in the beginning of the file, perhaps because the beginning 124 // of the data is a statistical "anomaly". 125 if (i < 2000) { 126 lit_cost += 0.7 - ((2000 - i) / 2000.0 * 0.35); 127 } 128 cost[(pos + i) & cost_mask] = lit_cost; 129 } 130 } 131 132 void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, 133 size_t cost_mask, const uint8_t *data, 134 float *cost) { 135 int histogram[256] = { 0 }; 136 int window_half = 2000; 137 int in_window = std::min(static_cast<size_t>(window_half), len); 138 139 // Bootstrap histogram. 140 for (int i = 0; i < in_window; ++i) { 141 ++histogram[data[(pos + i) & mask]]; 142 } 143 144 // Compute bit costs with sliding window. 145 for (int i = 0; i < len; ++i) { 146 if (i - window_half >= 0) { 147 // Remove a byte in the past. 148 --histogram[data[(pos + i - window_half) & mask]]; 149 --in_window; 150 } 151 if (i + window_half < len) { 152 // Add a byte in the future. 153 ++histogram[data[(pos + i + window_half) & mask]]; 154 ++in_window; 155 } 156 int histo = histogram[data[(pos + i) & mask]]; 157 if (histo == 0) { 158 histo = 1; 159 } 160 float lit_cost = log2(static_cast<double>(in_window) / histo); 161 lit_cost += 0.029; 162 if (lit_cost < 1.0) { 163 lit_cost *= 0.5; 164 lit_cost += 0.5; 165 } 166 cost[(pos + i) & cost_mask] = lit_cost; 167 } 168 } 169 170 171 } // namespace brotli 172