// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Function to find backward reference copies.

#include "./backward_references.h"

#include <algorithm>
#include <vector>

#include "./command.h"

namespace brotli {

template<typename Hasher>
void CreateBackwardReferences(size_t num_bytes,
                              size_t position,
                              const uint8_t* ringbuffer,
                              const float* literal_cost,
                              size_t ringbuffer_mask,
                              const size_t max_backward_limit,
                              Hasher* hasher,
                              std::vector<Command>* commands) {
  // Length heuristic that seems to help, probably by better selection
  // of lazy matches of similar lengths.
  int insert_length = 0;
  size_t i = position & ringbuffer_mask;
  const int i_diff = position - i;
  const size_t i_end = i + num_bytes;

  const int random_heuristics_window_size = 512;
  int apply_random_heuristics = i + random_heuristics_window_size;

  double average_cost = 0.0;
  for (int k = position; k < position + num_bytes; ++k) {
    average_cost += literal_cost[k & ringbuffer_mask];
  }
  average_cost /= num_bytes;
  hasher->set_average_cost(average_cost);

  // An M1 match considers, for two consecutive copies, whether moving
  // one literal from the previous copy to the current one allows the
  // current copy to be coded more efficiently (because of the way the
  // static dictionary codes words). M1 matching improves text
  // compression density by ~0.15 %.
  bool match_found_M1 = false;
  size_t best_len_M1 = 0;
  size_t best_len_code_M1 = 0;
  size_t best_dist_M1 = 0;
  double best_score_M1 = 0;
  while (i + 2 < i_end) {
    size_t best_len = 0;
    size_t best_len_code = 0;
    size_t best_dist = 0;
    double best_score = 0;
    size_t max_distance = std::min(i + i_diff, max_backward_limit);
    bool in_dictionary;
    hasher->set_insert_length(insert_length);
    bool match_found = hasher->FindLongestMatch(
        ringbuffer, literal_cost, ringbuffer_mask,
        i + i_diff, i_end - i, max_distance,
        &best_len, &best_len_code, &best_dist, &best_score,
        &in_dictionary);
    bool best_in_dictionary = in_dictionary;
    if (match_found) {
      if (match_found_M1 && best_score_M1 > best_score) {
        // Two copies after each other. Take the last literal from the
        // previous copy, and use it as the first of this one.
        (commands->rbegin())->copy_length_ -= 1;
        (commands->rbegin())->copy_length_code_ -= 1;
        hasher->Store(ringbuffer + i, i + i_diff);
        --i;
        best_len = best_len_M1;
        best_len_code = best_len_code_M1;
        best_dist = best_dist_M1;
        best_score = best_score_M1;
        // in_dictionary doesn't need to be correct, but it is the only
        // reason why M1 matching should be beneficial here. Setting it here
        // will only disable further M1 matching against this copy.
        best_in_dictionary = true;
        in_dictionary = true;
      } else {
        // Found a match. Let's look for something even better ahead.
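        // Lazy matching: re-run the match search starting one byte later.
        // The later match is taken only if its score exceeds the current
        // one by at least cost_diff_lazy, a margin that charges for the
        // literal that would have to be emitted (plus a fixed bias against
        // lazy matching) and credits any bytes past the end of the current
        // copy that the later, longer match would cover. At most four such
        // one-byte delays are chained in a row.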
        int delayed_backward_references_in_row = 0;
        while (i + 4 < i_end &&
               delayed_backward_references_in_row < 4) {
          size_t best_len_2 = 0;
          size_t best_len_code_2 = 0;
          size_t best_dist_2 = 0;
          double best_score_2 = 0;
          max_distance = std::min(i + i_diff + 1, max_backward_limit);
          hasher->Store(ringbuffer + i, i + i_diff);
          match_found = hasher->FindLongestMatch(
              ringbuffer, literal_cost, ringbuffer_mask,
              i + i_diff + 1, i_end - i - 1, max_distance,
              &best_len_2, &best_len_code_2, &best_dist_2, &best_score_2,
              &in_dictionary);
          double cost_diff_lazy = 0;
          if (best_len >= 4) {
            cost_diff_lazy +=
                literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
          }
          {
            const int tail_length = best_len_2 - best_len + 1;
            for (int k = 0; k < tail_length; ++k) {
              cost_diff_lazy -=
                  literal_cost[(i + best_len + k) & ringbuffer_mask] -
                  average_cost;
            }
          }
          // If we are not inserting any symbols, inserting one is more
          // expensive than if we were inserting symbols anyway.
          if (insert_length < 1) {
            cost_diff_lazy += 0.97;
          }
          // Add bias to slightly avoid lazy matching.
          cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
          cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];

          if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
            // Ok, let's just write one byte for now and start a match from
            // the next byte.
            ++insert_length;
            ++delayed_backward_references_in_row;
            best_len = best_len_2;
            best_len_code = best_len_code_2;
            best_dist = best_dist_2;
            best_score = best_score_2;
            best_in_dictionary = in_dictionary;
            i++;
          } else {
            break;
          }
        }
      }
      apply_random_heuristics =
          i + 2 * best_len + random_heuristics_window_size;
      Command cmd;
      cmd.insert_length_ = insert_length;
      cmd.copy_length_ = best_len;
      cmd.copy_length_code_ = best_len_code;
      cmd.copy_distance_ = best_dist;
      commands->push_back(cmd);
      insert_length = 0;
      ++i;
      if (best_dist <= std::min(i + i_diff, max_backward_limit)) {
        hasher->set_last_distance(best_dist);
      }

      // Copy all copied literals to the hasher, except the last one.
      // We cannot store the last one yet, otherwise we couldn't find
      // the possible M1 match.
      for (int j = 1; j < best_len - 1; ++j) {
        if (i + 2 < i_end) {
          hasher->Store(ringbuffer + i, i + i_diff);
        }
        ++i;
      }
      // Prepare the M1 match.
      if (hasher->HasStaticDictionary() &&
          best_len >= 4 && i + 20 < i_end && !best_in_dictionary) {
        max_distance = std::min(i + i_diff, max_backward_limit);
        match_found_M1 = hasher->FindLongestMatch(
            ringbuffer, literal_cost, ringbuffer_mask,
            i + i_diff, i_end - i, max_distance,
            &best_len_M1, &best_len_code_M1, &best_dist_M1, &best_score_M1,
            &in_dictionary);
      } else {
        match_found_M1 = false;
        in_dictionary = false;
      }
      // This byte is just moved from the previous copy to the current one,
      // so there is no gain.
      best_score_M1 -= literal_cost[i & ringbuffer_mask];
      // Adjust for losing the opportunity for lazy matching.
      best_score_M1 -= 3.75;

      // Store the last byte of the match.
      if (i + 2 < i_end) {
        hasher->Store(ringbuffer + i, i + i_diff);
      }
      ++i;
    } else {
      match_found_M1 = false;
      ++insert_length;
      hasher->Store(ringbuffer + i, i + i_diff);
      ++i;
      // If we have not seen matches for a long time, we can skip some
      // match lookups. Unsuccessful match lookups are very expensive,
      // and this kind of heuristic speeds up compression quite a lot.
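      // Concretely: once roughly random_heuristics_window_size (512) bytes
      // go by without a copy being emitted, the position is advanced in
      // 8-byte jumps, storing hashes only for every second byte; after a
      // further 4 * 512 bytes the jumps grow to 16 bytes with hashes stored
      // only for every fourth byte. The skipped bytes are accounted for as
      // pending literal insertions.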
      if (i > apply_random_heuristics) {
        // Going through uncompressible data, jump.
        if (i > apply_random_heuristics + 4 * random_heuristics_window_size) {
          // It is quite a long time since we saw a copy, so we assume
          // that this data is not compressible, and store hashes less
          // often. Hashes of non-compressible data are less likely to
          // turn out to be useful in the future, too, so we store fewer
          // of them, so as not to flood the hash table and crowd out
          // hashes of good, compressible data.
          int i_jump = std::min(i + 16, i_end - 4);
          for (; i < i_jump; i += 4) {
            hasher->Store(ringbuffer + i, i + i_diff);
            insert_length += 4;
          }
        } else {
          int i_jump = std::min(i + 8, i_end - 2);
          for (; i < i_jump; i += 2) {
            hasher->Store(ringbuffer + i, i + i_diff);
            insert_length += 2;
          }
        }
      }
    }
  }
  insert_length += (i_end - i);

  if (insert_length > 0) {
    Command cmd;
    cmd.insert_length_ = insert_length;
    cmd.copy_length_ = 0;
    cmd.copy_distance_ = 0;
    commands->push_back(cmd);
  }
}

void CreateBackwardReferences(size_t num_bytes,
                              size_t position,
                              const uint8_t* ringbuffer,
                              const float* literal_cost,
                              size_t ringbuffer_mask,
                              const size_t max_backward_limit,
                              Hashers* hashers,
                              Hashers::Type hash_type,
                              std::vector<Command>* commands) {
  switch (hash_type) {
    case Hashers::HASH_15_8_4:
      CreateBackwardReferences(
          num_bytes, position, ringbuffer, literal_cost,
          ringbuffer_mask, max_backward_limit,
          hashers->hash_15_8_4.get(),
          commands);
      break;
    case Hashers::HASH_15_8_2:
      CreateBackwardReferences(
          num_bytes, position, ringbuffer, literal_cost,
          ringbuffer_mask, max_backward_limit,
          hashers->hash_15_8_2.get(),
          commands);
      break;
    default:
      break;
  }
}

}  // namespace brotli
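
// Illustrative call-site sketch. This is not part of this file's interface;
// the real caller lives elsewhere in the encoder, and the setup names below
// (hashers.Init, kWindowBits) are assumptions for illustration only:
//
//   std::vector<brotli::Command> commands;
//   brotli::Hashers hashers;
//   hashers.Init(brotli::Hashers::HASH_15_8_4);  // hypothetical setup call
//   brotli::CreateBackwardReferences(
//       num_bytes, position, ringbuffer, literal_cost, ringbuffer_mask,
//       (1 << kWindowBits) - 16,  // assumed max_backward_limit
//       &hashers, brotli::Hashers::HASH_15_8_4, &commands);
//
// The resulting commands alternate literal-insert runs with backward copies
// and are subsequently entropy coded by the encoder.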