1 #include "./deorummolae.h" 2 3 #include <array> 4 #include <vector> 5 6 #include "./esaxx/sais.hxx" 7 8 /* Used for quick SA-entry to file mapping. Each file is padded to size that 9 is a multiple of chunk size. */ 10 #define CHUNK_SIZE 64 11 /* Length of substring that is considered to be covered by dictionary string. */ 12 #define CUT_MATCH 6 13 /* Minimal dictionary entry size. */ 14 #define MIN_MATCH 24 15 16 /* Non tunable definitions. */ 17 #define CHUNK_MASK (CHUNK_SIZE - 1) 18 #define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6)) 19 20 /* File coverage: every bit set to 1 denotes a file covered by an isle. */ 21 typedef std::array<uint64_t, COVERAGE_SIZE> Coverage; 22 23 static int popcount(uint64_t u) { 24 return __builtin_popcountll(u); 25 } 26 27 /* Condense terminators and pad file entries. */ 28 static void rewriteText(std::vector<int>* text) { 29 int terminator = text->back(); 30 int prev = terminator; 31 size_t to = 0; 32 for (size_t from = 0; from < text->size(); ++from) { 33 int next = text->at(from); 34 if (next < 256 || prev < 256) { 35 text->at(to++) = next; 36 if (next >= 256) terminator = next; 37 } 38 prev = next; 39 } 40 text->resize(to); 41 if (text->empty()) text->push_back(terminator); 42 while (text->size() & CHUNK_MASK) text->push_back(terminator); 43 } 44 45 /* Reenumerate terminators for smaller alphabet. */ 46 static void remapTerminators(std::vector<int>* text, int* next_terminator) { 47 int prev = -1; 48 int x = 256; 49 for (size_t i = 0; i < text->size(); ++i) { 50 int next = text->at(i); 51 if (next < 256) { // Char. 52 // Do nothing. 53 } else if (prev < 256) { // Terminator after char. 54 next = x++; 55 } else { // Terminator after terminator. 56 next = prev; 57 } 58 text->at(i) = next; 59 prev = next; 60 } 61 *next_terminator = x; 62 } 63 64 /* Combine all file entries; create mapping position->file. */ 65 static void buildFullText(std::vector<std::vector<int>>* data, 66 std::vector<int>* full_text, std::vector<size_t>* file_map, 67 std::vector<size_t>* file_offset, int* next_terminator) { 68 file_map->resize(0); 69 file_offset->resize(0); 70 full_text->resize(0); 71 for (size_t i = 0; i < data->size(); ++i) { 72 file_offset->push_back(full_text->size()); 73 std::vector<int>& file = data->at(i); 74 rewriteText(&file); 75 full_text->insert(full_text->end(), file.begin(), file.end()); 76 file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i); 77 } 78 if (false) remapTerminators(full_text, next_terminator); 79 } 80 81 /* Build longest-common-prefix based on suffix array and text. 82 TODO: borrowed -> unknown efficiency. */ 83 static void buildLcp(std::vector<int>* text, std::vector<int>* sa, 84 std::vector<int>* lcp, std::vector<int>* invese_sa) { 85 int size = static_cast<int>(text->size()); 86 lcp->resize(size); 87 int k = 0; 88 lcp->at(size - 1) = 0; 89 for (int i = 0; i < size; ++i) { 90 if (invese_sa->at(i) == size - 1) { 91 k = 0; 92 continue; 93 } 94 int j = sa->at(invese_sa->at(i) + 1); // Suffix which follow i-th suffix. 95 while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) { 96 ++k; 97 } 98 lcp->at(invese_sa->at(i)) = k; 99 if (k > 0) --k; 100 } 101 } 102 103 /* Isle is a range in SA with LCP not less than some value. 104 When we raise the LCP requirement, the isle sunks and smaller isles appear 105 instead. */ 106 typedef struct { 107 int lcp; 108 int l; 109 int r; 110 Coverage coverage; 111 } Isle; 112 113 /* Helper routine for `cutMatch`. */ 114 static void poisonData(int pos, int length, std::vector<std::vector<int>>* data, 115 std::vector<size_t>* file_map, std::vector<size_t>* file_offset, 116 int* next_terminator) { 117 size_t f = file_map->at(pos / CHUNK_SIZE); 118 pos -= file_offset->at(f); 119 std::vector<int>& file = data->at(f); 120 int l = (length == CUT_MATCH) ? CUT_MATCH : 1; 121 for (int j = 0; j < l; j++, pos++) { 122 if (file[pos] >= 256) continue; 123 if (file[pos + 1] >= 256) { 124 file[pos] = file[pos + 1]; 125 } else if (pos > 0 && file[pos - 1] >= 256) { 126 file[pos] = file[pos - 1]; 127 } else { 128 file[pos] = (*next_terminator)++; 129 } 130 } 131 } 132 133 /* Remove substrings of a given match from files. 134 Substrings are replaced with unique terminators, so next iteration SA would 135 not allow to cross removed areas. */ 136 static void cutMatch(std::vector<std::vector<int>>* data, int index, int length, 137 std::vector<int>* sa, std::vector<int>* lcp, std::vector<int>* invese_sa, 138 int* next_terminator, std::vector<size_t>* file_map, 139 std::vector<size_t>* file_offset) { 140 while (length >= CUT_MATCH) { 141 int i = index; 142 while (lcp->at(i) >= length) { 143 i++; 144 poisonData( 145 sa->at(i), length, data, file_map, file_offset, next_terminator); 146 } 147 while (true) { 148 poisonData( 149 sa->at(index), length, data, file_map, file_offset, next_terminator); 150 if (index == 0 || lcp->at(index - 1) < length) break; 151 index--; 152 } 153 length--; 154 index = invese_sa->at(sa->at(index) + 1); 155 } 156 } 157 158 size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit, 159 size_t num_samples, const size_t* sample_sizes, 160 const uint8_t* sample_data) { 161 { 162 uint64_t tmp = 0; 163 if (popcount(tmp - 1u) != 64) { 164 fprintf(stderr, "64-bit platform is required\n"); 165 return 0; 166 } 167 } 168 169 /* Could use 256 + '0' for easier debugging. */ 170 int next_terminator = 256; 171 172 std::vector<std::vector<int>> data; 173 174 size_t offset = 0; 175 if (num_samples > MAX_FILES) num_samples = MAX_FILES; 176 for (size_t n = 0; n < num_samples; ++n) { 177 size_t next_offset = offset + sample_sizes[n]; 178 data.push_back( 179 std::vector<int>(sample_data + offset, sample_data + next_offset)); 180 offset = next_offset; 181 data.back().push_back(next_terminator++); 182 } 183 184 /* Most arrays are allocated once, and then just resized to smaller and 185 smaller sizes. */ 186 std::vector<int> full_text; 187 std::vector<size_t> file_map; 188 std::vector<size_t> file_offset; 189 std::vector<int> sa; 190 std::vector<int> invese_sa; 191 std::vector<int> lcp; 192 std::vector<Isle> isles; 193 std::vector<char> output_data; 194 size_t total = 0; 195 size_t total_cost = 0; 196 size_t best_cost; 197 Isle best_isle; 198 int min_count = num_samples; 199 200 while (true) { 201 size_t max_match = dictionary_size_limit - total; 202 buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator); 203 sa.resize(full_text.size()); 204 saisxx(full_text.data(), sa.data(), static_cast<int>(full_text.size()), 205 next_terminator); 206 invese_sa.resize(full_text.size()); 207 for (int i = 0; i < full_text.size(); ++i) invese_sa[sa[i]] = i; 208 buildLcp(&full_text, &sa, &lcp, &invese_sa); 209 210 /* Do not rebuild SA/LCP, just use different selection. */ 211 retry: 212 best_cost = 0; 213 best_isle = {0, 0, 0, {{0}}}; 214 isles.resize(0); 215 isles.push_back(best_isle); 216 217 for (int i = 0; i < static_cast<int>(lcp.size()); ++i) { 218 int l = i; 219 Coverage cov = {{0}}; 220 int f = file_map[sa[i] / CHUNK_SIZE]; 221 cov[f >> 6] = ((uint64_t)1) << (f & 63); 222 while (lcp[i] < isles.back().lcp) { 223 Isle& top = isles.back(); 224 top.r = i; 225 l = top.l; 226 for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x]; 227 int count = 0; 228 for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]); 229 int effective_lcp = top.lcp; 230 /* Restrict (last) dictionary entry length. */ 231 if (effective_lcp > max_match) effective_lcp = max_match; 232 int cost = count * effective_lcp; 233 if (cost > best_cost && count >= min_count && 234 effective_lcp >= MIN_MATCH) { 235 best_cost = cost; 236 best_isle = top; 237 best_isle.lcp = effective_lcp; 238 } 239 isles.pop_back(); 240 for (size_t x = 0; x < cov.size(); ++x) { 241 isles.back().coverage[x] |= cov[x]; 242 } 243 } 244 if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}}); 245 for (size_t x = 0; x < cov.size(); ++x) { 246 isles.back().coverage[x] |= cov[x]; 247 } 248 } 249 250 /* When saturated matches do not match length restrictions, lower the 251 saturation requirements. */ 252 if (best_cost == 0 || best_isle.lcp < MIN_MATCH) { 253 if (min_count >= 8) { 254 min_count = (min_count * 7) / 8; 255 fprintf(stderr, "Retry: min_count=%d\n", min_count); 256 goto retry; 257 } 258 break; 259 } 260 261 /* Save the entry. */ 262 fprintf(stderr, 263 "Savings: %zu+%zu, dictionary: %zu+%d\n", 264 total_cost, best_cost, total, best_isle.lcp); 265 for (size_t i = 0; i < best_isle.lcp; ++i) { 266 dictionary[total + i] = 267 static_cast<uint8_t>(full_text[sa[best_isle.l] + i]); 268 } 269 total += best_isle.lcp; 270 total_cost += best_cost; 271 cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, 272 &invese_sa, &next_terminator, &file_map, &file_offset); 273 if (total >= dictionary_size_limit) break; 274 } 275 return total; 276 } 277