1 #include "./deorummolae.h" 2 3 #include <array> 4 #include <cstdio> 5 6 #include "./esaxx/sais.hxx" 7 8 /* Used for quick SA-entry to file mapping. Each file is padded to size that 9 is a multiple of chunk size. */ 10 #define CHUNK_SIZE 64 11 /* Length of substring that is considered to be covered by dictionary string. */ 12 #define CUT_MATCH 6 13 /* Minimal dictionary entry size. */ 14 #define MIN_MATCH 24 15 16 /* Non tunable definitions. */ 17 #define CHUNK_MASK (CHUNK_SIZE - 1) 18 #define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6)) 19 20 /* File coverage: every bit set to 1 denotes a file covered by an isle. */ 21 typedef std::array<uint64_t, COVERAGE_SIZE> Coverage; 22 23 /* Symbol of text alphabet. */ 24 typedef int32_t TextChar; 25 26 /* Pointer to position in text. */ 27 typedef uint32_t TextIdx; 28 29 /* SAIS sarray_type; unfortunately, must be a signed type. */ 30 typedef int32_t TextSaIdx; 31 32 static size_t popcount(uint64_t u) { 33 return static_cast<size_t>(__builtin_popcountll(u)); 34 } 35 36 /* Condense terminators and pad file entries. */ 37 static void rewriteText(std::vector<TextChar>* text) { 38 TextChar terminator = text->back(); 39 TextChar prev = terminator; 40 TextIdx to = 0; 41 for (TextIdx from = 0; from < text->size(); ++from) { 42 TextChar next = text->at(from); 43 if (next < 256 || prev < 256) { 44 text->at(to++) = next; 45 if (next >= 256) terminator = next; 46 } 47 prev = next; 48 } 49 text->resize(to); 50 if (text->empty()) text->push_back(terminator); 51 while (text->size() & CHUNK_MASK) text->push_back(terminator); 52 } 53 54 /* Reenumerate terminators for smaller alphabet. */ 55 static void remapTerminators(std::vector<TextChar>* text, 56 TextChar* next_terminator) { 57 TextChar prev = -1; 58 TextChar x = 256; 59 for (TextIdx i = 0; i < text->size(); ++i) { 60 TextChar next = text->at(i); 61 if (next < 256) { // Char. 62 // Do nothing. 63 } else if (prev < 256) { // Terminator after char. 64 next = x++; 65 } else { // Terminator after terminator. 66 next = prev; 67 } 68 text->at(i) = next; 69 prev = next; 70 } 71 *next_terminator = x; 72 } 73 74 /* Combine all file entries; create mapping position->file. */ 75 static void buildFullText(std::vector<std::vector<TextChar>>* data, 76 std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map, 77 std::vector<TextIdx>* file_offset, TextChar* next_terminator) { 78 file_map->resize(0); 79 file_offset->resize(0); 80 full_text->resize(0); 81 for (TextIdx i = 0; i < data->size(); ++i) { 82 file_offset->push_back(full_text->size()); 83 std::vector<TextChar>& file = data->at(i); 84 rewriteText(&file); 85 full_text->insert(full_text->end(), file.begin(), file.end()); 86 file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i); 87 } 88 if (false) remapTerminators(full_text, next_terminator); 89 } 90 91 /* Build longest-common-prefix based on suffix array and text. 92 TODO: borrowed -> unknown efficiency. */ 93 static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa, 94 std::vector<TextIdx>* lcp, std::vector<TextIdx>* invese_sa) { 95 TextIdx size = static_cast<TextIdx>(text->size()); 96 lcp->resize(size); 97 TextIdx k = 0; 98 lcp->at(size - 1) = 0; 99 for (TextIdx i = 0; i < size; ++i) { 100 if (invese_sa->at(i) == size - 1) { 101 k = 0; 102 continue; 103 } 104 // Suffix which follow i-th suffix. 105 TextIdx j = sa->at(invese_sa->at(i) + 1); 106 while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) { 107 ++k; 108 } 109 lcp->at(invese_sa->at(i)) = k; 110 if (k > 0) --k; 111 } 112 } 113 114 /* Isle is a range in SA with LCP not less than some value. 115 When we raise the LCP requirement, the isle sunks and smaller isles appear 116 instead. */ 117 typedef struct { 118 TextIdx lcp; 119 TextIdx l; 120 TextIdx r; 121 Coverage coverage; 122 } Isle; 123 124 /* Helper routine for `cutMatch`. */ 125 static void poisonData(TextIdx pos, TextIdx length, 126 std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map, 127 std::vector<TextIdx>* file_offset, TextChar* next_terminator) { 128 TextIdx f = file_map->at(pos / CHUNK_SIZE); 129 pos -= file_offset->at(f); 130 std::vector<TextChar>& file = data->at(f); 131 TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1; 132 for (TextIdx j = 0; j < l; j++, pos++) { 133 if (file[pos] >= 256) continue; 134 if (file[pos + 1] >= 256) { 135 file[pos] = file[pos + 1]; 136 } else if (pos > 0 && file[pos - 1] >= 256) { 137 file[pos] = file[pos - 1]; 138 } else { 139 file[pos] = (*next_terminator)++; 140 } 141 } 142 } 143 144 /* Remove substrings of a given match from files. 145 Substrings are replaced with unique terminators, so next iteration SA would 146 not allow to cross removed areas. */ 147 static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index, 148 TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp, 149 std::vector<TextIdx>* invese_sa, TextChar* next_terminator, 150 std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) { 151 while (length >= CUT_MATCH) { 152 TextIdx i = index; 153 while (lcp->at(i) >= length) { 154 i++; 155 poisonData( 156 sa->at(i), length, data, file_map, file_offset, next_terminator); 157 } 158 while (true) { 159 poisonData( 160 sa->at(index), length, data, file_map, file_offset, next_terminator); 161 if (index == 0 || lcp->at(index - 1) < length) break; 162 index--; 163 } 164 length--; 165 index = invese_sa->at(sa->at(index) + 1); 166 } 167 } 168 169 std::string DM_generate(size_t dictionary_size_limit, 170 const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { 171 { 172 TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit); 173 if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) { 174 fprintf(stderr, "dictionary_size_limit is too large\n"); 175 return ""; 176 } 177 } 178 179 /* Could use 256 + '0' for easier debugging. */ 180 TextChar next_terminator = 256; 181 182 std::string output; 183 std::vector<std::vector<TextChar>> data; 184 185 TextIdx offset = 0; 186 size_t num_samples = sample_sizes.size(); 187 if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES; 188 for (size_t n = 0; n < num_samples; ++n) { 189 TextIdx delta = static_cast<TextIdx>(sample_sizes[n]); 190 if (delta != sample_sizes[n]) { 191 fprintf(stderr, "sample is too large\n"); 192 return ""; 193 } 194 if (delta == 0) { 195 fprintf(stderr, "0-length samples are prohibited\n"); 196 return ""; 197 } 198 TextIdx next_offset = offset + delta; 199 if (next_offset <= offset) { 200 fprintf(stderr, "corpus is too large\n"); 201 return ""; 202 } 203 data.push_back( 204 std::vector<TextChar>(sample_data + offset, sample_data + next_offset)); 205 offset = next_offset; 206 data.back().push_back(next_terminator++); 207 } 208 209 /* Most arrays are allocated once, and then just resized to smaller and 210 smaller sizes. */ 211 std::vector<TextChar> full_text; 212 std::vector<TextIdx> file_map; 213 std::vector<TextIdx> file_offset; 214 std::vector<TextIdx> sa; 215 std::vector<TextIdx> invese_sa; 216 std::vector<TextIdx> lcp; 217 std::vector<Isle> isles; 218 std::vector<char> output_data; 219 TextIdx total = 0; 220 TextIdx total_cost = 0; 221 TextIdx best_cost; 222 Isle best_isle; 223 size_t min_count = num_samples; 224 225 while (true) { 226 TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total; 227 buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator); 228 sa.resize(full_text.size()); 229 /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */ 230 saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()), 231 static_cast<TextChar>(full_text.size()), next_terminator); 232 invese_sa.resize(full_text.size()); 233 for (TextIdx i = 0; i < full_text.size(); ++i) { 234 invese_sa[sa[i]] = i; 235 } 236 buildLcp(&full_text, &sa, &lcp, &invese_sa); 237 238 /* Do not rebuild SA/LCP, just use different selection. */ 239 retry: 240 best_cost = 0; 241 best_isle = {0, 0, 0, {{0}}}; 242 isles.resize(0); 243 isles.push_back(best_isle); 244 245 for (TextIdx i = 0; i < lcp.size(); ++i) { 246 TextIdx l = i; 247 Coverage cov = {{0}}; 248 size_t f = file_map[sa[i] / CHUNK_SIZE]; 249 cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63); 250 while (lcp[i] < isles.back().lcp) { 251 Isle& top = isles.back(); 252 top.r = i; 253 l = top.l; 254 for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x]; 255 size_t count = 0; 256 for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]); 257 TextIdx effective_lcp = top.lcp; 258 /* Restrict (last) dictionary entry length. */ 259 if (effective_lcp > max_match) effective_lcp = max_match; 260 TextIdx cost = count * effective_lcp; 261 if (cost > best_cost && count >= min_count && 262 effective_lcp >= MIN_MATCH) { 263 best_cost = cost; 264 best_isle = top; 265 best_isle.lcp = effective_lcp; 266 } 267 isles.pop_back(); 268 for (size_t x = 0; x < cov.size(); ++x) { 269 isles.back().coverage[x] |= cov[x]; 270 } 271 } 272 if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}}); 273 for (size_t x = 0; x < cov.size(); ++x) { 274 isles.back().coverage[x] |= cov[x]; 275 } 276 } 277 278 /* When saturated matches do not match length restrictions, lower the 279 saturation requirements. */ 280 if (best_cost == 0 || best_isle.lcp < MIN_MATCH) { 281 if (min_count >= 8) { 282 min_count = (min_count * 7) / 8; 283 fprintf(stderr, "Retry: min_count=%zu\n", min_count); 284 goto retry; 285 } 286 break; 287 } 288 289 /* Save the entry. */ 290 fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n", 291 total_cost, best_cost, total, best_isle.lcp); 292 int* piece = &full_text[sa[best_isle.l]]; 293 output.insert(output.end(), piece, piece + best_isle.lcp); 294 total += best_isle.lcp; 295 total_cost += best_cost; 296 cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa, 297 &next_terminator, &file_map, &file_offset); 298 if (total >= dictionary_size_limit) break; 299 } 300 301 return output; 302 } 303