#include "./deorummolae.h"

#include <array>
#include <cstdio>
#include <vector>

#include "./esaxx/sais.hxx"

/* Used for quick SA-entry to file mapping. Each file is padded to a size that
   is a multiple of the chunk size. */
#define CHUNK_SIZE 64
/* Length of a substring that is considered to be covered by a dictionary
   string. */
#define CUT_MATCH 6
/* Minimal dictionary entry size. */
#define MIN_MATCH 24
/* Non-tunable definitions. */
#define CHUNK_MASK (CHUNK_SIZE - 1)
#define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6))
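/* Note: COVERAGE_SIZE is the number of 64-bit words needed to keep one
   coverage bit per input file; this assumes MAX_FILES == (1 << LOG_MAX_FILES),
   with both constants presumably coming from deorummolae.h. */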

/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;

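/* Population count of a 64-bit word; relies on a GCC/Clang builtin, so other
   compilers would need a replacement here. */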
static int popcount(uint64_t u) {
  return __builtin_popcountll(u);
}

/* Condense terminators and pad file entries. */
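/* For example, with bytes a, b, c (< 256) and terminators T1..T4 (>= 256),
   [a, b, T1, T2, T3, c, T4] becomes [a, b, T1, c, T4] and is then padded with
   T4 until the length is a multiple of CHUNK_SIZE. */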
static void rewriteText(std::vector<int>* text) {
  int terminator = text->back();
  int prev = terminator;
  size_t to = 0;
  for (size_t from = 0; from < text->size(); ++from) {
    int next = text->at(from);
    if (next < 256 || prev < 256) {
      text->at(to++) = next;
      if (next >= 256) terminator = next;
    }
    prev = next;
  }
  text->resize(to);
  if (text->empty()) text->push_back(terminator);
  while (text->size() & CHUNK_MASK) text->push_back(terminator);
}

/* Re-enumerate terminators to get a smaller alphabet. */
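/* Each run of terminators is assigned a fresh, consecutive id starting at 256,
   which keeps the alphabet passed to suffix-array construction compact.
   Currently this is only reachable through disabled code in buildFullText. */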
static void remapTerminators(std::vector<int>* text, int* next_terminator) {
  int prev = -1;
  int x = 256;
  for (size_t i = 0; i < text->size(); ++i) {
    int next = text->at(i);
    if (next < 256) {  // Char.
      // Do nothing.
    } else if (prev < 256) {  // Terminator after char.
      next = x++;
    } else {  // Terminator after terminator.
      next = prev;
    }
    text->at(i) = next;
    prev = next;
  }
  *next_terminator = x;
}

/* Combine all file entries; create mapping position->file. */
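/* Because rewriteText pads every file to a multiple of CHUNK_SIZE, pushing
   file.size() / CHUNK_SIZE copies of the file index lets callers recover the
   owning file of any position as file_map[pos / CHUNK_SIZE]. The
   remapTerminators call below is intentionally left disabled. */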
static void buildFullText(std::vector<std::vector<int>>* data,
    std::vector<int>* full_text, std::vector<size_t>* file_map,
    std::vector<size_t>* file_offset, int* next_terminator) {
  file_map->resize(0);
  file_offset->resize(0);
  full_text->resize(0);
  for (size_t i = 0; i < data->size(); ++i) {
    file_offset->push_back(full_text->size());
    std::vector<int>& file = data->at(i);
    rewriteText(&file);
    full_text->insert(full_text->end(), file.begin(), file.end());
    file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
  }
  if (false) remapTerminators(full_text, next_terminator);
}

/* Build the longest-common-prefix array from the suffix array and the text.
   TODO: implementation is borrowed; its efficiency has not been assessed. */
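/* lcp[i] holds the length of the longest common prefix of the suffixes at SA
   positions i and i + 1 (the last entry is 0). The loop below appears to be
   Kasai's algorithm: suffixes are visited in text order and k decreases by at
   most one per step, so the total work is linear. */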
static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
    std::vector<int>* lcp, std::vector<int>* inverse_sa) {
  int size = static_cast<int>(text->size());
  lcp->resize(size);
  int k = 0;
  lcp->at(size - 1) = 0;
  for (int i = 0; i < size; ++i) {
    if (inverse_sa->at(i) == size - 1) {
      k = 0;
      continue;
    }
    // Suffix that follows the i-th suffix in SA order.
    int j = sa->at(inverse_sa->at(i) + 1);
    while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
      ++k;
    }
    lcp->at(inverse_sa->at(i)) = k;
    if (k > 0) --k;
  }
}

/* Isle is a range in SA with LCP not less than some value.
   When we raise the LCP requirement, the isle sinks and smaller isles appear
   instead. */
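/* `l` and `r` are the SA positions delimiting the isle; `coverage` marks which
   input files contribute at least one suffix to it. */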
typedef struct {
  int lcp;
  int l;
  int r;
  Coverage coverage;
} Isle;

/* Helper routine for `cutMatch`. */
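/* Overwrites the beginning of an occurrence of the selected match with
   terminator values, so that it cannot be matched again. When `length` equals
   CUT_MATCH a whole CUT_MATCH-long window is poisoned, otherwise a single
   position suffices; neighboring terminators are reused where possible to
   avoid growing the alphabet. */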
static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
    std::vector<size_t>* file_map, std::vector<size_t>* file_offset,
    int* next_terminator) {
  size_t f = file_map->at(pos / CHUNK_SIZE);
  pos -= file_offset->at(f);
  std::vector<int>& file = data->at(f);
  int l = (length == CUT_MATCH) ? CUT_MATCH : 1;
  for (int j = 0; j < l; j++, pos++) {
    if (file[pos] >= 256) continue;
    if (file[pos + 1] >= 256) {
      file[pos] = file[pos + 1];
    } else if (pos > 0 && file[pos - 1] >= 256) {
      file[pos] = file[pos - 1];
    } else {
      file[pos] = (*next_terminator)++;
    }
  }
}

/* Remove substrings of a given match from files.
   Substrings are replaced with unique terminators, so the SA built in the next
   iteration will not produce matches that cross the removed areas. */
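/* Occurrences of the match form a run of SA positions around `index`: all
   neighbors whose LCP is at least `length`. One position of every occurrence
   is poisoned, then the walk restarts one character later in the text with a
   length shorter by one, so the whole matched region (down to the final
   CUT_MATCH-long tail) ends up poisoned. */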
static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
    std::vector<int>* sa, std::vector<int>* lcp, std::vector<int>* inverse_sa,
    int* next_terminator, std::vector<size_t>* file_map,
    std::vector<size_t>* file_offset) {
  while (length >= CUT_MATCH) {
    int i = index;
    while (lcp->at(i) >= length) {
      i++;
      poisonData(
          sa->at(i), length, data, file_map, file_offset, next_terminator);
    }
    while (true) {
      poisonData(
          sa->at(index), length, data, file_map, file_offset, next_terminator);
      if (index == 0 || lcp->at(index - 1) < length) break;
      index--;
    }
    length--;
    index = inverse_sa->at(sa->at(index) + 1);
  }
}

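/* Greedy dictionary generation: repeatedly build the suffix array and LCP over
   the concatenated samples, pick the isle that maximizes
   (number of covered files) * (entry length) subject to MIN_MATCH and the
   current min_count requirement, emit that substring as a dictionary entry,
   poison its occurrences, and repeat until the size limit is reached or no
   suitable isle remains. */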
size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
    size_t num_samples, const size_t* sample_sizes,
    const uint8_t* sample_data) {
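  /* Sanity check: the popcount of a 64-bit all-ones value must be 64. */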
  {
    uint64_t tmp = 0;
    if (popcount(tmp - 1u) != 64) {
      fprintf(stderr, "64-bit platform is required\n");
      return 0;
    }
  }

  /* Could use 256 + '0' for easier debugging. */
  int next_terminator = 256;

  std::vector<std::vector<int>> data;

  size_t offset = 0;
  if (num_samples > MAX_FILES) num_samples = MAX_FILES;
  for (size_t n = 0; n < num_samples; ++n) {
    size_t next_offset = offset + sample_sizes[n];
    data.push_back(
        std::vector<int>(sample_data + offset, sample_data + next_offset));
    offset = next_offset;
    data.back().push_back(next_terminator++);
  }

  /* Most arrays are allocated once, and then just resized to smaller and
     smaller sizes. */
  std::vector<int> full_text;
  std::vector<size_t> file_map;
  std::vector<size_t> file_offset;
  std::vector<int> sa;
  std::vector<int> inverse_sa;
  std::vector<int> lcp;
  std::vector<Isle> isles;
  std::vector<char> output_data;
  size_t total = 0;
  size_t total_cost = 0;
  size_t best_cost;
  Isle best_isle;
  int min_count = static_cast<int>(num_samples);

  while (true) {
    size_t max_match = dictionary_size_limit - total;
    buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
    sa.resize(full_text.size());
    saisxx(full_text.data(), sa.data(), static_cast<int>(full_text.size()),
        next_terminator);
    inverse_sa.resize(full_text.size());
    for (int i = 0; i < static_cast<int>(full_text.size()); ++i) {
      inverse_sa[sa[i]] = i;
    }
    buildLcp(&full_text, &sa, &lcp, &inverse_sa);

    /* Do not rebuild SA/LCP, just use different selection. */
retry:
    best_cost = 0;
    best_isle = {0, 0, 0, {{0}}};
    isles.resize(0);
    isles.push_back(best_isle);

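    /* Stack-based scan of the LCP array: every stack entry is an isle that is
       still open. When lcp[i] drops below the LCP of the top isle, that isle
       is closed: its coverage determines how many files it spans, its cost is
       (covered file count) * (capped LCP), and the best-scoring candidate seen
       so far is remembered. */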
    for (int i = 0; i < static_cast<int>(lcp.size()); ++i) {
      int l = i;
      Coverage cov = {{0}};
      int f = static_cast<int>(file_map[sa[i] / CHUNK_SIZE]);
      cov[f >> 6] = ((uint64_t)1) << (f & 63);
      while (lcp[i] < isles.back().lcp) {
        Isle& top = isles.back();
        top.r = i;
        l = top.l;
        for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
        int count = 0;
        for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
        int effective_lcp = top.lcp;
        /* Restrict (last) dictionary entry length. */
        if (static_cast<size_t>(effective_lcp) > max_match) {
          effective_lcp = static_cast<int>(max_match);
        }
        int cost = count * effective_lcp;
        if (cost > best_cost && count >= min_count &&
            effective_lcp >= MIN_MATCH) {
          best_cost = cost;
          best_isle = top;
          best_isle.lcp = effective_lcp;
        }
        isles.pop_back();
        for (size_t x = 0; x < cov.size(); ++x) {
          isles.back().coverage[x] |= cov[x];
        }
      }
      if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
      for (size_t x = 0; x < cov.size(); ++x) {
        isles.back().coverage[x] |= cov[x];
      }
    }

    /* When the saturated matches do not satisfy the length restrictions, lower
       the saturation requirement. */
    if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
      if (min_count >= 8) {
        min_count = (min_count * 7) / 8;
        fprintf(stderr, "Retry: min_count=%d\n", min_count);
        goto retry;
      }
      break;
    }

    /* Save the entry. */
    fprintf(stderr,
        "Savings: %zu+%zu, dictionary: %zu+%d\n",
        total_cost, best_cost, total, best_isle.lcp);
    for (size_t i = 0; i < static_cast<size_t>(best_isle.lcp); ++i) {
      dictionary[total + i] =
          static_cast<uint8_t>(full_text[sa[best_isle.l] + i]);
    }
    total += best_isle.lcp;
    total_cost += best_cost;
    cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp,
        &inverse_sa, &next_terminator, &file_map, &file_offset);
    if (total >= dictionary_size_limit) break;
  }
  return total;
}