#include "./deorummolae.h"

#include <array>
#include <cstdio>

#include "./esaxx/sais.hxx"

/* Used for quick SA-entry to file mapping. Each file is padded to a size that
   is a multiple of the chunk size. */
#define CHUNK_SIZE 64
/* Length of a substring that is considered covered by a dictionary string. */
#define CUT_MATCH 6
/* Minimal dictionary entry size. */
#define MIN_MATCH 24

/* Non-tunable definitions. */
#define CHUNK_MASK (CHUNK_SIZE - 1)
#define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6))
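/* Note: COVERAGE_SIZE is the number of 64-bit words needed to hold one bit
   per input file; this assumes DM_LOG_MAX_FILES >= 6 (DM_LOG_MAX_FILES and
   DM_MAX_FILES come from deorummolae.h). */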

/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;

/* Symbol of the text alphabet. */
typedef int32_t TextChar;

/* Pointer to a position in the text. */
typedef uint32_t TextIdx;

/* SAIS sarray_type; unfortunately, it must be a signed type. */
typedef int32_t TextSaIdx;

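/* Count set bits. __builtin_popcountll is a GCC/Clang builtin; C++20
   std::popcount (from <bit>) would be a portable alternative. */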
static size_t popcount(uint64_t u) {
  return static_cast<size_t>(__builtin_popcountll(u));
}

/* Condense terminators (a run of terminators is collapsed into its first
   element) and pad each file entry to a multiple of CHUNK_SIZE. */
static void rewriteText(std::vector<TextChar>* text) {
  TextChar terminator = text->back();
  TextChar prev = terminator;
  TextIdx to = 0;
  for (TextIdx from = 0; from < text->size(); ++from) {
    TextChar next = text->at(from);
    if (next < 256 || prev < 256) {
      text->at(to++) = next;
      if (next >= 256) terminator = next;
    }
    prev = next;
  }
  text->resize(to);
  if (text->empty()) text->push_back(terminator);
  while (text->size() & CHUNK_MASK) text->push_back(terminator);
}
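
/* For example, if T1..T4 are terminators (values >= 256), then
   [a, b, T1, T2, T3, c, T4] is rewritten as [a, b, T1, c, T4], and the result
   is padded with T4 up to a multiple of CHUNK_SIZE. */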

/* Re-enumerate terminators for a smaller alphabet. */
static void remapTerminators(std::vector<TextChar>* text,
    TextChar* next_terminator) {
  TextChar prev = -1;
  TextChar x = 256;
  for (TextIdx i = 0; i < text->size(); ++i) {
    TextChar next = text->at(i);
    if (next < 256) {  // Char.
      // Do nothing.
    } else if (prev < 256) {  // Terminator after char.
      next = x++;
    } else {  // Terminator after terminator.
      next = prev;
    }
    text->at(i) = next;
    prev = next;
  }
  *next_terminator = x;
}

/* Combine all file entries; create the position->file mapping. */
static void buildFullText(std::vector<std::vector<TextChar>>* data,
    std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map,
    std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
  file_map->resize(0);
  file_offset->resize(0);
  full_text->resize(0);
  for (TextIdx i = 0; i < data->size(); ++i) {
    file_offset->push_back(static_cast<TextIdx>(full_text->size()));
    std::vector<TextChar>& file = data->at(i);
    rewriteText(&file);
    full_text->insert(full_text->end(), file.begin(), file.end());
    file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
  }
  /* Terminator remapping is intentionally disabled. */
  if (false) remapTerminators(full_text, next_terminator);
}
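
/* The LCP construction below appears to follow the classic Kasai et al.
   scheme: given the suffix array and its inverse, it computes all LCP values
   in O(n) total time by reusing the previous match length (k is decremented
   at most once per position). */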

/* Build the longest-common-prefix array from the suffix array and the text.
   TODO: this is borrowed code; its efficiency has not been assessed. */
static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
    std::vector<TextIdx>* lcp, std::vector<TextIdx>* inverse_sa) {
  TextIdx size = static_cast<TextIdx>(text->size());
  lcp->resize(size);
  TextIdx k = 0;
  lcp->at(size - 1) = 0;
  for (TextIdx i = 0; i < size; ++i) {
    if (inverse_sa->at(i) == size - 1) {
      k = 0;
      continue;
    }
    // The suffix that follows the i-th suffix in suffix array order.
    TextIdx j = sa->at(inverse_sa->at(i) + 1);
    while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
      ++k;
    }
    lcp->at(inverse_sa->at(i)) = k;
    if (k > 0) --k;
  }
}

/* An isle is a range in the SA with an LCP not less than some value.
   When we raise the LCP requirement, the isle sinks, and smaller isles appear
   in its place. */
typedef struct {
  TextIdx lcp;
  TextIdx l;
  TextIdx r;
  Coverage coverage;
} Isle;
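
/* Equivalently, in terms of the LCP array: an isle with value `lcp` is a
   maximal SA interval [l, r] whose adjacent LCP entries are all >= lcp, so
   every suffix in the interval shares a common prefix of at least lcp
   characters. */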

/* Helper routine for `cutMatch`: overwrite covered positions with terminator
   values. */
static void poisonData(TextIdx pos, TextIdx length,
    std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map,
    std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
  TextIdx f = file_map->at(pos / CHUNK_SIZE);
  pos -= file_offset->at(f);
  std::vector<TextChar>& file = data->at(f);
  TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1;
  for (TextIdx j = 0; j < l; j++, pos++) {
    if (file[pos] >= 256) continue;
    if (file[pos + 1] >= 256) {
      file[pos] = file[pos + 1];
    } else if (pos > 0 && file[pos - 1] >= 256) {
      file[pos] = file[pos - 1];
    } else {
      file[pos] = (*next_terminator)++;
    }
  }
}
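
/* Poisoning prefers an adjacent terminator value, so that consecutive
   poisoned positions share the same terminator and are later condensed by
   `rewriteText`. */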

/* Remove all substrings of a given match from the files.
   Substrings are replaced with unique terminators, so the SA built in the
   next iteration cannot cross the removed areas. */
static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
    TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp,
    std::vector<TextIdx>* inverse_sa, TextChar* next_terminator,
    std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) {
  while (length >= CUT_MATCH) {
    TextIdx i = index;
    while (lcp->at(i) >= length) {
      i++;
      poisonData(
          sa->at(i), length, data, file_map, file_offset, next_terminator);
    }
    while (true) {
      poisonData(
          sa->at(index), length, data, file_map, file_offset, next_terminator);
      if (index == 0 || lcp->at(index - 1) < length) break;
      index--;
    }
    length--;
    index = inverse_sa->at(sa->at(index) + 1);
  }
}
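
/* Note that poisoning invalidates the current SA/LCP; DM_generate rebuilds
   them at the top of its main loop before the next entry is selected. */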

std::string DM_generate(size_t dictionary_size_limit,
    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
  {
    TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit);
    if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) {
      fprintf(stderr, "dictionary_size_limit is too large\n");
      return "";
    }
  }

  /* Could use 256 + '0' for easier debugging. */
  TextChar next_terminator = 256;

  std::string output;
  std::vector<std::vector<TextChar>> data;

  TextIdx offset = 0;
  size_t num_samples = sample_sizes.size();
  if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES;
  for (size_t n = 0; n < num_samples; ++n) {
    TextIdx delta = static_cast<TextIdx>(sample_sizes[n]);
    if (delta != sample_sizes[n]) {
      fprintf(stderr, "sample is too large\n");
      return "";
    }
    if (delta == 0) {
      fprintf(stderr, "0-length samples are prohibited\n");
      return "";
    }
    TextIdx next_offset = offset + delta;
    if (next_offset <= offset) {
      fprintf(stderr, "corpus is too large\n");
      return "";
    }
    data.push_back(
        std::vector<TextChar>(sample_data + offset, sample_data + next_offset));
    offset = next_offset;
    /* Each sample is followed by its own unique terminator, so suffixes never
       match across file boundaries. */
    data.back().push_back(next_terminator++);
  }

  /* Most arrays are allocated once and then only resized to smaller and
     smaller sizes. */
  std::vector<TextChar> full_text;
  std::vector<TextIdx> file_map;
  std::vector<TextIdx> file_offset;
  std::vector<TextIdx> sa;
  std::vector<TextIdx> inverse_sa;
  std::vector<TextIdx> lcp;
  std::vector<Isle> isles;
  std::vector<char> output_data;
  TextIdx total = 0;
  TextIdx total_cost = 0;
  TextIdx best_cost;
  Isle best_isle;
  size_t min_count = num_samples;

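  /* Greedy search: repeatedly rebuild the SA and LCP over the (progressively
     poisoned) corpus, select the highest-scoring isle as the next dictionary
     entry, and cut its occurrences out of the samples. */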
  while (true) {
    TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total;
    buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
    sa.resize(full_text.size());
    /* Hopefully, a non-negative TextSaIdx has the same representation as its
       TextIdx counterpart. */
    saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()),
        static_cast<TextChar>(full_text.size()), next_terminator);
    inverse_sa.resize(full_text.size());
    for (TextIdx i = 0; i < full_text.size(); ++i) {
      inverse_sa[sa[i]] = i;
    }
    buildLcp(&full_text, &sa, &lcp, &inverse_sa);

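    /* The sweep below performs a single left-to-right pass over the LCP
       array, maintaining a stack of open isles. When the LCP drops, isles are
       popped; their file coverage is merged, and each popped isle is scored
       as (number of covered files) * (LCP capped by the remaining budget). */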
    /* Do not rebuild the SA/LCP, just use a different selection. */
  retry:
    best_cost = 0;
    best_isle = {0, 0, 0, {{0}}};
    isles.resize(0);
    isles.push_back(best_isle);

    for (TextIdx i = 0; i < lcp.size(); ++i) {
      TextIdx l = i;
      Coverage cov = {{0}};
      size_t f = file_map[sa[i] / CHUNK_SIZE];
      cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63);
      while (lcp[i] < isles.back().lcp) {
        Isle& top = isles.back();
        top.r = i;
        l = top.l;
        for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
        size_t count = 0;
        for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
        TextIdx effective_lcp = top.lcp;
        /* Restrict the (last) dictionary entry length. */
        if (effective_lcp > max_match) effective_lcp = max_match;
        TextIdx cost = count * effective_lcp;
        if (cost > best_cost && count >= min_count &&
            effective_lcp >= MIN_MATCH) {
          best_cost = cost;
          best_isle = top;
          best_isle.lcp = effective_lcp;
        }
        isles.pop_back();
        for (size_t x = 0; x < cov.size(); ++x) {
          isles.back().coverage[x] |= cov[x];
        }
      }
      if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
      for (size_t x = 0; x < cov.size(); ++x) {
        isles.back().coverage[x] |= cov[x];
      }
    }

    /* When no saturated match satisfies the length restrictions, lower the
       saturation requirement. */
    if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
      if (min_count >= 8) {
        min_count = (min_count * 7) / 8;
        fprintf(stderr, "Retry: min_count=%zu\n", min_count);
        goto retry;
      }
      break;
    }

    /* Save the entry. */
    fprintf(stderr, "Savings: %u+%u, dictionary: %u+%u\n",
        total_cost, best_cost, total, best_isle.lcp);
    TextChar* piece = &full_text[sa[best_isle.l]];
    output.insert(output.end(), piece, piece + best_isle.lcp);
    total += best_isle.lcp;
    total_cost += best_cost;
    cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &inverse_sa,
        &next_terminator, &file_map, &file_offset);
    if (total >= dictionary_size_limit) break;
  }

  return output;
}
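
/* Example usage (a minimal sketch; `build_dictionary` and its inputs are
   hypothetical, not part of this library):

     #include "./deorummolae.h"

     std::string build_dictionary(
         const std::vector<std::string>& samples, size_t limit) {
       std::vector<size_t> sizes;
       std::string blob;
       for (const std::string& s : samples) {
         sizes.push_back(s.size());
         blob += s;
       }
       return DM_generate(limit, sizes,
           reinterpret_cast<const uint8_t*>(blob.data()));
     }
*/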