Home | History | Annotate | Download | only in research
      1 #include <cstddef>
      2 #include <cstdio>
      3 #include <cstring>
      4 #include <fstream>
      5 #include <vector>
      6 
      7 #include "./deorummolae.h"
      8 #include "./durchschlag.h"
      9 #include "./sieve.h"
     10 
     11 #define METHOD_DM 0
     12 #define METHOD_SIEVE 1
     13 #define METHOD_DURCHSCHLAG 2
     14 #define METHOD_DISTILL 3
     15 #define METHOD_PURIFY 4
     16 
     17 static size_t readInt(const char* str) {
     18   size_t result = 0;
     19   if (str[0] == 0 || str[0] == '0') {
     20     return 0;
     21   }
     22   for (size_t i = 0; i < 13; ++i) {
     23     if (str[i] == 0) {
     24       return result;
     25     }
     26     if (str[i] == 'k' || str[i] == 'K') {
     27       if ((str[i + 1] == 0) && ((result << 10) > result)) {
     28         return result << 10;
     29       }
     30       return 0;
     31     }
     32     if (str[i] == 'm' || str[i] == 'M') {
     33       if ((str[i + 1] == 0) && ((result << 20) > result)) {
     34         return result << 20;
     35       }
     36       return 0;
     37     }
     38     if (str[i] < '0' || str[i] > '9') {
     39       return 0;
     40     }
     41     size_t next = (10 * result) + (str[i] - '0');
     42     if (next <= result) {
     43       return 0;
     44     }
     45     result = next;
     46   }
     47   return 0;
     48 }
     49 
     50 static std::string readFile(const std::string& path) {
     51   std::ifstream file(path);
     52   std::string content(
     53       (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
     54   return content;
     55 }
     56 
     57 static void writeFile(const char* file, const std::string& content) {
     58   std::ofstream outfile(file, std::ofstream::binary);
     59   outfile.write(content.c_str(), static_cast<std::streamsize>(content.size()));
     60   outfile.close();
     61 }
     62 
     63 static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
     64     const std::vector<size_t>& sizes, const uint8_t* data) {
     65   size_t offset = 0;
     66   for (size_t i = 0; i < pathArgs.size(); ++i) {
     67     int j = pathArgs[i];
     68     const char* file = argv[j];
     69     size_t sampleSize = sizes[i];
     70     std::ofstream outfile(file, std::ofstream::binary);
     71     outfile.write(reinterpret_cast<const char*>(data + offset),
     72         static_cast<std::streamsize>(sampleSize));
     73     outfile.close();
     74     offset += sampleSize;
     75   }
     76 }
     77 
     78 /* Returns "base file name" or its tail, if it contains '/' or '\'. */
     79 static const char* fileName(const char* path) {
     80   const char* separator_position = strrchr(path, '/');
     81   if (separator_position) path = separator_position + 1;
     82   separator_position = strrchr(path, '\\');
     83   if (separator_position) path = separator_position + 1;
     84   return path;
     85 }
     86 
     87 static void printHelp(const char* name) {
     88   fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
     89   fprintf(stderr,
     90       "Options:\n"
     91       "  --dm       use 'deorummolae' engine\n"
     92       "  --distill  rewrite samples; unique text parts are removed\n"
     93       "  --dsh      use 'durchschlag' engine (default)\n"
     94       "  --purify   rewrite samples; unique text parts are zeroed out\n"
     95       "  --sieve    use 'sieve' engine\n"
     96       "  -b#        set block length for 'durchschlag'; default: 1024\n"
     97       "  -s#        set slice length for 'distill', 'durchschlag', 'purify'\n"
     98       "             and 'sieve'; default: 16\n"
     99       "  -t#        set target dictionary size (limit); default: 16K\n"
    100       "  -u#        set minimum slice population (for rewrites); default: 2\n"
    101       "# is a decimal number with optional k/K/m/M suffix.\n"
    102       "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
    103       "         Completely unique samples might become empty files.\n\n");
    104 }
    105 
    106 int main(int argc, char const* argv[]) {
    107   int dictionaryArg = -1;
    108   int method = METHOD_DURCHSCHLAG;
    109   size_t sliceLen = 16;
    110   size_t targetSize = 16 << 10;
    111   size_t blockSize = 1024;
    112   size_t minimumPopulation = 2;
    113 
    114   std::vector<uint8_t> data;
    115   std::vector<size_t> sizes;
    116   std::vector<int> pathArgs;
    117   size_t total = 0;
    118   for (int i = 1; i < argc; ++i) {
    119     if (argv[i] == nullptr) {
    120       continue;
    121     }
    122     if (argv[i][0] == '-') {
    123       if (argv[i][1] == '-') {
    124         if (dictionaryArg != -1) {
    125           fprintf(stderr,
    126               "Method should be specified before dictionary / sample '%s'\n",
    127               argv[i]);
    128           exit(1);
    129         }
    130         if (std::strcmp("--sieve", argv[i]) == 0) {
    131           method = METHOD_SIEVE;
    132           continue;
    133         }
    134         if (std::strcmp("--dm", argv[i]) == 0) {
    135           method = METHOD_DM;
    136           continue;
    137         }
    138         if (std::strcmp("--dsh", argv[i]) == 0) {
    139           method = METHOD_DURCHSCHLAG;
    140           continue;
    141         }
    142         if (std::strcmp("--distill", argv[i]) == 0) {
    143           method = METHOD_DISTILL;
    144           continue;
    145         }
    146         if (std::strcmp("--purify", argv[i]) == 0) {
    147           method = METHOD_PURIFY;
    148           continue;
    149         }
    150         printHelp(fileName(argv[0]));
    151         fprintf(stderr, "Invalid option '%s'\n", argv[i]);
    152         exit(1);
    153       }
    154       if (argv[i][1] == 'b') {
    155         blockSize = readInt(&argv[i][2]);
    156         if (blockSize < 16 || blockSize > 65536) {
    157           printHelp(fileName(argv[0]));
    158           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
    159           exit(1);
    160         }
    161       } else if (argv[i][1] == 's') {
    162         sliceLen = readInt(&argv[i][2]);
    163         if (sliceLen < 4 || sliceLen > 256) {
    164           printHelp(fileName(argv[0]));
    165           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
    166           exit(1);
    167         }
    168       } else if (argv[i][1] == 't') {
    169         targetSize = readInt(&argv[i][2]);
    170         if (targetSize < 256 || targetSize > (1 << 25)) {
    171           printHelp(fileName(argv[0]));
    172           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
    173           exit(1);
    174         }
    175       } else if (argv[i][1] == 'u') {
    176         minimumPopulation = readInt(&argv[i][2]);
    177         if (minimumPopulation < 256 || minimumPopulation > 65536) {
    178           printHelp(fileName(argv[0]));
    179           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
    180           exit(1);
    181         }
    182       } else {
    183         printHelp(fileName(argv[0]));
    184         fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
    185         exit(1);
    186       }
    187       continue;
    188     }
    189     if (dictionaryArg == -1) {
    190       if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
    191         dictionaryArg = i;
    192         continue;
    193       }
    194     }
    195     std::string content = readFile(argv[i]);
    196     data.insert(data.end(), content.begin(), content.end());
    197     total += content.size();
    198     pathArgs.push_back(i);
    199     sizes.push_back(content.size());
    200   }
    201   bool wantDictionary = (dictionaryArg == -1);
    202   if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
    203     wantDictionary = false;
    204   }
    205   if (wantDictionary || total == 0) {
    206     printHelp(fileName(argv[0]));
    207     fprintf(stderr, "Not enough arguments\n");
    208     exit(1);
    209   }
    210 
    211   if (method == METHOD_SIEVE) {
    212     writeFile(argv[dictionaryArg], sieve_generate(
    213         targetSize, sliceLen, sizes, data.data()));
    214   } else if (method == METHOD_DM) {
    215     writeFile(argv[dictionaryArg], DM_generate(
    216         targetSize, sizes, data.data()));
    217   } else if (method == METHOD_DURCHSCHLAG) {
    218     writeFile(argv[dictionaryArg], durchschlag_generate(
    219         targetSize, sliceLen, blockSize, sizes, data.data()));
    220   } else if (method == METHOD_DISTILL) {
    221     durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
    222     writeSamples(argv, pathArgs, sizes, data.data());
    223   } else if (method == METHOD_PURIFY) {
    224     durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
    225     writeSamples(argv, pathArgs, sizes, data.data());
    226   } else {
    227     printHelp(fileName(argv[0]));
    228     fprintf(stderr, "Unknown generator\n");
    229     exit(1);
    230   }
    231   return 0;
    232 }
    233