Home | History | Annotate | Download | only in pending
      1 /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
      2  *
      3  * Copyright 2014 Rob Landley <rob (at) landley.net>
      4  *
      5  * The inflate/deflate code lives here, so the various things that use it
      6  * either live here or call these commands to pipe data through them.
      7  *
      8  * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
      9  * (gzip already replaces "uncompress".)
     10  *
     11  * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
     12  * LSB 4.1 has gzip, gunzip, and zcat
     13  * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
     14 
     15 // Accept many different kinds of command line argument.
     16 // Leave Lrg at end so flag values line up.
     17 
     18 USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
     19 USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN))
     20 USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN))
     21 USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN))
     22 
     23 //zip unzip gzip gunzip zcat
     24 
     25 config COMPRESS
     26   bool "compress"
     27   default n
     28   help
     29     usage: compress [-zgLR19] [FILE]
     30 
     31     Compress or decompress file (or stdin) using "deflate" algorithm.
     32 
     33     -1	min compression
     34     -9	max compression (default)
     35     -g	gzip (default)
     36     -L	zlib
     37     -R	raw
     38     -z	zip
     39 
     40 config GZIP
     41   bool "gzip"
     42   default y
     43   depends on COMPRESS
     44   help
     45     usage: gzip [-19cfqStvzgLR] [FILE...]
     46 
     47     Compess (deflate) file(s). With no files, compress stdin to stdout.
     48 
     49     On successful decompression, compressed files are replaced with the
     50     uncompressed version. The input file is removed and replaced with
     51     a new file without the .gz extension (with same ownership/permissions).
     52 
     53     -1	Minimal compression (fastest)
     54     -9	Max compression (default)
     55     -c	cat to stdout (act as zcat)
     56     -f	force (if output file exists, input is tty, unrecognized extension)
     57     -q	quiet (no warnings)
     58     -S	specify exension (default .*)
     59     -t	test compressed file(s)
     60     -v	verbose (like -l, but compress files)
     61 
     62     Compression type:
     63     -g gzip (default)    -L zlib    -R raw    -z zip
     64 
     65 config GZIP_D
     66   bool
     67   default y
     68   depends on GZIP && DECOMPRESS
     69   help
     70     usage: gzip [-d]
     71 
     72     -d	decompress (act as gunzip)
     73 
     74 config DECOMPRESS
     75   bool "decompress"
     76   default n
     77   help
     78     usage: compress [-zglrcd9] [FILE]
     79 
     80     Compress or decompress file (or stdin) using "deflate" algorithm.
     81 
     82     -c	compress with -g gzip (default)  -l zlib  -r raw  -z zip
     83     -d	decompress (autodetects type)
     84 
     85 
     86 config ZCAT
     87   bool "zcat"
     88   default y
     89   depends on DECOMPRESS
     90   help
     91     usage: zcat [FILE...]
     92 
     93     Decompress deflated file(s) to stdout
     94 
     95 config GUNZIP
     96   bool "gunzip"
     97   default y
     98   depends on DECOMPRESS
     99   help
    100     usage: gunzip [-cflqStv] [FILE...]
    101 
    102     Decompess (deflate) file(s). With no files, compress stdin to stdout.
    103 
    104     On successful decompression, compressed files are replaced with the
    105     uncompressed version. The input file is removed and replaced with
    106     a new file without the .gz extension (with same ownership/permissions).
    107 
    108     -c	cat to stdout (act as zcat)
    109     -f	force (output file exists, input is tty, unrecognized extension)
    110     -l	list compressed/uncompressed/ratio/name for each input file.
    111     -q	quiet (no warnings)
    112     -S	specify exension (default .*)
    113     -t	test compressed file(s)
    114     -v	verbose (like -l, but decompress files)
    115 */
    116 
    117 #define FOR_compress
    118 #include "toys.h"
    119 
    120 GLOBALS(
    121   // Huffman codes: base offset and extra bits tables (length and distance)
    122   char lenbits[29], distbits[30];
    123   unsigned short lenbase[29], distbase[30];
    124   void *fixdisthuff, *fixlithuff;
    125 
    126   // CRC
    127   void (*crcfunc)(char *data, int len);
    128   unsigned crc;
    129 
    130   // Compressed data buffer
    131   char *data;
    132   unsigned pos, len;
    133   int infd, outfd;
    134 
    135   // Tables only used for deflation
    136   unsigned short *hashhead, *hashchain;
    137 )
    138 
    139 // little endian bit buffer
    140 struct bitbuf {
    141   int fd, bitpos, len, max;
    142   char buf[];
    143 };
    144 
    145 // malloc a struct bitbuf
    146 struct bitbuf *bitbuf_init(int fd, int size)
    147 {
    148   struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
    149 
    150   bb->max = size;
    151   bb->fd = fd;
    152 
    153   return bb;
    154 }
    155 
    156 // Advance bitpos without the overhead of recording bits
    157 void bitbuf_skip(struct bitbuf *bb, int bits)
    158 {
    159   int pos = bb->bitpos + bits, len = bb->len << 3;
    160 
    161   while (pos >= len) {
    162     pos -= len;
    163     len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
    164     if (bb->len < 1) perror_exit("inflate EOF");
    165   }
    166   bb->bitpos = pos;
    167 }
    168 
    169 // Optimized single bit inlined version
    170 static inline int bitbuf_bit(struct bitbuf *bb)
    171 {
    172   int bufpos = bb->bitpos>>3;
    173 
    174   if (bufpos == bb->len) {
    175     bitbuf_skip(bb, 0);
    176     bufpos = 0;
    177   }
    178 
    179   return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
    180 }
    181 
    182 // Fetch the next X bits from the bitbuf, little endian
    183 unsigned bitbuf_get(struct bitbuf *bb, int bits)
    184 {
    185   int result = 0, offset = 0;
    186 
    187   while (bits) {
    188     int click = bb->bitpos >> 3, blow, blen;
    189 
    190     // Load more data if buffer empty
    191     if (click == bb->len) bitbuf_skip(bb, click = 0);
    192 
    193     // grab bits from next byte
    194     blow = bb->bitpos & 7;
    195     blen = 8-blow;
    196     if (blen > bits) blen = bits;
    197     result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
    198     offset += blen;
    199     bits -= blen;
    200     bb->bitpos += blen;
    201   }
    202 
    203   return result;
    204 }
    205 
    206 void bitbuf_flush(struct bitbuf *bb)
    207 {
    208   if (!bb->bitpos) return;
    209 
    210   xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
    211   memset(bb->buf, 0, bb->max);
    212   bb->bitpos = 0;
    213 }
    214 
    215 void bitbuf_put(struct bitbuf *bb, int data, int len)
    216 {
    217   while (len) {
    218     int click = bb->bitpos >> 3, blow, blen;
    219 
    220     // Flush buffer if necessary
    221     if (click == bb->max) {
    222       bitbuf_flush(bb);
    223       click = 0;
    224     }
    225     blow = bb->bitpos & 7;
    226     blen = 8-blow;
    227     if (blen > len) blen = len;
    228     bb->buf[click] |= data << blow;
    229     bb->bitpos += blen;
    230     data >>= blen;
    231     len -= blen;
    232   }
    233 }
    234 
    235 static void output_byte(char sym)
    236 {
    237   int pos = TT.pos++ & 32767;
    238 
    239   TT.data[pos] = sym;
    240 
    241   if (!pos) {
    242     xwrite(TT.outfd, TT.data, 32768);
    243     if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
    244   }
    245 }
    246 
    247 // Huffman coding uses bits to traverse a binary tree to a leaf node,
    248 // By placing frequently occurring symbols at shorter paths, frequently
    249 // used symbols may be represented in fewer bits than uncommon symbols.
    250 
    251 struct huff {
    252   unsigned short length[16];
    253   unsigned short symbol[288];
    254 };
    255 
    256 // Create simple huffman tree from array of bit lengths.
    257 
    258 // The symbols in the huffman trees are sorted (first by bit length
    259 // of the code to reach them, then by symbol number). This means that given
    260 // the bit length of each symbol, we can construct a unique tree.
    261 static void len2huff(struct huff *huff, char bitlen[], int len)
    262 {
    263   int offset[16];
    264   int i;
    265 
    266   // Count number of codes at each bit length
    267   memset(huff, 0, sizeof(struct huff));
    268   for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
    269 
    270   // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
    271   *huff->length = *offset = 0;
    272   for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
    273 
    274   for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
    275 }
    276 
    277 // Fetch and decode next huffman coded symbol from bitbuf.
    278 // This takes advantage of the sorting to navigate the tree as an array:
    279 // each time we fetch a bit we have all the codes at that bit level in
    280 // order with no gaps.
    281 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
    282 {
    283   unsigned short *length = huff->length;
    284   int start = 0, offset = 0;
    285 
    286   // Traverse through the bit lengths until our code is in this range
    287   for (;;) {
    288     offset = (offset << 1) | bitbuf_bit(bb);
    289     start += *++length;
    290     if ((offset -= *length) < 0) break;
    291     if ((length - huff->length) & 16) error_exit("bad symbol");
    292   }
    293 
    294   return huff->symbol[start + offset];
    295 }
    296 
    297 // Decompress deflated data from bitbuf to TT.outfd.
    298 static void inflate(struct bitbuf *bb)
    299 {
    300   TT.crc = ~0;
    301   // repeat until spanked
    302   for (;;) {
    303     int final, type;
    304 
    305     final = bitbuf_get(bb, 1);
    306     type = bitbuf_get(bb, 2);
    307 
    308     if (type == 3) error_exit("bad type");
    309 
    310     // Uncompressed block?
    311     if (!type) {
    312       int len, nlen;
    313 
    314       // Align to byte, read length
    315       bitbuf_skip(bb, (8-bb->bitpos)&7);
    316       len = bitbuf_get(bb, 16);
    317       nlen = bitbuf_get(bb, 16);
    318       if (len != (0xffff & ~nlen)) error_exit("bad len");
    319 
    320       // Dump literal output data
    321       while (len) {
    322         int pos = bb->bitpos >> 3, bblen = bb->len - pos;
    323         char *p = bb->buf+pos;
    324 
    325         // dump bytes until done or end of current bitbuf contents
    326         if (bblen > len) bblen = len;
    327         pos = bblen;
    328         while (pos--) output_byte(*(p++));
    329         bitbuf_skip(bb, bblen << 3);
    330         len -= bblen;
    331       }
    332 
    333     // Compressed block
    334     } else {
    335       struct huff *disthuff, *lithuff;
    336 
    337       // Dynamic huffman codes?
    338       if (type == 2) {
    339         struct huff *h2 = ((struct huff *)toybuf)+1;
    340         int i, litlen, distlen, hufflen;
    341         char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
    342                               "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
    343 
    344         // The huffman trees are stored as a series of bit lengths
    345         litlen = bitbuf_get(bb, 5)+257;  // max 288
    346         distlen = bitbuf_get(bb, 5)+1;   // max 32
    347         hufflen = bitbuf_get(bb, 4)+4;   // max 19
    348 
    349         // The literal and distance codes are themselves compressed, in
    350         // a complicated way: an array of bit lengths (hufflen many
    351         // entries, each 3 bits) is used to fill out an array of 19 entries
    352         // in a magic order, leaving the rest 0. Then make a tree out of it:
    353         memset(bits = toybuf+1, 0, 19);
    354         for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
    355         len2huff(h2, bits, 19);
    356 
    357         // Use that tree to read in the literal and distance bit lengths
    358         for (i = 0; i < litlen + distlen;) {
    359           int sym = huff_and_puff(bb, h2);
    360 
    361           // 0-15 are literals, 16 = repeat previous code 3-6 times,
    362           // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
    363           if (sym < 16) bits[i++] = sym;
    364           else {
    365             int len = sym & 2;
    366 
    367             len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
    368             memset(bits+i, bits[i-1] * !(sym&3), len);
    369             i += len;
    370           }
    371         }
    372         if (i > litlen+distlen) error_exit("bad tree");
    373 
    374         len2huff(lithuff = h2, bits, litlen);
    375         len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
    376 
    377       // Static huffman codes
    378       } else {
    379         lithuff = TT.fixlithuff;
    380         disthuff = TT.fixdisthuff;
    381       }
    382 
    383       // Use huffman tables to decode block of compressed symbols
    384       for (;;) {
    385         int sym = huff_and_puff(bb, lithuff);
    386 
    387         // Literal?
    388         if (sym < 256) output_byte(sym);
    389 
    390         // Copy range?
    391         else if (sym > 256) {
    392           int len, dist;
    393 
    394           sym -= 257;
    395           len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
    396           sym = huff_and_puff(bb, disthuff);
    397           dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
    398           sym = TT.pos & 32767;
    399 
    400           while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]);
    401 
    402         // End of block
    403         } else break;
    404       }
    405     }
    406 
    407     // Was that the last block?
    408     if (final) break;
    409   }
    410 
    411   if (TT.pos & 32767) {
    412     xwrite(TT.outfd, TT.data, TT.pos & 32767);
    413     if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
    414   }
    415 }
    416 
    417 // Deflate from TT.infd to bitbuf
    418 // For deflate, TT.len = input read, TT.pos = input consumed
    419 static void deflate(struct bitbuf *bb)
    420 {
    421   char *data = TT.data;
    422   int len, final = 0;
    423 
    424   TT.crc = ~0;
    425 
    426   while (!final) {
    427     // Read next half-window of data if we haven't hit EOF yet.
    428     len = readall(TT.infd, data+(TT.len&32768), 32768);
    429     if (len < 0) perror_exit("read"); // todo: add filename
    430     if (len != 32768) final++;
    431     if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
    432     // TT.len += len;  crcfunc advances len
    433 
    434     // store block as literal
    435     bitbuf_put(bb, final, 1);
    436     bitbuf_put(bb, 0, 1);
    437 
    438     bitbuf_put(bb, 0, (8-bb->bitpos)&7);
    439     bitbuf_put(bb, len, 16);
    440     bitbuf_put(bb, 0xffff & ~len, 16);
    441 
    442     // repeat until spanked
    443     while (TT.pos != TT.len) {
    444       unsigned pos = TT.pos & 65535;
    445 
    446       bitbuf_put(bb, data[pos], 8);
    447 
    448       // need to refill buffer?
    449       if (!(32767 & ++TT.pos) && !final) break;
    450     }
    451   }
    452   bitbuf_flush(bb);
    453 }
    454 
    455 // Allocate memory for deflate/inflate.
    456 static void init_deflate(int compress)
    457 {
    458   int i, n = 1;
    459 
    460   // compress needs 64k data and 32k each for hashhead and hashchain.
    461   // decompress just needs 32k data.
    462   TT.data = xmalloc(32768*(compress ? 4 : 1));
    463   if (compress) {
    464     TT.hashhead = (unsigned short *)(TT.data + 65536);
    465     TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
    466   }
    467 
    468   // Calculate lenbits, lenbase, distbits, distbase
    469   *TT.lenbase = 3;
    470   for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
    471     if (i>4) {
    472       if (!(i&3)) {
    473         TT.lenbits[i]++;
    474         n <<= 1;
    475       }
    476       if (i == 27) n--;
    477       else TT.lenbits[i+1] = TT.lenbits[i];
    478     }
    479     TT.lenbase[i+1] = n + TT.lenbase[i];
    480   }
    481   n = 0;
    482   for (i = 0; i<sizeof(TT.distbits); i++) {
    483     TT.distbase[i] = 1<<n;
    484     if (i) TT.distbase[i] += TT.distbase[i-1];
    485     if (i>3 && !(i&1)) n++;
    486     TT.distbits[i] = n;
    487   }
    488 
    489   // Init fixed huffman tables
    490   for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
    491   len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
    492   memset(toybuf, 5, 30);
    493   len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
    494 }
    495 
    496 // Return true/false whether we consumed a gzip header.
    497 static int is_gzip(struct bitbuf *bb)
    498 {
    499   int flags;
    500 
    501   // Confirm signature
    502   if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
    503     return 0;
    504   bitbuf_skip(bb, 6*8);
    505 
    506   // Skip extra, name, comment, header CRC fields
    507   if (flags & 4) bitbuf_skip(bb, 16);
    508   if (flags & 8) while (bitbuf_get(bb, 8));
    509   if (flags & 16) while (bitbuf_get(bb, 8));
    510   if (flags & 2) bitbuf_skip(bb, 16);
    511 
    512   return 1;
    513 }
    514 
    515 void gzip_crc(char *data, int len)
    516 {
    517   int i;
    518   unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
    519 
    520   crc = TT.crc;
    521   for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
    522   TT.crc = crc;
    523   TT.len += len;
    524 }
    525 
    526 static void do_gzip(int fd, char *name)
    527 {
    528   struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
    529 
    530   // Header from RFC 1952 section 2.2:
    531   // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
    532   // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
    533   // Operating System (FF=unknown)
    534 
    535   TT.infd = fd;
    536   xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
    537 
    538   // Use last 1k of toybuf for little endian crc table
    539   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
    540   TT.crcfunc = gzip_crc;
    541 
    542   deflate(bb);
    543 
    544   // tail: crc32, len32
    545 
    546   bitbuf_put(bb, 0, (8-bb->bitpos)&7);
    547   bitbuf_put(bb, ~TT.crc, 32);
    548   bitbuf_put(bb, TT.len, 32);
    549 
    550   bitbuf_flush(bb);
    551   free(bb);
    552 }
    553 
    554 static void do_zcat(int fd, char *name)
    555 {
    556   struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
    557 
    558   if (!is_gzip(bb)) error_exit("not gzip");
    559   TT.outfd = 1;
    560 
    561   // Use last 1k of toybuf for little endian crc table
    562   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
    563   TT.crcfunc = gzip_crc;
    564 
    565   inflate(bb);
    566 
    567   // tail: crc32, len32
    568 
    569   bitbuf_skip(bb, (8-bb->bitpos)&7);
    570   if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
    571     error_exit("bad crc");
    572   free(bb);
    573 }
    574 
    575 // Parse many different kinds of command line argument:
    576 
    577 void compress_main(void)
    578 {
    579   // todo: this
    580   printf("hello world");
    581 }
    582 
    583 //#define CLEANUP_compress
    584 //#define FOR_zcat
    585 //#include "generated/flags.h"
    586 
    587 void zcat_main(void)
    588 {
    589   init_deflate(0);
    590 
    591   loopfiles(toys.optargs, do_zcat);
    592 }
    593 
    594 void gunzip_main(void)
    595 {
    596   init_deflate(0);
    597 
    598   loopfiles(toys.optargs, do_zcat);
    599 }
    600 
    601 void gzip_main(void)
    602 {
    603   init_deflate(1);
    604 
    605   loopfiles(toys.optargs, do_gzip);
    606 }
    607