Home | History | Annotate | Download | only in pending
      1 /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
      2  *
      3  * Copyright 2014 Rob Landley <rob (at) landley.net>
      4  *
      5  * The inflate/deflate code lives here, so the various things that use it
      6  * either live here or call these commands to pipe data through them.
      7  *
      8  * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
      9  * (gzip already replaces "uncompress".)
     10  *
     11  * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
     12  * LSB 4.1 has gzip, gunzip, and zcat
     13  * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
     14 
     15 // Accept many different kinds of command line argument.
     16 // Leave Lrg at end so flag values line up.
     17 
     18 USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
     19 
     20 //zip unzip gzip gunzip zcat
     21 
     22 config COMPRESS
     23   bool "compress"
     24   default n
     25   help
     26     usage: compress [-zgLR19] [FILE]
     27 
     28     Compress or decompress file (or stdin) using "deflate" algorithm.
     29 
     30     -1	min compression
     31     -9	max compression (default)
     32     -g	gzip (default)
     33     -L	zlib
     34     -R	raw
     35     -z	zip
     36 
     37 config DECOMPRESS
     38   bool "decompress"
     39   default n
     40   help
     41     usage: compress [-zglrcd9] [FILE]
     42 
     43     Compress or decompress file (or stdin) using "deflate" algorithm.
     44 
     45     -c	compress with -g gzip (default)  -l zlib  -r raw  -z zip
     46     -d	decompress (autodetects type)
     47 */
     48 
     49 #define FOR_compress
     50 #include "toys.h"
     51 
     52 GLOBALS(
     53   // Huffman codes: base offset and extra bits tables (length and distance)
     54   char lenbits[29], distbits[30];
     55   unsigned short lenbase[29], distbase[30];
     56   void *fixdisthuff, *fixlithuff;
     57 
     58   // CRC
     59   void (*crcfunc)(char *data, int len);
     60   unsigned crc;
     61 
     62   // Compressed data buffer
     63   char *data;
     64   unsigned pos, len;
     65   int infd, outfd;
     66 
     67   // Tables only used for deflation
     68   unsigned short *hashhead, *hashchain;
     69 )
     70 
     71 // little endian bit buffer
     72 struct bitbuf {
     73   int fd, bitpos, len, max;
     74   char buf[];
     75 };
     76 
     77 // malloc a struct bitbuf
     78 struct bitbuf *bitbuf_init(int fd, int size)
     79 {
     80   struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
     81 
     82   bb->max = size;
     83   bb->fd = fd;
     84 
     85   return bb;
     86 }
     87 
     88 // Advance bitpos without the overhead of recording bits
     89 void bitbuf_skip(struct bitbuf *bb, int bits)
     90 {
     91   int pos = bb->bitpos + bits, len = bb->len << 3;
     92 
     93   while (pos >= len) {
     94     pos -= len;
     95     len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
     96     if (bb->len < 1) perror_exit("inflate EOF");
     97   }
     98   bb->bitpos = pos;
     99 }
    100 
    101 // Optimized single bit inlined version
    102 static inline int bitbuf_bit(struct bitbuf *bb)
    103 {
    104   int bufpos = bb->bitpos>>3;
    105 
    106   if (bufpos == bb->len) {
    107     bitbuf_skip(bb, 0);
    108     bufpos = 0;
    109   }
    110 
    111   return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
    112 }
    113 
    114 // Fetch the next X bits from the bitbuf, little endian
    115 unsigned bitbuf_get(struct bitbuf *bb, int bits)
    116 {
    117   int result = 0, offset = 0;
    118 
    119   while (bits) {
    120     int click = bb->bitpos >> 3, blow, blen;
    121 
    122     // Load more data if buffer empty
    123     if (click == bb->len) bitbuf_skip(bb, click = 0);
    124 
    125     // grab bits from next byte
    126     blow = bb->bitpos & 7;
    127     blen = 8-blow;
    128     if (blen > bits) blen = bits;
    129     result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
    130     offset += blen;
    131     bits -= blen;
    132     bb->bitpos += blen;
    133   }
    134 
    135   return result;
    136 }
    137 
    138 void bitbuf_flush(struct bitbuf *bb)
    139 {
    140   if (!bb->bitpos) return;
    141 
    142   xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
    143   memset(bb->buf, 0, bb->max);
    144   bb->bitpos = 0;
    145 }
    146 
    147 void bitbuf_put(struct bitbuf *bb, int data, int len)
    148 {
    149   while (len) {
    150     int click = bb->bitpos >> 3, blow, blen;
    151 
    152     // Flush buffer if necessary
    153     if (click == bb->max) {
    154       bitbuf_flush(bb);
    155       click = 0;
    156     }
    157     blow = bb->bitpos & 7;
    158     blen = 8-blow;
    159     if (blen > len) blen = len;
    160     bb->buf[click] |= data << blow;
    161     bb->bitpos += blen;
    162     data >>= blen;
    163     len -= blen;
    164   }
    165 }
    166 
    167 static void output_byte(char sym)
    168 {
    169   int pos = TT.pos++ & 32767;
    170 
    171   TT.data[pos] = sym;
    172 
    173   if (pos == 32767) {
    174     xwrite(TT.outfd, TT.data, 32768);
    175     if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
    176   }
    177 }
    178 
    179 // Huffman coding uses bits to traverse a binary tree to a leaf node,
    180 // By placing frequently occurring symbols at shorter paths, frequently
    181 // used symbols may be represented in fewer bits than uncommon symbols.
    182 
    183 struct huff {
    184   unsigned short length[16];
    185   unsigned short symbol[288];
    186 };
    187 
    188 // Create simple huffman tree from array of bit lengths.
    189 
    190 // The symbols in the huffman trees are sorted (first by bit length
    191 // of the code to reach them, then by symbol number). This means that given
    192 // the bit length of each symbol, we can construct a unique tree.
    193 static void len2huff(struct huff *huff, char bitlen[], int len)
    194 {
    195   int offset[16];
    196   int i;
    197 
    198   // Count number of codes at each bit length
    199   memset(huff, 0, sizeof(struct huff));
    200   for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
    201 
    202   // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
    203   *huff->length = *offset = 0;
    204   for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
    205 
    206   for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
    207 }
    208 
    209 // Fetch and decode next huffman coded symbol from bitbuf.
    210 // This takes advantage of the sorting to navigate the tree as an array:
    211 // each time we fetch a bit we have all the codes at that bit level in
    212 // order with no gaps.
    213 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
    214 {
    215   unsigned short *length = huff->length;
    216   int start = 0, offset = 0;
    217 
    218   // Traverse through the bit lengths until our code is in this range
    219   for (;;) {
    220     offset = (offset << 1) | bitbuf_bit(bb);
    221     start += *++length;
    222     if ((offset -= *length) < 0) break;
    223     if ((length - huff->length) & 16) error_exit("bad symbol");
    224   }
    225 
    226   return huff->symbol[start + offset];
    227 }
    228 
    229 // Decompress deflated data from bitbuf to TT.outfd.
    230 static void inflate(struct bitbuf *bb)
    231 {
    232   TT.crc = ~0;
    233   // repeat until spanked
    234   for (;;) {
    235     int final, type;
    236 
    237     final = bitbuf_get(bb, 1);
    238     type = bitbuf_get(bb, 2);
    239 
    240     if (type == 3) error_exit("bad type");
    241 
    242     // Uncompressed block?
    243     if (!type) {
    244       int len, nlen;
    245 
    246       // Align to byte, read length
    247       bitbuf_skip(bb, (8-bb->bitpos)&7);
    248       len = bitbuf_get(bb, 16);
    249       nlen = bitbuf_get(bb, 16);
    250       if (len != (0xffff & ~nlen)) error_exit("bad len");
    251 
    252       // Dump literal output data
    253       while (len) {
    254         int pos = bb->bitpos >> 3, bblen = bb->len - pos;
    255         char *p = bb->buf+pos;
    256 
    257         // dump bytes until done or end of current bitbuf contents
    258         if (bblen > len) bblen = len;
    259         pos = bblen;
    260         while (pos--) output_byte(*(p++));
    261         bitbuf_skip(bb, bblen << 3);
    262         len -= bblen;
    263       }
    264 
    265     // Compressed block
    266     } else {
    267       struct huff *disthuff, *lithuff;
    268 
    269       // Dynamic huffman codes?
    270       if (type == 2) {
    271         struct huff *h2 = ((struct huff *)toybuf)+1;
    272         int i, litlen, distlen, hufflen;
    273         char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
    274                               "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
    275 
    276         // The huffman trees are stored as a series of bit lengths
    277         litlen = bitbuf_get(bb, 5)+257;  // max 288
    278         distlen = bitbuf_get(bb, 5)+1;   // max 32
    279         hufflen = bitbuf_get(bb, 4)+4;   // max 19
    280 
    281         // The literal and distance codes are themselves compressed, in
    282         // a complicated way: an array of bit lengths (hufflen many
    283         // entries, each 3 bits) is used to fill out an array of 19 entries
    284         // in a magic order, leaving the rest 0. Then make a tree out of it:
    285         memset(bits = toybuf+1, 0, 19);
    286         for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
    287         len2huff(h2, bits, 19);
    288 
    289         // Use that tree to read in the literal and distance bit lengths
    290         for (i = 0; i < litlen + distlen;) {
    291           int sym = huff_and_puff(bb, h2);
    292 
    293           // 0-15 are literals, 16 = repeat previous code 3-6 times,
    294           // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
    295           if (sym < 16) bits[i++] = sym;
    296           else {
    297             int len = sym & 2;
    298 
    299             len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
    300             memset(bits+i, bits[i-1] * !(sym&3), len);
    301             i += len;
    302           }
    303         }
    304         if (i > litlen+distlen) error_exit("bad tree");
    305 
    306         len2huff(lithuff = h2, bits, litlen);
    307         len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
    308 
    309       // Static huffman codes
    310       } else {
    311         lithuff = TT.fixlithuff;
    312         disthuff = TT.fixdisthuff;
    313       }
    314 
    315       // Use huffman tables to decode block of compressed symbols
    316       for (;;) {
    317         int sym = huff_and_puff(bb, lithuff);
    318 
    319         // Literal?
    320         if (sym < 256) output_byte(sym);
    321 
    322         // Copy range?
    323         else if (sym > 256) {
    324           int len, dist;
    325 
    326           sym -= 257;
    327           len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
    328           sym = huff_and_puff(bb, disthuff);
    329           dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
    330           sym = TT.pos & 32767;
    331 
    332           while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]);
    333 
    334         // End of block
    335         } else break;
    336       }
    337     }
    338 
    339     // Was that the last block?
    340     if (final) break;
    341   }
    342 
    343   if (TT.pos & 32767) {
    344     xwrite(TT.outfd, TT.data, TT.pos & 32767);
    345     if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
    346   }
    347 }
    348 
    349 // Deflate from TT.infd to bitbuf
    350 // For deflate, TT.len = input read, TT.pos = input consumed
    351 static void deflate(struct bitbuf *bb)
    352 {
    353   char *data = TT.data;
    354   int len, final = 0;
    355 
    356   TT.crc = ~0;
    357 
    358   while (!final) {
    359     // Read next half-window of data if we haven't hit EOF yet.
    360     len = readall(TT.infd, data+(TT.len&32768), 32768);
    361     if (len < 0) perror_exit("read"); // todo: add filename
    362     if (len != 32768) final++;
    363     if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
    364     // TT.len += len;  crcfunc advances len
    365 
    366     // store block as literal
    367     bitbuf_put(bb, final, 1);
    368     bitbuf_put(bb, 0, 1);
    369 
    370     bitbuf_put(bb, 0, (8-bb->bitpos)&7);
    371     bitbuf_put(bb, len, 16);
    372     bitbuf_put(bb, 0xffff & ~len, 16);
    373 
    374     // repeat until spanked
    375     while (TT.pos != TT.len) {
    376       unsigned pos = TT.pos & 65535;
    377 
    378       bitbuf_put(bb, data[pos], 8);
    379 
    380       // need to refill buffer?
    381       if (!(32767 & ++TT.pos) && !final) break;
    382     }
    383   }
    384   bitbuf_flush(bb);
    385 }
    386 
    387 // Allocate memory for deflate/inflate.
    388 static void init_deflate(int compress)
    389 {
    390   int i, n = 1;
    391 
    392   // compress needs 64k data and 32k each for hashhead and hashchain.
    393   // decompress just needs 32k data.
    394   TT.data = xmalloc(32768*(compress ? 4 : 1));
    395   if (compress) {
    396     TT.hashhead = (unsigned short *)(TT.data + 65536);
    397     TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
    398   }
    399 
    400   // Calculate lenbits, lenbase, distbits, distbase
    401   *TT.lenbase = 3;
    402   for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
    403     if (i>4) {
    404       if (!(i&3)) {
    405         TT.lenbits[i]++;
    406         n <<= 1;
    407       }
    408       if (i == 27) n--;
    409       else TT.lenbits[i+1] = TT.lenbits[i];
    410     }
    411     TT.lenbase[i+1] = n + TT.lenbase[i];
    412   }
    413   n = 0;
    414   for (i = 0; i<sizeof(TT.distbits); i++) {
    415     TT.distbase[i] = 1<<n;
    416     if (i) TT.distbase[i] += TT.distbase[i-1];
    417     if (i>3 && !(i&1)) n++;
    418     TT.distbits[i] = n;
    419   }
    420 
    421   // Init fixed huffman tables
    422   for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
    423   len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
    424   memset(toybuf, 5, 30);
    425   len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
    426 }
    427 
    428 // Return true/false whether we consumed a gzip header.
    429 static int is_gzip(struct bitbuf *bb)
    430 {
    431   int flags;
    432 
    433   // Confirm signature
    434   if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
    435     return 0;
    436   bitbuf_skip(bb, 6*8);
    437 
    438   // Skip extra, name, comment, header CRC fields
    439   if (flags & 4) bitbuf_skip(bb, 16);
    440   if (flags & 8) while (bitbuf_get(bb, 8));
    441   if (flags & 16) while (bitbuf_get(bb, 8));
    442   if (flags & 2) bitbuf_skip(bb, 16);
    443 
    444   return 1;
    445 }
    446 
    447 void gzip_crc(char *data, int len)
    448 {
    449   int i;
    450   unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
    451 
    452   crc = TT.crc;
    453   for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
    454   TT.crc = crc;
    455   TT.len += len;
    456 }
    457 
    458 static void do_gzip(int fd, char *name)
    459 {
    460   struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
    461 
    462   // Header from RFC 1952 section 2.2:
    463   // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
    464   // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
    465   // Operating System (FF=unknown)
    466 
    467   TT.infd = fd;
    468   xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
    469 
    470   // Use last 1k of toybuf for little endian crc table
    471   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
    472   TT.crcfunc = gzip_crc;
    473 
    474   deflate(bb);
    475 
    476   // tail: crc32, len32
    477 
    478   bitbuf_put(bb, 0, (8-bb->bitpos)&7);
    479   bitbuf_put(bb, ~TT.crc, 32);
    480   bitbuf_put(bb, TT.len, 32);
    481 
    482   bitbuf_flush(bb);
    483   free(bb);
    484 }
    485 
    486 static void do_zcat(int fd, char *name)
    487 {
    488   struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
    489 
    490   if (!is_gzip(bb)) error_exit("not gzip");
    491   TT.outfd = 1;
    492 
    493   // Use last 1k of toybuf for little endian crc table
    494   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
    495   TT.crcfunc = gzip_crc;
    496 
    497   inflate(bb);
    498 
    499   // tail: crc32, len32
    500 
    501   bitbuf_skip(bb, (8-bb->bitpos)&7);
    502   if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
    503     error_exit("bad crc");
    504   free(bb);
    505 }
    506 
    507 // Parse many different kinds of command line argument:
    508 
    509 void compress_main(void)
    510 {
    511   // todo: this
    512   printf("hello world");
    513 }
    514