Home | History | Annotate | Download | only in lib
      1 /* deflate.c - deflate/inflate code for gzip and friends
      2  *
      3  * Copyright 2014 Rob Landley <rob (at) landley.net>
      4  *
      5  * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
      6  * LSB 4.1 has gzip, gunzip, and zcat
      7  *
      8  * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
      9  */
     10 
     11 #include "toys.h"
     12 
     13 struct deflate {
     14   // Huffman codes: base offset and extra bits tables (length and distance)
     15   char lenbits[29], distbits[30];
     16   unsigned short lenbase[29], distbase[30];
     17   void *fixdisthuff, *fixlithuff;
     18 
     19   // CRC
     20   void (*crcfunc)(struct deflate *dd, char *data, int len);
     21   unsigned crctable[256], crc;
     22 
     23 
     24   // Tables only used for deflation
     25   unsigned short *hashhead, *hashchain;
     26 
     27   // Compressed data buffer (extra space malloced at end)
     28   unsigned pos, len;
     29   int infd, outfd;
     30   char data[];
     31 };
     32 
     33 // little endian bit buffer
     34 struct bitbuf {
     35   int fd, bitpos, len, max;
     36   char buf[];
     37 };
     38 
     39 // malloc a struct bitbuf
     40 struct bitbuf *bitbuf_init(int fd, int size)
     41 {
     42   struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
     43 
     44   bb->max = size;
     45   bb->fd = fd;
     46 
     47   return bb;
     48 }
     49 
     50 // Advance bitpos without the overhead of recording bits
     51 // Loads more data when input buffer empty
     52 void bitbuf_skip(struct bitbuf *bb, int bits)
     53 {
     54   int pos = bb->bitpos + bits, len = bb->len << 3;
     55 
     56   while (pos >= len) {
     57     pos -= len;
     58     len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
     59     if (bb->len < 1) perror_exit("inflate EOF");
     60   }
     61   bb->bitpos = pos;
     62 }
     63 
     64 // Optimized single bit inlined version
     65 static inline int bitbuf_bit(struct bitbuf *bb)
     66 {
     67   int bufpos = bb->bitpos>>3;
     68 
     69   if (bufpos == bb->len) {
     70     bitbuf_skip(bb, 0);
     71     bufpos = 0;
     72   }
     73 
     74   return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
     75 }
     76 
     77 // Fetch the next X bits from the bitbuf, little endian
     78 unsigned bitbuf_get(struct bitbuf *bb, int bits)
     79 {
     80   int result = 0, offset = 0;
     81 
     82   while (bits) {
     83     int click = bb->bitpos >> 3, blow, blen;
     84 
     85     // Load more data if buffer empty
     86     if (click == bb->len) bitbuf_skip(bb, click = 0);
     87 
     88     // grab bits from next byte
     89     blow = bb->bitpos & 7;
     90     blen = 8-blow;
     91     if (blen > bits) blen = bits;
     92     result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
     93     offset += blen;
     94     bits -= blen;
     95     bb->bitpos += blen;
     96   }
     97 
     98   return result;
     99 }
    100 
    101 void bitbuf_flush(struct bitbuf *bb)
    102 {
    103   if (!bb->bitpos) return;
    104 
    105   xwrite(bb->fd, bb->buf, (bb->bitpos+7)>>3);
    106   memset(bb->buf, 0, bb->max);
    107   bb->bitpos = 0;
    108 }
    109 
    110 void bitbuf_put(struct bitbuf *bb, int data, int len)
    111 {
    112   while (len) {
    113     int click = bb->bitpos >> 3, blow, blen;
    114 
    115     // Flush buffer if necessary
    116     if (click == bb->max) {
    117       bitbuf_flush(bb);
    118       click = 0;
    119     }
    120     blow = bb->bitpos & 7;
    121     blen = 8-blow;
    122     if (blen > len) blen = len;
    123     bb->buf[click] |= data << blow;
    124     bb->bitpos += blen;
    125     data >>= blen;
    126     len -= blen;
    127   }
    128 }
    129 
    130 static void output_byte(struct deflate *dd, char sym)
    131 {
    132   int pos = dd->pos++ & 32767;
    133 
    134   dd->data[pos] = sym;
    135 
    136   if (pos == 32767) {
    137     xwrite(dd->outfd, dd->data, 32768);
    138     if (dd->crcfunc) dd->crcfunc(dd, dd->data, 32768);
    139   }
    140 }
    141 
    142 // Huffman coding uses bits to traverse a binary tree to a leaf node,
    143 // By placing frequently occurring symbols at shorter paths, frequently
    144 // used symbols may be represented in fewer bits than uncommon symbols.
    145 // (length[0] isn't used but code's clearer if it's there.)
    146 
    147 struct huff {
    148   unsigned short length[16];  // How many symbols have this bit length?
    149   unsigned short symbol[288]; // sorted by bit length, then ascending order
    150 };
    151 
    152 // Create simple huffman tree from array of bit lengths.
    153 
    154 // The symbols in the huffman trees are sorted (first by bit length
    155 // of the code to reach them, then by symbol number). This means that given
    156 // the bit length of each symbol, we can construct a unique tree.
    157 static void len2huff(struct huff *huff, char bitlen[], int len)
    158 {
    159   int offset[16];
    160   int i;
    161 
    162   // Count number of codes at each bit length
    163   memset(huff, 0, sizeof(struct huff));
    164   for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
    165 
    166   // Sort symbols by bit length, then symbol. Get list of starting positions
    167   // for each group, then write each symbol to next position within its group.
    168   *huff->length = *offset = 0;
    169   for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
    170   for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
    171 }
    172 
    173 // Fetch and decode next huffman coded symbol from bitbuf.
    174 // This takes advantage of the sorting to navigate the tree as an array:
    175 // each time we fetch a bit we have all the codes at that bit level in
    176 // order with no gaps.
    177 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
    178 {
    179   unsigned short *length = huff->length;
    180   int start = 0, offset = 0;
    181 
    182   // Traverse through the bit lengths until our code is in this range
    183   for (;;) {
    184     offset = (offset << 1) | bitbuf_bit(bb);
    185     start += *++length;
    186     if ((offset -= *length) < 0) break;
    187     if ((length - huff->length) & 16) error_exit("bad symbol");
    188   }
    189 
    190   return huff->symbol[start + offset];
    191 }
    192 
    193 // Decompress deflated data from bitbuf to dd->outfd.
    194 static void inflate(struct deflate *dd, struct bitbuf *bb)
    195 {
    196   dd->crc = ~0;
    197   // repeat until spanked
    198   for (;;) {
    199     int final, type;
    200 
    201     final = bitbuf_get(bb, 1);
    202     type = bitbuf_get(bb, 2);
    203 
    204     if (type == 3) error_exit("bad type");
    205 
    206     // Uncompressed block?
    207     if (!type) {
    208       int len, nlen;
    209 
    210       // Align to byte, read length
    211       bitbuf_skip(bb, (8-bb->bitpos)&7);
    212       len = bitbuf_get(bb, 16);
    213       nlen = bitbuf_get(bb, 16);
    214       if (len != (0xffff & ~nlen)) error_exit("bad len");
    215 
    216       // Dump literal output data
    217       while (len) {
    218         int pos = bb->bitpos >> 3, bblen = bb->len - pos;
    219         char *p = bb->buf+pos;
    220 
    221         // dump bytes until done or end of current bitbuf contents
    222         if (bblen > len) bblen = len;
    223         pos = bblen;
    224         while (pos--) output_byte(dd, *(p++));
    225         bitbuf_skip(bb, bblen << 3);
    226         len -= bblen;
    227       }
    228 
    229     // Compressed block
    230     } else {
    231       struct huff *disthuff, *lithuff;
    232 
    233       // Dynamic huffman codes?
    234       if (type == 2) {
    235         struct huff *h2 = ((struct huff *)toybuf)+1;
    236         int i, litlen, distlen, hufflen;
    237         char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
    238                               "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
    239 
    240         // The huffman trees are stored as a series of bit lengths
    241         litlen = bitbuf_get(bb, 5)+257;  // max 288
    242         distlen = bitbuf_get(bb, 5)+1;   // max 32
    243         hufflen = bitbuf_get(bb, 4)+4;   // max 19
    244 
    245         // The literal and distance codes are themselves compressed, in
    246         // a complicated way: an array of bit lengths (hufflen many
    247         // entries, each 3 bits) is used to fill out an array of 19 entries
    248         // in a magic order, leaving the rest 0. Then make a tree out of it:
    249         memset(bits = toybuf+1, 0, 19);
    250         for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
    251         len2huff(h2, bits, 19);
    252 
    253         // Use that tree to read in the literal and distance bit lengths
    254         for (i = 0; i < litlen + distlen;) {
    255           int sym = huff_and_puff(bb, h2);
    256 
    257           // 0-15 are literals, 16 = repeat previous code 3-6 times,
    258           // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
    259           if (sym < 16) bits[i++] = sym;
    260           else {
    261             int len = sym & 2;
    262 
    263             len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
    264             memset(bits+i, bits[i-1] * !(sym&3), len);
    265             i += len;
    266           }
    267         }
    268         if (i > litlen+distlen) error_exit("bad tree");
    269 
    270         len2huff(lithuff = h2, bits, litlen);
    271         len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
    272 
    273       // Static huffman codes
    274       } else {
    275         lithuff = dd->fixlithuff;
    276         disthuff = dd->fixdisthuff;
    277       }
    278 
    279       // Use huffman tables to decode block of compressed symbols
    280       for (;;) {
    281         int sym = huff_and_puff(bb, lithuff);
    282 
    283         // Literal?
    284         if (sym < 256) output_byte(dd, sym);
    285 
    286         // Copy range?
    287         else if (sym > 256) {
    288           int len, dist;
    289 
    290           sym -= 257;
    291           len = dd->lenbase[sym] + bitbuf_get(bb, dd->lenbits[sym]);
    292           sym = huff_and_puff(bb, disthuff);
    293           dist = dd->distbase[sym] + bitbuf_get(bb, dd->distbits[sym]);
    294           sym = dd->pos & 32767;
    295 
    296           while (len--) output_byte(dd, dd->data[(dd->pos-dist) & 32767]);
    297 
    298         // End of block
    299         } else break;
    300       }
    301     }
    302 
    303     // Was that the last block?
    304     if (final) break;
    305   }
    306 
    307   if (dd->pos & 32767) {
    308     xwrite(dd->outfd, dd->data, dd->pos&32767);
    309     if (dd->crcfunc) dd->crcfunc(dd, dd->data, dd->pos&32767);
    310   }
    311 }
    312 
    313 // Deflate from dd->infd to bitbuf
    314 // For deflate, dd->len = input read, dd->pos = input consumed
    315 static void deflate(struct deflate *dd, struct bitbuf *bb)
    316 {
    317   char *data = dd->data;
    318   int len, final = 0;
    319 
    320   dd->crc = ~0;
    321 
    322   while (!final) {
    323     // Read next half-window of data if we haven't hit EOF yet.
    324     len = readall(dd->infd, data+(dd->len&32768), 32768);
    325     if (len < 0) perror_exit("read"); // todo: add filename
    326     if (len != 32768) final++;
    327     if (dd->crcfunc) dd->crcfunc(dd, data+(dd->len&32768), len);
    328     // dd->len += len;  crcfunc advances len TODO
    329 
    330     // store block as literal
    331     bitbuf_put(bb, final, 1);
    332     bitbuf_put(bb, 0, 1);
    333 
    334     bitbuf_put(bb, 0, (8-bb->bitpos)&7);
    335     bitbuf_put(bb, len, 16);
    336     bitbuf_put(bb, 0xffff & ~len, 16);
    337 
    338     // repeat until spanked
    339     while (dd->pos != dd->len) {
    340       unsigned pos = dd->pos&65535;
    341 
    342       bitbuf_put(bb, data[pos], 8);
    343 
    344       // need to refill buffer?
    345       if (!(32767 & ++dd->pos) && !final) break;
    346     }
    347   }
    348   bitbuf_flush(bb);
    349 }
    350 
    351 // Allocate memory for deflate/inflate.
    352 static struct deflate *init_deflate(int compress)
    353 {
    354   int i, n = 1;
    355   struct deflate *dd = xmalloc(sizeof(struct deflate)+32768*(compress ? 4 : 1));
    356 
    357 // TODO sizeof and order of these?
    358   // decompress needs 32k history, compress adds 64k hashhead and 32k hashchain
    359   if (compress) {
    360     dd->hashhead = (unsigned short *)(dd->data+65536);
    361     dd->hashchain = (unsigned short *)(dd->data+65536+32768);
    362   }
    363 
    364   // Calculate lenbits, lenbase, distbits, distbase
    365   *dd->lenbase = 3;
    366   for (i = 0; i<sizeof(dd->lenbits)-1; i++) {
    367     if (i>4) {
    368       if (!(i&3)) {
    369         dd->lenbits[i]++;
    370         n <<= 1;
    371       }
    372       if (i == 27) n--;
    373       else dd->lenbits[i+1] = dd->lenbits[i];
    374     }
    375     dd->lenbase[i+1] = n + dd->lenbase[i];
    376   }
    377   n = 0;
    378   for (i = 0; i<sizeof(dd->distbits); i++) {
    379     dd->distbase[i] = 1<<n;
    380     if (i) dd->distbase[i] += dd->distbase[i-1];
    381     if (i>3 && !(i&1)) n++;
    382     dd->distbits[i] = n;
    383   }
    384 
    385 // TODO layout and lifetime of this?
    386   // Init fixed huffman tables
    387   for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
    388   len2huff(dd->fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
    389   memset(toybuf, 5, 30);
    390   len2huff(dd->fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
    391 
    392   return dd;
    393 }
    394 
    395 // Return true/false whether we consumed a gzip header.
    396 static int is_gzip(struct bitbuf *bb)
    397 {
    398   int flags;
    399 
    400   // Confirm signature
    401   if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
    402     return 0;
    403   bitbuf_skip(bb, 6*8);
    404 
    405   // Skip extra, name, comment, header CRC fields
    406   if (flags & 4) bitbuf_skip(bb, 16);
    407   if (flags & 8) while (bitbuf_get(bb, 8));
    408   if (flags & 16) while (bitbuf_get(bb, 8));
    409   if (flags & 2) bitbuf_skip(bb, 16);
    410 
    411   return 1;
    412 }
    413 
    414 void gzip_crc(struct deflate *dd, char *data, int len)
    415 {
    416   int i;
    417   unsigned crc, *crc_table = dd->crctable;
    418 
    419   crc = dd->crc;
    420   for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
    421   dd->crc = crc;
    422   dd->len += len;
    423 }
    424 
    425 long long gzip_fd(int infd, int outfd)
    426 {
    427   struct bitbuf *bb = bitbuf_init(outfd, 4096);
    428   struct deflate *dd = init_deflate(1);
    429   long long rc;
    430 
    431   // Header from RFC 1952 section 2.2:
    432   // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
    433   // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
    434   // Operating System (FF=unknown)
    435 
    436   dd->infd = infd;
    437   xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
    438 
    439   // Little endian crc table
    440   crc_init(dd->crctable, 1);
    441   dd->crcfunc = gzip_crc;
    442 
    443   deflate(dd, bb);
    444 
    445   // tail: crc32, len32
    446 
    447   bitbuf_put(bb, 0, (8-bb->bitpos)&7);
    448   bitbuf_put(bb, ~dd->crc, 32);
    449   bitbuf_put(bb, dd->len, 32);
    450   rc = dd->len;
    451 
    452   bitbuf_flush(bb);
    453   free(bb);
    454   free(dd);
    455 
    456   return rc;
    457 }
    458 
    459 long long gunzip_fd(int infd, int outfd)
    460 {
    461   struct bitbuf *bb = bitbuf_init(infd, 4096);
    462   struct deflate *dd = init_deflate(0);
    463   long long rc;
    464 
    465   if (!is_gzip(bb)) error_exit("not gzip");
    466   dd->outfd = outfd;
    467 
    468   // Little endian crc table
    469   crc_init(dd->crctable, 1);
    470   dd->crcfunc = gzip_crc;
    471 
    472   inflate(dd, bb);
    473 
    474   // tail: crc32, len32
    475 
    476   bitbuf_skip(bb, (8-bb->bitpos)&7);
    477   if (~dd->crc != bitbuf_get(bb, 32) || dd->len != bitbuf_get(bb, 32))
    478     error_exit("bad crc");
    479 
    480   rc = dd->len;
    481   free(bb);
    482   free(dd);
    483 
    484   return rc;
    485 }
    486