Home | History | Annotate | Download | only in examples
      1 /* gzappend -- command to append to a gzip file
      2 
      3   Copyright (C) 2003, 2012 Mark Adler, all rights reserved
      4   version 1.2, 11 Oct 2012
      5 
      6   This software is provided 'as-is', without any express or implied
      7   warranty.  In no event will the author be held liable for any damages
      8   arising from the use of this software.
      9 
     10   Permission is granted to anyone to use this software for any purpose,
     11   including commercial applications, and to alter it and redistribute it
     12   freely, subject to the following restrictions:
     13 
     14   1. The origin of this software must not be misrepresented; you must not
     15      claim that you wrote the original software. If you use this software
     16      in a product, an acknowledgment in the product documentation would be
     17      appreciated but is not required.
     18   2. Altered source versions must be plainly marked as such, and must not be
     19      misrepresented as being the original software.
     20   3. This notice may not be removed or altered from any source distribution.
     21 
     22   Mark Adler    madler (at) alumni.caltech.edu
     23  */
     24 
     25 /*
     26  * Change history:
     27  *
     28  * 1.0  19 Oct 2003     - First version
     29  * 1.1   4 Nov 2003     - Expand and clarify some comments and notes
     30  *                      - Add version and copyright to help
     31  *                      - Send help to stdout instead of stderr
     32  *                      - Add some preemptive typecasts
     33  *                      - Add L to constants in lseek() calls
     34  *                      - Remove some debugging information in error messages
     35  *                      - Use new data_type definition for zlib 1.2.1
     36  *                      - Simplify and unify file operations
     37  *                      - Finish off gzip file in gztack()
     38  *                      - Use deflatePrime() instead of adding empty blocks
     39  *                      - Keep gzip file clean on appended file read errors
     40  *                      - Use in-place rotate instead of auxiliary buffer
     41  *                        (Why you ask?  Because it was fun to write!)
     42  * 1.2  11 Oct 2012     - Fix for proper z_const usage
     43  *                      - Check for input buffer malloc failure
     44  */
     45 
     46 /*
     47    gzappend takes a gzip file and appends to it, compressing files from the
     48    command line or data from stdin.  The gzip file is written to directly, to
     49    avoid copying that file, in case it's large.  Note that this results in the
     50    unfriendly behavior that if gzappend fails, the gzip file is corrupted.
     51 
     52    This program was written to illustrate the use of the new Z_BLOCK option of
     53    zlib 1.2.x's inflate() function.  This option returns from inflate() at each
     54    block boundary to facilitate locating and modifying the last block bit at
     55    the start of the final deflate block.  Also whether using Z_BLOCK or not,
     56    another required feature of zlib 1.2.x is that inflate() now provides the
     57    number of unused bits in the last input byte used.  gzappend will not work
     58    with versions of zlib earlier than 1.2.1.
     59 
     60    gzappend first decompresses the gzip file internally, discarding all but
     61    the last 32K of uncompressed data, and noting the location of the last block
     62    bit and the number of unused bits in the last byte of the compressed data.
     63    The gzip trailer containing the CRC-32 and length of the uncompressed data
     64    is verified.  This trailer will be later overwritten.
     65 
     66    Then the last block bit is cleared by seeking back in the file and rewriting
     67    the byte that contains it.  Seeking forward, the last byte of the compressed
     68    data is saved along with the number of unused bits to initialize deflate.
     69 
     70    A deflate process is initialized, using the last 32K of the uncompressed
     71    data from the gzip file to initialize the dictionary.  If the total
     72    uncompressed data was less than 32K, then all of it is used to initialize
     73    the dictionary.  The deflate output bit buffer is also initialized with the
     74    last bits from the original deflate stream.  From here on, the data to
     75    append is simply compressed using deflate, and written to the gzip file.
     76    When that is complete, the new CRC-32 and uncompressed length are written
     77    as the trailer of the gzip file.
     78  */
     79 
     80 #include <stdio.h>
     81 #include <stdlib.h>
     82 #include <string.h>
     83 #include <fcntl.h>
     84 #include <unistd.h>
     85 #include "zlib.h"
     86 
     87 #define local static
     88 #define LGCHUNK 14
     89 #define CHUNK (1U << LGCHUNK)
     90 #define DSIZE 32768U
     91 
     92 /* print an error message and terminate with extreme prejudice */
     93 local void bye(char *msg1, char *msg2)
     94 {
     95     fprintf(stderr, "gzappend error: %s%s\n", msg1, msg2);
     96     exit(1);
     97 }
     98 
     99 /* return the greatest common divisor of a and b using Euclid's algorithm,
    100    modified to be fast when one argument much greater than the other, and
    101    coded to avoid unnecessary swapping */
    102 local unsigned gcd(unsigned a, unsigned b)
    103 {
    104     unsigned c;
    105 
    106     while (a && b)
    107         if (a > b) {
    108             c = b;
    109             while (a - c >= c)
    110                 c <<= 1;
    111             a -= c;
    112         }
    113         else {
    114             c = a;
    115             while (b - c >= c)
    116                 c <<= 1;
    117             b -= c;
    118         }
    119     return a + b;
    120 }
    121 
    122 /* rotate list[0..len-1] left by rot positions, in place */
    123 local void rotate(unsigned char *list, unsigned len, unsigned rot)
    124 {
    125     unsigned char tmp;
    126     unsigned cycles;
    127     unsigned char *start, *last, *to, *from;
    128 
    129     /* normalize rot and handle degenerate cases */
    130     if (len < 2) return;
    131     if (rot >= len) rot %= len;
    132     if (rot == 0) return;
    133 
    134     /* pointer to last entry in list */
    135     last = list + (len - 1);
    136 
    137     /* do simple left shift by one */
    138     if (rot == 1) {
    139         tmp = *list;
    140         memcpy(list, list + 1, len - 1);
    141         *last = tmp;
    142         return;
    143     }
    144 
    145     /* do simple right shift by one */
    146     if (rot == len - 1) {
    147         tmp = *last;
    148         memmove(list + 1, list, len - 1);
    149         *list = tmp;
    150         return;
    151     }
    152 
    153     /* otherwise do rotate as a set of cycles in place */
    154     cycles = gcd(len, rot);             /* number of cycles */
    155     do {
    156         start = from = list + cycles;   /* start index is arbitrary */
    157         tmp = *from;                    /* save entry to be overwritten */
    158         for (;;) {
    159             to = from;                  /* next step in cycle */
    160             from += rot;                /* go right rot positions */
    161             if (from > last) from -= len;   /* (pointer better not wrap) */
    162             if (from == start) break;   /* all but one shifted */
    163             *to = *from;                /* shift left */
    164         }
    165         *to = tmp;                      /* complete the circle */
    166     } while (--cycles);
    167 }
    168 
    169 /* structure for gzip file read operations */
    170 typedef struct {
    171     int fd;                     /* file descriptor */
    172     int size;                   /* 1 << size is bytes in buf */
    173     unsigned left;              /* bytes available at next */
    174     unsigned char *buf;         /* buffer */
    175     z_const unsigned char *next;    /* next byte in buffer */
    176     char *name;                 /* file name for error messages */
    177 } file;
    178 
    179 /* reload buffer */
    180 local int readin(file *in)
    181 {
    182     int len;
    183 
    184     len = read(in->fd, in->buf, 1 << in->size);
    185     if (len == -1) bye("error reading ", in->name);
    186     in->left = (unsigned)len;
    187     in->next = in->buf;
    188     return len;
    189 }
    190 
    191 /* read from file in, exit if end-of-file */
    192 local int readmore(file *in)
    193 {
    194     if (readin(in) == 0) bye("unexpected end of ", in->name);
    195     return 0;
    196 }
    197 
    198 #define read1(in) (in->left == 0 ? readmore(in) : 0, \
    199                    in->left--, *(in->next)++)
    200 
    201 /* skip over n bytes of in */
    202 local void skip(file *in, unsigned n)
    203 {
    204     unsigned bypass;
    205 
    206     if (n > in->left) {
    207         n -= in->left;
    208         bypass = n & ~((1U << in->size) - 1);
    209         if (bypass) {
    210             if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1)
    211                 bye("seeking ", in->name);
    212             n -= bypass;
    213         }
    214         readmore(in);
    215         if (n > in->left)
    216             bye("unexpected end of ", in->name);
    217     }
    218     in->left -= n;
    219     in->next += n;
    220 }
    221 
    222 /* read a four-byte unsigned integer, little-endian, from in */
    223 unsigned long read4(file *in)
    224 {
    225     unsigned long val;
    226 
    227     val = read1(in);
    228     val += (unsigned)read1(in) << 8;
    229     val += (unsigned long)read1(in) << 16;
    230     val += (unsigned long)read1(in) << 24;
    231     return val;
    232 }
    233 
    234 /* skip over gzip header */
    235 local void gzheader(file *in)
    236 {
    237     int flags;
    238     unsigned n;
    239 
    240     if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file");
    241     if (read1(in) != 8) bye("unknown compression method in", in->name);
    242     flags = read1(in);
    243     if (flags & 0xe0) bye("unknown header flags set in", in->name);
    244     skip(in, 6);
    245     if (flags & 4) {
    246         n = read1(in);
    247         n += (unsigned)(read1(in)) << 8;
    248         skip(in, n);
    249     }
    250     if (flags & 8) while (read1(in) != 0) ;
    251     if (flags & 16) while (read1(in) != 0) ;
    252     if (flags & 2) skip(in, 2);
    253 }
    254 
    255 /* decompress gzip file "name", return strm with a deflate stream ready to
    256    continue compression of the data in the gzip file, and return a file
    257    descriptor pointing to where to write the compressed data -- the deflate
    258    stream is initialized to compress using level "level" */
    259 local int gzscan(char *name, z_stream *strm, int level)
    260 {
    261     int ret, lastbit, left, full;
    262     unsigned have;
    263     unsigned long crc, tot;
    264     unsigned char *window;
    265     off_t lastoff, end;
    266     file gz;
    267 
    268     /* open gzip file */
    269     gz.name = name;
    270     gz.fd = open(name, O_RDWR, 0);
    271     if (gz.fd == -1) bye("cannot open ", name);
    272     gz.buf = malloc(CHUNK);
    273     if (gz.buf == NULL) bye("out of memory", "");
    274     gz.size = LGCHUNK;
    275     gz.left = 0;
    276 
    277     /* skip gzip header */
    278     gzheader(&gz);
    279 
    280     /* prepare to decompress */
    281     window = malloc(DSIZE);
    282     if (window == NULL) bye("out of memory", "");
    283     strm->zalloc = Z_NULL;
    284     strm->zfree = Z_NULL;
    285     strm->opaque = Z_NULL;
    286     ret = inflateInit2(strm, -15);
    287     if (ret != Z_OK) bye("out of memory", " or library mismatch");
    288 
    289     /* decompress the deflate stream, saving append information */
    290     lastbit = 0;
    291     lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
    292     left = 0;
    293     strm->avail_in = gz.left;
    294     strm->next_in = gz.next;
    295     crc = crc32(0L, Z_NULL, 0);
    296     have = full = 0;
    297     do {
    298         /* if needed, get more input */
    299         if (strm->avail_in == 0) {
    300             readmore(&gz);
    301             strm->avail_in = gz.left;
    302             strm->next_in = gz.next;
    303         }
    304 
    305         /* set up output to next available section of sliding window */
    306         strm->avail_out = DSIZE - have;
    307         strm->next_out = window + have;
    308 
    309         /* inflate and check for errors */
    310         ret = inflate(strm, Z_BLOCK);
    311         if (ret == Z_STREAM_ERROR) bye("internal stream error!", "");
    312         if (ret == Z_MEM_ERROR) bye("out of memory", "");
    313         if (ret == Z_DATA_ERROR)
    314             bye("invalid compressed data--format violated in", name);
    315 
    316         /* update crc and sliding window pointer */
    317         crc = crc32(crc, window + have, DSIZE - have - strm->avail_out);
    318         if (strm->avail_out)
    319             have = DSIZE - strm->avail_out;
    320         else {
    321             have = 0;
    322             full = 1;
    323         }
    324 
    325         /* process end of block */
    326         if (strm->data_type & 128) {
    327             if (strm->data_type & 64)
    328                 left = strm->data_type & 0x1f;
    329             else {
    330                 lastbit = strm->data_type & 0x1f;
    331                 lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in;
    332             }
    333         }
    334     } while (ret != Z_STREAM_END);
    335     inflateEnd(strm);
    336     gz.left = strm->avail_in;
    337     gz.next = strm->next_in;
    338 
    339     /* save the location of the end of the compressed data */
    340     end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
    341 
    342     /* check gzip trailer and save total for deflate */
    343     if (crc != read4(&gz))
    344         bye("invalid compressed data--crc mismatch in ", name);
    345     tot = strm->total_out;
    346     if ((tot & 0xffffffffUL) != read4(&gz))
    347         bye("invalid compressed data--length mismatch in", name);
    348 
    349     /* if not at end of file, warn */
    350     if (gz.left || readin(&gz))
    351         fprintf(stderr,
    352             "gzappend warning: junk at end of gzip file overwritten\n");
    353 
    354     /* clear last block bit */
    355     lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET);
    356     if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
    357     *gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7)));
    358     lseek(gz.fd, -1L, SEEK_CUR);
    359     if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name);
    360 
    361     /* if window wrapped, build dictionary from window by rotating */
    362     if (full) {
    363         rotate(window, DSIZE, have);
    364         have = DSIZE;
    365     }
    366 
    367     /* set up deflate stream with window, crc, total_in, and leftover bits */
    368     ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
    369     if (ret != Z_OK) bye("out of memory", "");
    370     deflateSetDictionary(strm, window, have);
    371     strm->adler = crc;
    372     strm->total_in = tot;
    373     if (left) {
    374         lseek(gz.fd, --end, SEEK_SET);
    375         if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
    376         deflatePrime(strm, 8 - left, *gz.buf);
    377     }
    378     lseek(gz.fd, end, SEEK_SET);
    379 
    380     /* clean up and return */
    381     free(window);
    382     free(gz.buf);
    383     return gz.fd;
    384 }
    385 
    386 /* append file "name" to gzip file gd using deflate stream strm -- if last
    387    is true, then finish off the deflate stream at the end */
    388 local void gztack(char *name, int gd, z_stream *strm, int last)
    389 {
    390     int fd, len, ret;
    391     unsigned left;
    392     unsigned char *in, *out;
    393 
    394     /* open file to compress and append */
    395     fd = 0;
    396     if (name != NULL) {
    397         fd = open(name, O_RDONLY, 0);
    398         if (fd == -1)
    399             fprintf(stderr, "gzappend warning: %s not found, skipping ...\n",
    400                     name);
    401     }
    402 
    403     /* allocate buffers */
    404     in = malloc(CHUNK);
    405     out = malloc(CHUNK);
    406     if (in == NULL || out == NULL) bye("out of memory", "");
    407 
    408     /* compress input file and append to gzip file */
    409     do {
    410         /* get more input */
    411         len = read(fd, in, CHUNK);
    412         if (len == -1) {
    413             fprintf(stderr,
    414                     "gzappend warning: error reading %s, skipping rest ...\n",
    415                     name);
    416             len = 0;
    417         }
    418         strm->avail_in = (unsigned)len;
    419         strm->next_in = in;
    420         if (len) strm->adler = crc32(strm->adler, in, (unsigned)len);
    421 
    422         /* compress and write all available output */
    423         do {
    424             strm->avail_out = CHUNK;
    425             strm->next_out = out;
    426             ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH);
    427             left = CHUNK - strm->avail_out;
    428             while (left) {
    429                 len = write(gd, out + CHUNK - strm->avail_out - left, left);
    430                 if (len == -1) bye("writing gzip file", "");
    431                 left -= (unsigned)len;
    432             }
    433         } while (strm->avail_out == 0 && ret != Z_STREAM_END);
    434     } while (len != 0);
    435 
    436     /* write trailer after last entry */
    437     if (last) {
    438         deflateEnd(strm);
    439         out[0] = (unsigned char)(strm->adler);
    440         out[1] = (unsigned char)(strm->adler >> 8);
    441         out[2] = (unsigned char)(strm->adler >> 16);
    442         out[3] = (unsigned char)(strm->adler >> 24);
    443         out[4] = (unsigned char)(strm->total_in);
    444         out[5] = (unsigned char)(strm->total_in >> 8);
    445         out[6] = (unsigned char)(strm->total_in >> 16);
    446         out[7] = (unsigned char)(strm->total_in >> 24);
    447         len = 8;
    448         do {
    449             ret = write(gd, out + 8 - len, len);
    450             if (ret == -1) bye("writing gzip file", "");
    451             len -= ret;
    452         } while (len);
    453         close(gd);
    454     }
    455 
    456     /* clean up and return */
    457     free(out);
    458     free(in);
    459     if (fd > 0) close(fd);
    460 }
    461 
    462 /* process the compression level option if present, scan the gzip file, and
    463    append the specified files, or append the data from stdin if no other file
    464    names are provided on the command line -- the gzip file must be writable
    465    and seekable */
    466 int main(int argc, char **argv)
    467 {
    468     int gd, level;
    469     z_stream strm;
    470 
    471     /* ignore command name */
    472     argc--; argv++;
    473 
    474     /* provide usage if no arguments */
    475     if (*argv == NULL) {
    476         printf(
    477             "gzappend 1.2 (11 Oct 2012) Copyright (C) 2003, 2012 Mark Adler\n"
    478                );
    479         printf(
    480             "usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n");
    481         return 0;
    482     }
    483 
    484     /* set compression level */
    485     level = Z_DEFAULT_COMPRESSION;
    486     if (argv[0][0] == '-') {
    487         if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0)
    488             bye("invalid compression level", "");
    489         level = argv[0][1] - '0';
    490         if (*++argv == NULL) bye("no gzip file name after options", "");
    491     }
    492 
    493     /* prepare to append to gzip file */
    494     gd = gzscan(*argv++, &strm, level);
    495 
    496     /* append files on command line, or from stdin if none */
    497     if (*argv == NULL)
    498         gztack(NULL, gd, &strm, 1);
    499     else
    500         do {
    501             gztack(*argv, gd, &strm, argv[1] == NULL);
    502         } while (*++argv != NULL);
    503     return 0;
    504 }
    505