Home | History | Annotate | Download | only in examples
      1 /* gzjoin -- command to join gzip files into one gzip file
      2 
      3   Copyright (C) 2004 Mark Adler, all rights reserved
      4   version 1.0, 11 Dec 2004
      5 
      6   This software is provided 'as-is', without any express or implied
      7   warranty.  In no event will the author be held liable for any damages
      8   arising from the use of this software.
      9 
     10   Permission is granted to anyone to use this software for any purpose,
     11   including commercial applications, and to alter it and redistribute it
     12   freely, subject to the following restrictions:
     13 
     14   1. The origin of this software must not be misrepresented; you must not
     15      claim that you wrote the original software. If you use this software
     16      in a product, an acknowledgment in the product documentation would be
     17      appreciated but is not required.
     18   2. Altered source versions must be plainly marked as such, and must not be
     19      misrepresented as being the original software.
     20   3. This notice may not be removed or altered from any source distribution.
     21 
     22   Mark Adler    madler (at) alumni.caltech.edu
     23  */
     24 
     25 /*
     26  * Change history:
     27  *
     28  * 1.0  11 Dec 2004     - First version
     29  * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
     30  */
     31 
     32 /*
     33    gzjoin takes one or more gzip files on the command line and writes out a
     34    single gzip file that will uncompress to the concatenation of the
     35    uncompressed data from the individual gzip files.  gzjoin does this without
     36    having to recompress any of the data and without having to calculate a new
     37    crc32 for the concatenated uncompressed data.  gzjoin does however have to
     38    decompress all of the input data in order to find the bits in the compressed
     39    data that need to be modified to concatenate the streams.
     40 
     41    gzjoin does not do an integrity check on the input gzip files other than
     42    checking the gzip header and decompressing the compressed data.  They are
     43    otherwise assumed to be complete and correct.
     44 
     45    Each joint between gzip files removes at least 18 bytes of previous trailer
     46    and subsequent header, and inserts an average of about three bytes to the
     47    compressed data in order to connect the streams.  The output gzip file
     48    has a minimal ten-byte gzip header with no file name or modification time.
     49 
     50    This program was written to illustrate the use of the Z_BLOCK option of
     51    inflate() and the crc32_combine() function.  gzjoin will not compile with
     52    versions of zlib earlier than 1.2.3.
     53  */
     54 
     55 #include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
     56 #include <stdlib.h>     /* exit(), malloc(), free() */
     57 #include <fcntl.h>      /* open() */
     58 #include <unistd.h>     /* close(), read(), lseek() */
     59 #include "zlib.h"
     60     /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
     61 
     62 #define local static
     63 
     64 /* exit with an error (return a value to allow use in an expression) */
     65 local int bail(char *why1, char *why2)
     66 {
     67     fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
     68     exit(1);
     69     return 0;
     70 }
     71 
     72 /* -- simple buffered file input with access to the buffer -- */
     73 
/* Size in bytes of every buffer used here: the input buffer of a bin and the
   scratch output buffer in gzcopy().  bskip() masks with CHUNK - 1, which is
   why the value has to be a power of two. */
#define CHUNK 32768         /* must be a power of two and fit in unsigned */

/* bin: state of one buffered input file.  Data is read from fd into buf in
   pieces of up to CHUNK bytes; next and left describe the part of the buffer
   that has not been consumed yet. */
typedef struct {
    char *name;             /* name of file for error messages (bopen() stores
                               the caller's pointer, it does not copy) */
    int fd;                 /* file descriptor, as returned by open() */
    unsigned left;          /* bytes remaining at next */
    unsigned char *next;    /* next byte to read */
    unsigned char *buf;     /* allocated buffer of length CHUNK */
} bin;
     84 
     85 /* close a buffered file and free allocated memory */
     86 local void bclose(bin *in)
     87 {
     88     if (in != NULL) {
     89         if (in->fd != -1)
     90             close(in->fd);
     91         if (in->buf != NULL)
     92             free(in->buf);
     93         free(in);
     94     }
     95 }
     96 
     97 /* open a buffered file for input, return a pointer to type bin, or NULL on
     98    failure */
     99 local bin *bopen(char *name)
    100 {
    101     bin *in;
    102 
    103     in = malloc(sizeof(bin));
    104     if (in == NULL)
    105         return NULL;
    106     in->buf = malloc(CHUNK);
    107     in->fd = open(name, O_RDONLY, 0);
    108     if (in->buf == NULL || in->fd == -1) {
    109         bclose(in);
    110         return NULL;
    111     }
    112     in->left = 0;
    113     in->next = in->buf;
    114     in->name = name;
    115     return in;
    116 }
    117 
    118 /* load buffer from file, return -1 on read error, 0 or 1 on success, with
    119    1 indicating that end-of-file was reached */
    120 local int bload(bin *in)
    121 {
    122     long len;
    123 
    124     if (in == NULL)
    125         return -1;
    126     if (in->left != 0)
    127         return 0;
    128     in->next = in->buf;
    129     do {
    130         len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
    131         if (len < 0)
    132             return -1;
    133         in->left += (unsigned)len;
    134     } while (len != 0 && in->left < CHUNK);
    135     return len == 0 ? 1 : 0;
    136 }
    137 
    138 /* get a byte from the file, bail if end of file */
    139 #define bget(in) (in->left ? 0 : bload(in), \
    140                   in->left ? (in->left--, *(in->next)++) : \
    141                     bail("unexpected end of file on ", in->name))
    142 
    143 /* get a four-byte little-endian unsigned integer from file */
    144 local unsigned long bget4(bin *in)
    145 {
    146     unsigned long val;
    147 
    148     val = bget(in);
    149     val += (unsigned long)(bget(in)) << 8;
    150     val += (unsigned long)(bget(in)) << 16;
    151     val += (unsigned long)(bget(in)) << 24;
    152     return val;
    153 }
    154 
    155 /* skip bytes in file */
    156 local void bskip(bin *in, unsigned skip)
    157 {
    158     /* check pointer */
    159     if (in == NULL)
    160         return;
    161 
    162     /* easy case -- skip bytes in buffer */
    163     if (skip <= in->left) {
    164         in->left -= skip;
    165         in->next += skip;
    166         return;
    167     }
    168 
    169     /* skip what's in buffer, discard buffer contents */
    170     skip -= in->left;
    171     in->left = 0;
    172 
    173     /* seek past multiples of CHUNK bytes */
    174     if (skip > CHUNK) {
    175         unsigned left;
    176 
    177         left = skip & (CHUNK - 1);
    178         if (left == 0) {
    179             /* exact number of chunks: seek all the way minus one byte to check
    180                for end-of-file with a read */
    181             lseek(in->fd, skip - 1, SEEK_CUR);
    182             if (read(in->fd, in->buf, 1) != 1)
    183                 bail("unexpected end of file on ", in->name);
    184             return;
    185         }
    186 
    187         /* skip the integral chunks, update skip with remainder */
    188         lseek(in->fd, skip - left, SEEK_CUR);
    189         skip = left;
    190     }
    191 
    192     /* read more input and skip remainder */
    193     bload(in);
    194     if (skip > in->left)
    195         bail("unexpected end of file on ", in->name);
    196     in->left -= skip;
    197     in->next += skip;
    198 }
    199 
    200 /* -- end of buffered input functions -- */
    201 
    202 /* skip the gzip header from file in */
    203 local void gzhead(bin *in)
    204 {
    205     int flags;
    206 
    207     /* verify gzip magic header and compression method */
    208     if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
    209         bail(in->name, " is not a valid gzip file");
    210 
    211     /* get and verify flags */
    212     flags = bget(in);
    213     if ((flags & 0xe0) != 0)
    214         bail("unknown reserved bits set in ", in->name);
    215 
    216     /* skip modification time, extra flags, and os */
    217     bskip(in, 6);
    218 
    219     /* skip extra field if present */
    220     if (flags & 4) {
    221         unsigned len;
    222 
    223         len = bget(in);
    224         len += (unsigned)(bget(in)) << 8;
    225         bskip(in, len);
    226     }
    227 
    228     /* skip file name if present */
    229     if (flags & 8)
    230         while (bget(in) != 0)
    231             ;
    232 
    233     /* skip comment if present */
    234     if (flags & 16)
    235         while (bget(in) != 0)
    236             ;
    237 
    238     /* skip header crc if present */
    239     if (flags & 2)
    240         bskip(in, 2);
    241 }
    242 
    243 /* write a four-byte little-endian unsigned integer to out */
    244 local void put4(unsigned long val, FILE *out)
    245 {
    246     putc(val & 0xff, out);
    247     putc((val >> 8) & 0xff, out);
    248     putc((val >> 16) & 0xff, out);
    249     putc((val >> 24) & 0xff, out);
    250 }
    251 
    252 /* Load up zlib stream from buffered input, bail if end of file */
    253 local void zpull(z_streamp strm, bin *in)
    254 {
    255     if (in->left == 0)
    256         bload(in);
    257     if (in->left == 0)
    258         bail("unexpected end of file on ", in->name);
    259     strm->avail_in = in->left;
    260     strm->next_in = in->next;
    261 }
    262 
    263 /* Write header for gzip file to out and initialize trailer. */
    264 local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
    265 {
    266     fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
    267     *crc = crc32(0L, Z_NULL, 0);
    268     *tot = 0;
    269 }
    270 
/* Copy the compressed data from name, zeroing the last block bit of the last
   block if clr is true, and adding empty blocks as needed to get to a byte
   boundary.  If clr is false, then the last block becomes the last block of
   the output, and the gzip trailer is written.  crc and tot maintain the
   crc and length (modulo 2^32) of the output for the trailer.  The resulting
   gzip file is written to out.  gzinit() must be called before the first call
   of gzcopy() to write the gzip header and to initialize crc and tot.

   The input is run through inflate() only to locate the deflate block
   boundaries and to count the uncompressed length; the decompressed bytes
   themselves are discarded.  The compressed bytes are written to out straight
   from the input buffer, after the last-block bits have been patched in
   place there.  Any failure (open, memory, corrupt data, premature end of
   file) terminates the program through bail(). */
local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
                  FILE *out)
{
    int ret;                /* return value from zlib functions */
    int pos;                /* where the "last block" bit is in byte */
    int last;               /* true if processing the last block */
    bin *in;                /* buffered input file */
    unsigned char *start;   /* start of compressed data in buffer */
    unsigned char *junk;    /* buffer for uncompressed data -- discarded */
    z_off_t len;            /* length of uncompressed data (support > 4 GB) */
    z_stream strm;          /* zlib inflate stream */

    /* open gzip file and skip header */
    in = bopen(name);
    if (in == NULL)
        bail("could not open ", name);
    gzhead(in);

    /* allocate buffer for uncompressed data and initialize raw inflate
       stream (both results are checked together below) */
    junk = malloc(CHUNK);
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.avail_in = 0;
    strm.next_in = Z_NULL;
    ret = inflateInit2(&strm, -15);
    if (junk == NULL || ret != Z_OK)
        bail("out of memory", "");

    /* inflate and copy compressed data, clear last-block bit if requested */
    len = 0;
    zpull(&strm, in);
    start = strm.next_in;
    /* the last-block bit of the first block is the lowest bit of the first
       byte of compressed data */
    last = start[0] & 1;
    if (last && clr)
        start[0] &= ~1;
    /* avail_out == 0 keeps the refill test at the top of the loop from firing
       before inflate() has been called for the first time */
    strm.avail_out = 0;
    for (;;) {
        /* if input used and output done, write used input and get more */
        if (strm.avail_in == 0 && strm.avail_out != 0) {
            fwrite(start, 1, strm.next_in - start, out);
            start = in->buf;
            in->left = 0;       /* buffer fully consumed: force a reload */
            zpull(&strm, in);
        }

        /* decompress -- return early when end-of-block reached */
        strm.avail_out = CHUNK;
        strm.next_out = junk;
        ret = inflate(&strm, Z_BLOCK);
        switch (ret) {
        case Z_MEM_ERROR:
            bail("out of memory", "");
            /* bail() does not return, so the next case is never entered
               from here */
        case Z_DATA_ERROR:
            bail("invalid compressed data in ", in->name);
        }

        /* update length of uncompressed data */
        len += CHUNK - strm.avail_out;

        /* check for block boundary (only get this when block copied out) */
        if (strm.data_type & 128) {
            /* if that was the last block, then done */
            if (last)
                break;

            /* number of unused bits in last byte */
            pos = strm.data_type & 7;

            /* find the next last-block bit */
            if (pos != 0) {
                /* next last-block bit is in last used byte; turn the count
                   of unused bits into a mask for the lowest unused bit */
                pos = 0x100 >> pos;
                last = strm.next_in[-1] & pos;
                if (last && clr)
                    strm.next_in[-1] &= ~pos;
            }
            else {
                /* next last-block bit is in next unused byte */
                if (strm.avail_in == 0) {
                    /* don't have that byte yet -- get it */
                    fwrite(start, 1, strm.next_in - start, out);
                    start = in->buf;
                    in->left = 0;
                    zpull(&strm, in);
                }
                last = strm.next_in[0] & 1;
                if (last && clr)
                    strm.next_in[0] &= ~1;
            }
        }
    }

    /* update buffer with unused input, so that the trailer can be read from
       in with bget4() below */
    in->left = strm.avail_in;
    in->next = strm.next_in;

    /* copy used input, write empty blocks to get to byte boundary; the last
       used byte is held back because its unused bits may have to be
       replaced */
    pos = strm.data_type & 7;
    fwrite(start, 1, in->next - start - 1, out);
    last = in->next[-1];    /* from here on, last holds that final byte */
    if (pos == 0 || !clr)
        /* already at byte boundary, or last file: write last byte */
        putc(last, out);
    else {
        /* append empty blocks to last byte */
        last &= ((0x100 >> pos) - 1);       /* assure unused bits are zero */
        if (pos & 1) {
            /* odd -- append an empty stored block */
            putc(last, out);
            if (pos == 1)
                putc(0, out);               /* two more bits in block header */
            /* stored block length of zero and its one's complement */
            fwrite("\0\0\xff\xff", 1, 4, out);
        }
        else {
            /* even -- append 1, 2, or 3 empty fixed blocks; each case
               deliberately falls through to the next one */
            switch (pos) {
            case 6:
                putc(last | 8, out);
                last = 0;
                /* fall through */
            case 4:
                putc(last | 0x20, out);
                last = 0;
                /* fall through */
            case 2:
                putc(last | 0x80, out);
                putc(0, out);
            }
        }
    }

    /* update crc and tot: the next four input bytes are taken as the crc of
       this file's uncompressed data and combined with the running crc using
       the uncompressed length counted above */
    *crc = crc32_combine(*crc, bget4(in), len);
    *tot += (unsigned long)len;

    /* clean up */
    inflateEnd(&strm);
    free(junk);
    bclose(in);

    /* write trailer if this is the last gzip file */
    if (!clr) {
        put4(*crc, out);
        put4(*tot, out);
    }
}
    424 
    425 /* join the gzip files on the command line, write result to stdout */
    426 int main(int argc, char **argv)
    427 {
    428     unsigned long crc, tot;     /* running crc and total uncompressed length */
    429 
    430     /* skip command name */
    431     argc--;
    432     argv++;
    433 
    434     /* show usage if no arguments */
    435     if (argc == 0) {
    436         fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
    437               stderr);
    438         return 0;
    439     }
    440 
    441     /* join gzip files on command line and write to stdout */
    442     gzinit(&crc, &tot, stdout);
    443     while (argc--)
    444         gzcopy(*argv++, argc, &crc, &tot, stdout);
    445 
    446     /* done */
    447     return 0;
    448 }
    449