1 /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw 2 * 3 * Copyright 2014 Rob Landley <rob (at) landley.net> 4 * 5 * The inflate/deflate code lives here, so the various things that use it 6 * either live here or call these commands to pipe data through them. 7 * 8 * Divergence from posix: replace obsolete/patented "compress" with mutiplexer. 9 * (gzip already replaces "uncompress".) 10 * 11 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip) 12 * LSB 4.1 has gzip, gunzip, and zcat 13 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout 14 15 // Accept many different kinds of command line argument. 16 // Leave Lrg at end so flag values line up. 17 18 USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN)) 19 USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN)) 20 USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN)) 21 USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN)) 22 23 //zip unzip gzip gunzip zcat 24 25 config COMPRESS 26 bool "compress" 27 default n 28 help 29 usage: compress [-zgLR19] [FILE] 30 31 Compress or decompress file (or stdin) using "deflate" algorithm. 32 33 -1 min compression 34 -9 max compression (default) 35 -g gzip (default) 36 -L zlib 37 -R raw 38 -z zip 39 40 config GZIP 41 bool "gzip" 42 default y 43 depends on COMPRESS 44 help 45 usage: gzip [-19cfqStvzgLR] [FILE...] 46 47 Compess (deflate) file(s). With no files, compress stdin to stdout. 48 49 On successful decompression, compressed files are replaced with the 50 uncompressed version. The input file is removed and replaced with 51 a new file without the .gz extension (with same ownership/permissions). 52 53 -1 Minimal compression (fastest) 54 -9 Max compression (default) 55 -c cat to stdout (act as zcat) 56 -f force (if output file exists, input is tty, unrecognized extension) 57 -q quiet (no warnings) 58 -S specify exension (default .*) 59 -t test compressed file(s) 60 -v verbose (like -l, but compress files) 61 62 Compression type: 63 -g gzip (default) -L zlib -R raw -z zip 64 65 config GZIP_D 66 bool 67 default y 68 depends on GZIP && DECOMPRESS 69 help 70 usage: gzip [-d] 71 72 -d decompress (act as gunzip) 73 74 config DECOMPRESS 75 bool "decompress" 76 default n 77 help 78 usage: compress [-zglrcd9] [FILE] 79 80 Compress or decompress file (or stdin) using "deflate" algorithm. 81 82 -c compress with -g gzip (default) -l zlib -r raw -z zip 83 -d decompress (autodetects type) 84 85 86 config ZCAT 87 bool "zcat" 88 default y 89 depends on DECOMPRESS 90 help 91 usage: zcat [FILE...] 92 93 Decompress deflated file(s) to stdout 94 95 config GUNZIP 96 bool "gunzip" 97 default y 98 depends on DECOMPRESS 99 help 100 usage: gunzip [-cflqStv] [FILE...] 101 102 Decompess (deflate) file(s). With no files, compress stdin to stdout. 103 104 On successful decompression, compressed files are replaced with the 105 uncompressed version. The input file is removed and replaced with 106 a new file without the .gz extension (with same ownership/permissions). 107 108 -c cat to stdout (act as zcat) 109 -f force (output file exists, input is tty, unrecognized extension) 110 -l list compressed/uncompressed/ratio/name for each input file. 111 -q quiet (no warnings) 112 -S specify exension (default .*) 113 -t test compressed file(s) 114 -v verbose (like -l, but decompress files) 115 */ 116 117 #define FOR_compress 118 #include "toys.h" 119 120 GLOBALS( 121 // Huffman codes: base offset and extra bits tables (length and distance) 122 char lenbits[29], distbits[30]; 123 unsigned short lenbase[29], distbase[30]; 124 void *fixdisthuff, *fixlithuff; 125 126 // CRC 127 void (*crcfunc)(char *data, int len); 128 unsigned crc; 129 130 // Compressed data buffer 131 char *data; 132 unsigned pos, len; 133 int infd, outfd; 134 135 // Tables only used for deflation 136 unsigned short *hashhead, *hashchain; 137 ) 138 139 // little endian bit buffer 140 struct bitbuf { 141 int fd, bitpos, len, max; 142 char buf[]; 143 }; 144 145 // malloc a struct bitbuf 146 struct bitbuf *bitbuf_init(int fd, int size) 147 { 148 struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size); 149 150 bb->max = size; 151 bb->fd = fd; 152 153 return bb; 154 } 155 156 // Advance bitpos without the overhead of recording bits 157 void bitbuf_skip(struct bitbuf *bb, int bits) 158 { 159 int pos = bb->bitpos + bits, len = bb->len << 3; 160 161 while (pos >= len) { 162 pos -= len; 163 len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3; 164 if (bb->len < 1) perror_exit("inflate EOF"); 165 } 166 bb->bitpos = pos; 167 } 168 169 // Optimized single bit inlined version 170 static inline int bitbuf_bit(struct bitbuf *bb) 171 { 172 int bufpos = bb->bitpos>>3; 173 174 if (bufpos == bb->len) { 175 bitbuf_skip(bb, 0); 176 bufpos = 0; 177 } 178 179 return (bb->buf[bufpos]>>(bb->bitpos++&7))&1; 180 } 181 182 // Fetch the next X bits from the bitbuf, little endian 183 unsigned bitbuf_get(struct bitbuf *bb, int bits) 184 { 185 int result = 0, offset = 0; 186 187 while (bits) { 188 int click = bb->bitpos >> 3, blow, blen; 189 190 // Load more data if buffer empty 191 if (click == bb->len) bitbuf_skip(bb, click = 0); 192 193 // grab bits from next byte 194 blow = bb->bitpos & 7; 195 blen = 8-blow; 196 if (blen > bits) blen = bits; 197 result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset; 198 offset += blen; 199 bits -= blen; 200 bb->bitpos += blen; 201 } 202 203 return result; 204 } 205 206 void bitbuf_flush(struct bitbuf *bb) 207 { 208 if (!bb->bitpos) return; 209 210 xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8); 211 memset(bb->buf, 0, bb->max); 212 bb->bitpos = 0; 213 } 214 215 void bitbuf_put(struct bitbuf *bb, int data, int len) 216 { 217 while (len) { 218 int click = bb->bitpos >> 3, blow, blen; 219 220 // Flush buffer if necessary 221 if (click == bb->max) { 222 bitbuf_flush(bb); 223 click = 0; 224 } 225 blow = bb->bitpos & 7; 226 blen = 8-blow; 227 if (blen > len) blen = len; 228 bb->buf[click] |= data << blow; 229 bb->bitpos += blen; 230 data >>= blen; 231 len -= blen; 232 } 233 } 234 235 static void output_byte(char sym) 236 { 237 int pos = TT.pos++ & 32767; 238 239 TT.data[pos] = sym; 240 241 if (!pos) { 242 xwrite(TT.outfd, TT.data, 32768); 243 if (TT.crcfunc) TT.crcfunc(TT.data, 32768); 244 } 245 } 246 247 // Huffman coding uses bits to traverse a binary tree to a leaf node, 248 // By placing frequently occurring symbols at shorter paths, frequently 249 // used symbols may be represented in fewer bits than uncommon symbols. 250 251 struct huff { 252 unsigned short length[16]; 253 unsigned short symbol[288]; 254 }; 255 256 // Create simple huffman tree from array of bit lengths. 257 258 // The symbols in the huffman trees are sorted (first by bit length 259 // of the code to reach them, then by symbol number). This means that given 260 // the bit length of each symbol, we can construct a unique tree. 261 static void len2huff(struct huff *huff, char bitlen[], int len) 262 { 263 int offset[16]; 264 int i; 265 266 // Count number of codes at each bit length 267 memset(huff, 0, sizeof(struct huff)); 268 for (i = 0; i<len; i++) huff->length[bitlen[i]]++; 269 270 // Sort symbols by bit length. (They'll remain sorted by symbol within that.) 271 *huff->length = *offset = 0; 272 for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1]; 273 274 for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i; 275 } 276 277 // Fetch and decode next huffman coded symbol from bitbuf. 278 // This takes advantage of the sorting to navigate the tree as an array: 279 // each time we fetch a bit we have all the codes at that bit level in 280 // order with no gaps. 281 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff) 282 { 283 unsigned short *length = huff->length; 284 int start = 0, offset = 0; 285 286 // Traverse through the bit lengths until our code is in this range 287 for (;;) { 288 offset = (offset << 1) | bitbuf_bit(bb); 289 start += *++length; 290 if ((offset -= *length) < 0) break; 291 if ((length - huff->length) & 16) error_exit("bad symbol"); 292 } 293 294 return huff->symbol[start + offset]; 295 } 296 297 // Decompress deflated data from bitbuf to TT.outfd. 298 static void inflate(struct bitbuf *bb) 299 { 300 TT.crc = ~0; 301 // repeat until spanked 302 for (;;) { 303 int final, type; 304 305 final = bitbuf_get(bb, 1); 306 type = bitbuf_get(bb, 2); 307 308 if (type == 3) error_exit("bad type"); 309 310 // Uncompressed block? 311 if (!type) { 312 int len, nlen; 313 314 // Align to byte, read length 315 bitbuf_skip(bb, (8-bb->bitpos)&7); 316 len = bitbuf_get(bb, 16); 317 nlen = bitbuf_get(bb, 16); 318 if (len != (0xffff & ~nlen)) error_exit("bad len"); 319 320 // Dump literal output data 321 while (len) { 322 int pos = bb->bitpos >> 3, bblen = bb->len - pos; 323 char *p = bb->buf+pos; 324 325 // dump bytes until done or end of current bitbuf contents 326 if (bblen > len) bblen = len; 327 pos = bblen; 328 while (pos--) output_byte(*(p++)); 329 bitbuf_skip(bb, bblen << 3); 330 len -= bblen; 331 } 332 333 // Compressed block 334 } else { 335 struct huff *disthuff, *lithuff; 336 337 // Dynamic huffman codes? 338 if (type == 2) { 339 struct huff *h2 = ((struct huff *)toybuf)+1; 340 int i, litlen, distlen, hufflen; 341 char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b" 342 "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits; 343 344 // The huffman trees are stored as a series of bit lengths 345 litlen = bitbuf_get(bb, 5)+257; // max 288 346 distlen = bitbuf_get(bb, 5)+1; // max 32 347 hufflen = bitbuf_get(bb, 4)+4; // max 19 348 349 // The literal and distance codes are themselves compressed, in 350 // a complicated way: an array of bit lengths (hufflen many 351 // entries, each 3 bits) is used to fill out an array of 19 entries 352 // in a magic order, leaving the rest 0. Then make a tree out of it: 353 memset(bits = toybuf+1, 0, 19); 354 for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3); 355 len2huff(h2, bits, 19); 356 357 // Use that tree to read in the literal and distance bit lengths 358 for (i = 0; i < litlen + distlen;) { 359 int sym = huff_and_puff(bb, h2); 360 361 // 0-15 are literals, 16 = repeat previous code 3-6 times, 362 // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit) 363 if (sym < 16) bits[i++] = sym; 364 else { 365 int len = sym & 2; 366 367 len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2); 368 memset(bits+i, bits[i-1] * !(sym&3), len); 369 i += len; 370 } 371 } 372 if (i > litlen+distlen) error_exit("bad tree"); 373 374 len2huff(lithuff = h2, bits, litlen); 375 len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen); 376 377 // Static huffman codes 378 } else { 379 lithuff = TT.fixlithuff; 380 disthuff = TT.fixdisthuff; 381 } 382 383 // Use huffman tables to decode block of compressed symbols 384 for (;;) { 385 int sym = huff_and_puff(bb, lithuff); 386 387 // Literal? 388 if (sym < 256) output_byte(sym); 389 390 // Copy range? 391 else if (sym > 256) { 392 int len, dist; 393 394 sym -= 257; 395 len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]); 396 sym = huff_and_puff(bb, disthuff); 397 dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]); 398 sym = TT.pos & 32767; 399 400 while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]); 401 402 // End of block 403 } else break; 404 } 405 } 406 407 // Was that the last block? 408 if (final) break; 409 } 410 411 if (TT.pos & 32767) { 412 xwrite(TT.outfd, TT.data, TT.pos & 32767); 413 if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767); 414 } 415 } 416 417 // Deflate from TT.infd to bitbuf 418 // For deflate, TT.len = input read, TT.pos = input consumed 419 static void deflate(struct bitbuf *bb) 420 { 421 char *data = TT.data; 422 int len, final = 0; 423 424 TT.crc = ~0; 425 426 while (!final) { 427 // Read next half-window of data if we haven't hit EOF yet. 428 len = readall(TT.infd, data+(TT.len&32768), 32768); 429 if (len < 0) perror_exit("read"); // todo: add filename 430 if (len != 32768) final++; 431 if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len); 432 // TT.len += len; crcfunc advances len 433 434 // store block as literal 435 bitbuf_put(bb, final, 1); 436 bitbuf_put(bb, 0, 1); 437 438 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 439 bitbuf_put(bb, len, 16); 440 bitbuf_put(bb, 0xffff & ~len, 16); 441 442 // repeat until spanked 443 while (TT.pos != TT.len) { 444 unsigned pos = TT.pos & 65535; 445 446 bitbuf_put(bb, data[pos], 8); 447 448 // need to refill buffer? 449 if (!(32767 & ++TT.pos) && !final) break; 450 } 451 } 452 bitbuf_flush(bb); 453 } 454 455 // Allocate memory for deflate/inflate. 456 static void init_deflate(int compress) 457 { 458 int i, n = 1; 459 460 // compress needs 64k data and 32k each for hashhead and hashchain. 461 // decompress just needs 32k data. 462 TT.data = xmalloc(32768*(compress ? 4 : 1)); 463 if (compress) { 464 TT.hashhead = (unsigned short *)(TT.data + 65536); 465 TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768); 466 } 467 468 // Calculate lenbits, lenbase, distbits, distbase 469 *TT.lenbase = 3; 470 for (i = 0; i<sizeof(TT.lenbits)-1; i++) { 471 if (i>4) { 472 if (!(i&3)) { 473 TT.lenbits[i]++; 474 n <<= 1; 475 } 476 if (i == 27) n--; 477 else TT.lenbits[i+1] = TT.lenbits[i]; 478 } 479 TT.lenbase[i+1] = n + TT.lenbase[i]; 480 } 481 n = 0; 482 for (i = 0; i<sizeof(TT.distbits); i++) { 483 TT.distbase[i] = 1<<n; 484 if (i) TT.distbase[i] += TT.distbase[i-1]; 485 if (i>3 && !(i&1)) n++; 486 TT.distbits[i] = n; 487 } 488 489 // Init fixed huffman tables 490 for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279); 491 len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288); 492 memset(toybuf, 5, 30); 493 len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30); 494 } 495 496 // Return true/false whether we consumed a gzip header. 497 static int is_gzip(struct bitbuf *bb) 498 { 499 int flags; 500 501 // Confirm signature 502 if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31) 503 return 0; 504 bitbuf_skip(bb, 6*8); 505 506 // Skip extra, name, comment, header CRC fields 507 if (flags & 4) bitbuf_skip(bb, 16); 508 if (flags & 8) while (bitbuf_get(bb, 8)); 509 if (flags & 16) while (bitbuf_get(bb, 8)); 510 if (flags & 2) bitbuf_skip(bb, 16); 511 512 return 1; 513 } 514 515 void gzip_crc(char *data, int len) 516 { 517 int i; 518 unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024); 519 520 crc = TT.crc; 521 for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8); 522 TT.crc = crc; 523 TT.len += len; 524 } 525 526 static void do_gzip(int fd, char *name) 527 { 528 struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf)); 529 530 // Header from RFC 1952 section 2.2: 531 // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none), 532 // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression), 533 // Operating System (FF=unknown) 534 535 TT.infd = fd; 536 xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10); 537 538 // Use last 1k of toybuf for little endian crc table 539 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1); 540 TT.crcfunc = gzip_crc; 541 542 deflate(bb); 543 544 // tail: crc32, len32 545 546 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 547 bitbuf_put(bb, ~TT.crc, 32); 548 bitbuf_put(bb, TT.len, 32); 549 550 bitbuf_flush(bb); 551 free(bb); 552 } 553 554 static void do_zcat(int fd, char *name) 555 { 556 struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf)); 557 558 if (!is_gzip(bb)) error_exit("not gzip"); 559 TT.outfd = 1; 560 561 // Use last 1k of toybuf for little endian crc table 562 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1); 563 TT.crcfunc = gzip_crc; 564 565 inflate(bb); 566 567 // tail: crc32, len32 568 569 bitbuf_skip(bb, (8-bb->bitpos)&7); 570 if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32)) 571 error_exit("bad crc"); 572 free(bb); 573 } 574 575 // Parse many different kinds of command line argument: 576 577 void compress_main(void) 578 { 579 // todo: this 580 printf("hello world"); 581 } 582 583 //#define CLEANUP_compress 584 //#define FOR_zcat 585 //#include "generated/flags.h" 586 587 void zcat_main(void) 588 { 589 init_deflate(0); 590 591 loopfiles(toys.optargs, do_zcat); 592 } 593 594 void gunzip_main(void) 595 { 596 init_deflate(0); 597 598 loopfiles(toys.optargs, do_zcat); 599 } 600 601 void gzip_main(void) 602 { 603 init_deflate(1); 604 605 loopfiles(toys.optargs, do_gzip); 606 } 607