1 /* deflate.c - deflate/inflate code for gzip and friends 2 * 3 * Copyright 2014 Rob Landley <rob (at) landley.net> 4 * 5 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip) 6 * LSB 4.1 has gzip, gunzip, and zcat 7 * 8 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout 9 */ 10 11 #include "toys.h" 12 13 struct deflate { 14 // Huffman codes: base offset and extra bits tables (length and distance) 15 char lenbits[29], distbits[30]; 16 unsigned short lenbase[29], distbase[30]; 17 void *fixdisthuff, *fixlithuff; 18 19 // CRC 20 void (*crcfunc)(struct deflate *dd, char *data, int len); 21 unsigned crctable[256], crc; 22 23 24 // Tables only used for deflation 25 unsigned short *hashhead, *hashchain; 26 27 // Compressed data buffer (extra space malloced at end) 28 unsigned pos, len; 29 int infd, outfd; 30 char data[]; 31 }; 32 33 // little endian bit buffer 34 struct bitbuf { 35 int fd, bitpos, len, max; 36 char buf[]; 37 }; 38 39 // malloc a struct bitbuf 40 struct bitbuf *bitbuf_init(int fd, int size) 41 { 42 struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size); 43 44 bb->max = size; 45 bb->fd = fd; 46 47 return bb; 48 } 49 50 // Advance bitpos without the overhead of recording bits 51 // Loads more data when input buffer empty 52 void bitbuf_skip(struct bitbuf *bb, int bits) 53 { 54 int pos = bb->bitpos + bits, len = bb->len << 3; 55 56 while (pos >= len) { 57 pos -= len; 58 len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3; 59 if (bb->len < 1) perror_exit("inflate EOF"); 60 } 61 bb->bitpos = pos; 62 } 63 64 // Optimized single bit inlined version 65 static inline int bitbuf_bit(struct bitbuf *bb) 66 { 67 int bufpos = bb->bitpos>>3; 68 69 if (bufpos == bb->len) { 70 bitbuf_skip(bb, 0); 71 bufpos = 0; 72 } 73 74 return (bb->buf[bufpos]>>(bb->bitpos++&7))&1; 75 } 76 77 // Fetch the next X bits from the bitbuf, little endian 78 unsigned bitbuf_get(struct bitbuf *bb, int bits) 79 { 80 int result = 0, offset = 0; 81 82 while (bits) { 83 int click = bb->bitpos >> 3, blow, blen; 84 85 // Load more data if buffer empty 86 if (click == bb->len) bitbuf_skip(bb, click = 0); 87 88 // grab bits from next byte 89 blow = bb->bitpos & 7; 90 blen = 8-blow; 91 if (blen > bits) blen = bits; 92 result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset; 93 offset += blen; 94 bits -= blen; 95 bb->bitpos += blen; 96 } 97 98 return result; 99 } 100 101 void bitbuf_flush(struct bitbuf *bb) 102 { 103 if (!bb->bitpos) return; 104 105 xwrite(bb->fd, bb->buf, (bb->bitpos+7)>>3); 106 memset(bb->buf, 0, bb->max); 107 bb->bitpos = 0; 108 } 109 110 void bitbuf_put(struct bitbuf *bb, int data, int len) 111 { 112 while (len) { 113 int click = bb->bitpos >> 3, blow, blen; 114 115 // Flush buffer if necessary 116 if (click == bb->max) { 117 bitbuf_flush(bb); 118 click = 0; 119 } 120 blow = bb->bitpos & 7; 121 blen = 8-blow; 122 if (blen > len) blen = len; 123 bb->buf[click] |= data << blow; 124 bb->bitpos += blen; 125 data >>= blen; 126 len -= blen; 127 } 128 } 129 130 static void output_byte(struct deflate *dd, char sym) 131 { 132 int pos = dd->pos++ & 32767; 133 134 dd->data[pos] = sym; 135 136 if (pos == 32767) { 137 xwrite(dd->outfd, dd->data, 32768); 138 if (dd->crcfunc) dd->crcfunc(dd, dd->data, 32768); 139 } 140 } 141 142 // Huffman coding uses bits to traverse a binary tree to a leaf node, 143 // By placing frequently occurring symbols at shorter paths, frequently 144 // used symbols may be represented in fewer bits than uncommon symbols. 145 // (length[0] isn't used but code's clearer if it's there.) 146 147 struct huff { 148 unsigned short length[16]; // How many symbols have this bit length? 149 unsigned short symbol[288]; // sorted by bit length, then ascending order 150 }; 151 152 // Create simple huffman tree from array of bit lengths. 153 154 // The symbols in the huffman trees are sorted (first by bit length 155 // of the code to reach them, then by symbol number). This means that given 156 // the bit length of each symbol, we can construct a unique tree. 157 static void len2huff(struct huff *huff, char bitlen[], int len) 158 { 159 int offset[16]; 160 int i; 161 162 // Count number of codes at each bit length 163 memset(huff, 0, sizeof(struct huff)); 164 for (i = 0; i<len; i++) huff->length[bitlen[i]]++; 165 166 // Sort symbols by bit length, then symbol. Get list of starting positions 167 // for each group, then write each symbol to next position within its group. 168 *huff->length = *offset = 0; 169 for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1]; 170 for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i; 171 } 172 173 // Fetch and decode next huffman coded symbol from bitbuf. 174 // This takes advantage of the sorting to navigate the tree as an array: 175 // each time we fetch a bit we have all the codes at that bit level in 176 // order with no gaps. 177 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff) 178 { 179 unsigned short *length = huff->length; 180 int start = 0, offset = 0; 181 182 // Traverse through the bit lengths until our code is in this range 183 for (;;) { 184 offset = (offset << 1) | bitbuf_bit(bb); 185 start += *++length; 186 if ((offset -= *length) < 0) break; 187 if ((length - huff->length) & 16) error_exit("bad symbol"); 188 } 189 190 return huff->symbol[start + offset]; 191 } 192 193 // Decompress deflated data from bitbuf to dd->outfd. 194 static void inflate(struct deflate *dd, struct bitbuf *bb) 195 { 196 dd->crc = ~0; 197 // repeat until spanked 198 for (;;) { 199 int final, type; 200 201 final = bitbuf_get(bb, 1); 202 type = bitbuf_get(bb, 2); 203 204 if (type == 3) error_exit("bad type"); 205 206 // Uncompressed block? 207 if (!type) { 208 int len, nlen; 209 210 // Align to byte, read length 211 bitbuf_skip(bb, (8-bb->bitpos)&7); 212 len = bitbuf_get(bb, 16); 213 nlen = bitbuf_get(bb, 16); 214 if (len != (0xffff & ~nlen)) error_exit("bad len"); 215 216 // Dump literal output data 217 while (len) { 218 int pos = bb->bitpos >> 3, bblen = bb->len - pos; 219 char *p = bb->buf+pos; 220 221 // dump bytes until done or end of current bitbuf contents 222 if (bblen > len) bblen = len; 223 pos = bblen; 224 while (pos--) output_byte(dd, *(p++)); 225 bitbuf_skip(bb, bblen << 3); 226 len -= bblen; 227 } 228 229 // Compressed block 230 } else { 231 struct huff *disthuff, *lithuff; 232 233 // Dynamic huffman codes? 234 if (type == 2) { 235 struct huff *h2 = ((struct huff *)toybuf)+1; 236 int i, litlen, distlen, hufflen; 237 char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b" 238 "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits; 239 240 // The huffman trees are stored as a series of bit lengths 241 litlen = bitbuf_get(bb, 5)+257; // max 288 242 distlen = bitbuf_get(bb, 5)+1; // max 32 243 hufflen = bitbuf_get(bb, 4)+4; // max 19 244 245 // The literal and distance codes are themselves compressed, in 246 // a complicated way: an array of bit lengths (hufflen many 247 // entries, each 3 bits) is used to fill out an array of 19 entries 248 // in a magic order, leaving the rest 0. Then make a tree out of it: 249 memset(bits = toybuf+1, 0, 19); 250 for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3); 251 len2huff(h2, bits, 19); 252 253 // Use that tree to read in the literal and distance bit lengths 254 for (i = 0; i < litlen + distlen;) { 255 int sym = huff_and_puff(bb, h2); 256 257 // 0-15 are literals, 16 = repeat previous code 3-6 times, 258 // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit) 259 if (sym < 16) bits[i++] = sym; 260 else { 261 int len = sym & 2; 262 263 len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2); 264 memset(bits+i, bits[i-1] * !(sym&3), len); 265 i += len; 266 } 267 } 268 if (i > litlen+distlen) error_exit("bad tree"); 269 270 len2huff(lithuff = h2, bits, litlen); 271 len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen); 272 273 // Static huffman codes 274 } else { 275 lithuff = dd->fixlithuff; 276 disthuff = dd->fixdisthuff; 277 } 278 279 // Use huffman tables to decode block of compressed symbols 280 for (;;) { 281 int sym = huff_and_puff(bb, lithuff); 282 283 // Literal? 284 if (sym < 256) output_byte(dd, sym); 285 286 // Copy range? 287 else if (sym > 256) { 288 int len, dist; 289 290 sym -= 257; 291 len = dd->lenbase[sym] + bitbuf_get(bb, dd->lenbits[sym]); 292 sym = huff_and_puff(bb, disthuff); 293 dist = dd->distbase[sym] + bitbuf_get(bb, dd->distbits[sym]); 294 sym = dd->pos & 32767; 295 296 while (len--) output_byte(dd, dd->data[(dd->pos-dist) & 32767]); 297 298 // End of block 299 } else break; 300 } 301 } 302 303 // Was that the last block? 304 if (final) break; 305 } 306 307 if (dd->pos & 32767) { 308 xwrite(dd->outfd, dd->data, dd->pos&32767); 309 if (dd->crcfunc) dd->crcfunc(dd, dd->data, dd->pos&32767); 310 } 311 } 312 313 // Deflate from dd->infd to bitbuf 314 // For deflate, dd->len = input read, dd->pos = input consumed 315 static void deflate(struct deflate *dd, struct bitbuf *bb) 316 { 317 char *data = dd->data; 318 int len, final = 0; 319 320 dd->crc = ~0; 321 322 while (!final) { 323 // Read next half-window of data if we haven't hit EOF yet. 324 len = readall(dd->infd, data+(dd->len&32768), 32768); 325 if (len < 0) perror_exit("read"); // todo: add filename 326 if (len != 32768) final++; 327 if (dd->crcfunc) dd->crcfunc(dd, data+(dd->len&32768), len); 328 // dd->len += len; crcfunc advances len TODO 329 330 // store block as literal 331 bitbuf_put(bb, final, 1); 332 bitbuf_put(bb, 0, 1); 333 334 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 335 bitbuf_put(bb, len, 16); 336 bitbuf_put(bb, 0xffff & ~len, 16); 337 338 // repeat until spanked 339 while (dd->pos != dd->len) { 340 unsigned pos = dd->pos&65535; 341 342 bitbuf_put(bb, data[pos], 8); 343 344 // need to refill buffer? 345 if (!(32767 & ++dd->pos) && !final) break; 346 } 347 } 348 bitbuf_flush(bb); 349 } 350 351 // Allocate memory for deflate/inflate. 352 static struct deflate *init_deflate(int compress) 353 { 354 int i, n = 1; 355 struct deflate *dd = xmalloc(sizeof(struct deflate)+32768*(compress ? 4 : 1)); 356 357 // TODO sizeof and order of these? 358 // decompress needs 32k history, compress adds 64k hashhead and 32k hashchain 359 if (compress) { 360 dd->hashhead = (unsigned short *)(dd->data+65536); 361 dd->hashchain = (unsigned short *)(dd->data+65536+32768); 362 } 363 364 // Calculate lenbits, lenbase, distbits, distbase 365 *dd->lenbase = 3; 366 for (i = 0; i<sizeof(dd->lenbits)-1; i++) { 367 if (i>4) { 368 if (!(i&3)) { 369 dd->lenbits[i]++; 370 n <<= 1; 371 } 372 if (i == 27) n--; 373 else dd->lenbits[i+1] = dd->lenbits[i]; 374 } 375 dd->lenbase[i+1] = n + dd->lenbase[i]; 376 } 377 n = 0; 378 for (i = 0; i<sizeof(dd->distbits); i++) { 379 dd->distbase[i] = 1<<n; 380 if (i) dd->distbase[i] += dd->distbase[i-1]; 381 if (i>3 && !(i&1)) n++; 382 dd->distbits[i] = n; 383 } 384 385 // TODO layout and lifetime of this? 386 // Init fixed huffman tables 387 for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279); 388 len2huff(dd->fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288); 389 memset(toybuf, 5, 30); 390 len2huff(dd->fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30); 391 392 return dd; 393 } 394 395 // Return true/false whether we consumed a gzip header. 396 static int is_gzip(struct bitbuf *bb) 397 { 398 int flags; 399 400 // Confirm signature 401 if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31) 402 return 0; 403 bitbuf_skip(bb, 6*8); 404 405 // Skip extra, name, comment, header CRC fields 406 if (flags & 4) bitbuf_skip(bb, 16); 407 if (flags & 8) while (bitbuf_get(bb, 8)); 408 if (flags & 16) while (bitbuf_get(bb, 8)); 409 if (flags & 2) bitbuf_skip(bb, 16); 410 411 return 1; 412 } 413 414 void gzip_crc(struct deflate *dd, char *data, int len) 415 { 416 int i; 417 unsigned crc, *crc_table = dd->crctable; 418 419 crc = dd->crc; 420 for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8); 421 dd->crc = crc; 422 dd->len += len; 423 } 424 425 long long gzip_fd(int infd, int outfd) 426 { 427 struct bitbuf *bb = bitbuf_init(outfd, 4096); 428 struct deflate *dd = init_deflate(1); 429 long long rc; 430 431 // Header from RFC 1952 section 2.2: 432 // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none), 433 // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression), 434 // Operating System (FF=unknown) 435 436 dd->infd = infd; 437 xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10); 438 439 // Little endian crc table 440 crc_init(dd->crctable, 1); 441 dd->crcfunc = gzip_crc; 442 443 deflate(dd, bb); 444 445 // tail: crc32, len32 446 447 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 448 bitbuf_put(bb, ~dd->crc, 32); 449 bitbuf_put(bb, dd->len, 32); 450 rc = dd->len; 451 452 bitbuf_flush(bb); 453 free(bb); 454 free(dd); 455 456 return rc; 457 } 458 459 long long gunzip_fd(int infd, int outfd) 460 { 461 struct bitbuf *bb = bitbuf_init(infd, 4096); 462 struct deflate *dd = init_deflate(0); 463 long long rc; 464 465 if (!is_gzip(bb)) error_exit("not gzip"); 466 dd->outfd = outfd; 467 468 // Little endian crc table 469 crc_init(dd->crctable, 1); 470 dd->crcfunc = gzip_crc; 471 472 inflate(dd, bb); 473 474 // tail: crc32, len32 475 476 bitbuf_skip(bb, (8-bb->bitpos)&7); 477 if (~dd->crc != bitbuf_get(bb, 32) || dd->len != bitbuf_get(bb, 32)) 478 error_exit("bad crc"); 479 480 rc = dd->len; 481 free(bb); 482 free(dd); 483 484 return rc; 485 } 486