1 /* 2 * LZMAInputStream 3 * 4 * Authors: Lasse Collin <lasse.collin (at) tukaani.org> 5 * Igor Pavlov <http://7-zip.org/> 6 * 7 * This file has been put into the public domain. 8 * You can do whatever you want with this file. 9 */ 10 11 package org.tukaani.xz; 12 13 import java.io.InputStream; 14 import java.io.DataInputStream; 15 import java.io.IOException; 16 import org.tukaani.xz.lz.LZDecoder; 17 import org.tukaani.xz.rangecoder.RangeDecoderFromStream; 18 import org.tukaani.xz.lzma.LZMADecoder; 19 20 /** 21 * Decompresses legacy .lzma files and raw LZMA streams (no .lzma header). 22 * <p> 23 * <b>IMPORTANT:</b> In contrast to other classes in this package, this class 24 * reads data from its input stream one byte at a time. If the input stream 25 * is for example {@link java.io.FileInputStream}, wrapping it into 26 * {@link java.io.BufferedInputStream} tends to improve performance a lot. 27 * This is not automatically done by this class because there may be use 28 * cases where it is desired that this class won't read any bytes past 29 * the end of the LZMA stream. 30 * <p> 31 * Even when using <code>BufferedInputStream</code>, the performance tends 32 * to be worse (maybe 10-20 % slower) than with {@link LZMA2InputStream} 33 * or {@link XZInputStream} (when the .xz file contains LZMA2-compressed data). 34 * 35 * @since 1.4 36 */ 37 public class LZMAInputStream extends InputStream { 38 /** 39 * Largest dictionary size supported by this implementation. 40 * <p> 41 * LZMA allows dictionaries up to one byte less than 4 GiB. This 42 * implementation supports only 16 bytes less than 2 GiB. This 43 * limitation is due to Java using signed 32-bit integers for array 44 * indexing. The limitation shouldn't matter much in practice since so 45 * huge dictionaries are not normally used. 46 */ 47 public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15; 48 49 private InputStream in; 50 private LZDecoder lz; 51 private RangeDecoderFromStream rc; 52 private LZMADecoder lzma; 53 54 private boolean endReached = false; 55 56 private final byte[] tempBuf = new byte[1]; 57 58 /** 59 * Number of uncompressed bytes left to be decompressed, or -1 if 60 * the end marker is used. 61 */ 62 private long remainingSize; 63 64 private IOException exception = null; 65 66 /** 67 * Gets approximate decompressor memory requirements as kibibytes for 68 * the given dictionary size and LZMA properties byte (lc, lp, and pb). 69 * 70 * @param dictSize LZMA dictionary size as bytes, should be 71 * in the range [<code>0</code>, 72 * <code>DICT_SIZE_MAX</code>] 73 * 74 * @param propsByte LZMA properties byte that encodes the values 75 * of lc, lp, and pb 76 * 77 * @return approximate memory requirements as kibibytes (KiB) 78 * 79 * @throws UnsupportedOptionsException 80 * if <code>dictSize</code> is outside 81 * the range [<code>0</code>, 82 * <code>DICT_SIZE_MAX</code>] 83 * 84 * @throws CorruptedInputException 85 * if <code>propsByte</code> is invalid 86 */ 87 public static int getMemoryUsage(int dictSize, byte propsByte) 88 throws UnsupportedOptionsException, CorruptedInputException { 89 if (dictSize < 0 || dictSize > DICT_SIZE_MAX) 90 throw new UnsupportedOptionsException( 91 "LZMA dictionary is too big for this implementation"); 92 93 int props = propsByte & 0xFF; 94 if (props > (4 * 5 + 4) * 9 + 8) 95 throw new CorruptedInputException("Invalid LZMA properties byte"); 96 97 props %= 9 * 5; 98 int lp = props / 9; 99 int lc = props - lp * 9; 100 101 return getMemoryUsage(dictSize, lc, lp); 102 } 103 104 /** 105 * Gets approximate decompressor memory requirements as kibibytes for 106 * the given dictionary size, lc, and lp. Note that pb isn't needed. 107 * 108 * @param dictSize LZMA dictionary size as bytes, must be 109 * in the range [<code>0</code>, 110 * <code>DICT_SIZE_MAX</code>] 111 * 112 * @param lc number of literal context bits, must be 113 * in the range [0, 8] 114 * 115 * @param lp number of literal position bits, must be 116 * in the range [0, 4] 117 * 118 * @return approximate memory requirements as kibibytes (KiB) 119 */ 120 public static int getMemoryUsage(int dictSize, int lc, int lp) { 121 if (lc < 0 || lc > 8 || lp < 0 || lp > 4) 122 throw new IllegalArgumentException("Invalid lc or lp"); 123 124 // Probability variables have the type "short". There are 125 // 0x300 (768) probability variables in each literal subcoder. 126 // The number of literal subcoders is 2^(lc + lp). 127 // 128 // Roughly 10 KiB for the base state + LZ decoder's dictionary buffer 129 // + sizeof(short) * number probability variables per literal subcoder 130 // * number of literal subcoders 131 return 10 + getDictSize(dictSize) / 1024 132 + ((2 * 0x300) << (lc + lp)) / 1024; 133 } 134 135 private static int getDictSize(int dictSize) { 136 if (dictSize < 0 || dictSize > DICT_SIZE_MAX) 137 throw new IllegalArgumentException( 138 "LZMA dictionary is too big for this implementation"); 139 140 // For performance reasons, use a 4 KiB dictionary if something 141 // smaller was requested. It's a rare situation and the performance 142 // difference isn't huge, and it starts to matter mostly when the 143 // dictionary is just a few bytes. But we need to handle the special 144 // case of dictSize == 0 anyway, which is an allowed value but in 145 // practice means one-byte dictionary. 146 // 147 // Note that using a dictionary bigger than specified in the headers 148 // can hide errors if there is a reference to data beyond the original 149 // dictionary size but is still within 4 KiB. 150 if (dictSize < 4096) 151 dictSize = 4096; 152 153 // Round dictionary size upward to a multiple of 16. This way LZMA 154 // can use LZDecoder.getPos() for calculating LZMA's posMask. 155 return (dictSize + 15) & ~15; 156 } 157 158 /** 159 * Creates a new .lzma file format decompressor without 160 * a memory usage limit. 161 * 162 * @param in input stream from which .lzma data is read; 163 * it might be a good idea to wrap it in 164 * <code>BufferedInputStream</code>, see the 165 * note at the top of this page 166 * 167 * @throws CorruptedInputException 168 * file is corrupt or perhaps not in 169 * the .lzma format at all 170 * 171 * @throws UnsupportedOptionsException 172 * dictionary size or uncompressed size is too 173 * big for this implementation 174 * 175 * @throws EOFException 176 * file is truncated or perhaps not in 177 * the .lzma format at all 178 * 179 * @throws IOException may be thrown by <code>in</code> 180 */ 181 public LZMAInputStream(InputStream in) throws IOException { 182 this(in, -1); 183 } 184 185 /** 186 * Creates a new .lzma file format decompressor with an optional 187 * memory usage limit. 188 * 189 * @param in input stream from which .lzma data is read; 190 * it might be a good idea to wrap it in 191 * <code>BufferedInputStream</code>, see the 192 * note at the top of this page 193 * 194 * @param memoryLimit memory usage limit in kibibytes (KiB) 195 * or <code>-1</code> to impose no 196 * memory usage limit 197 * 198 * @throws CorruptedInputException 199 * file is corrupt or perhaps not in 200 * the .lzma format at all 201 * 202 * @throws UnsupportedOptionsException 203 * dictionary size or uncompressed size is too 204 * big for this implementation 205 * 206 * @throws MemoryLimitException 207 * memory usage limit was exceeded 208 * 209 * @throws EOFException 210 * file is truncated or perhaps not in 211 * the .lzma format at all 212 * 213 * @throws IOException may be thrown by <code>in</code> 214 */ 215 public LZMAInputStream(InputStream in, int memoryLimit) 216 throws IOException { 217 DataInputStream inData = new DataInputStream(in); 218 219 // Properties byte (lc, lp, and pb) 220 byte propsByte = inData.readByte(); 221 222 // Dictionary size is an unsigned 32-bit little endian integer. 223 int dictSize = 0; 224 for (int i = 0; i < 4; ++i) 225 dictSize |= inData.readUnsignedByte() << (8 * i); 226 227 // Uncompressed size is an unsigned 64-bit little endian integer. 228 // The maximum 64-bit value is a special case (becomes -1 here) 229 // which indicates that the end marker is used instead of knowing 230 // the uncompressed size beforehand. 231 long uncompSize = 0; 232 for (int i = 0; i < 8; ++i) 233 uncompSize |= (long)inData.readUnsignedByte() << (8 * i); 234 235 // Check the memory usage limit. 236 int memoryNeeded = getMemoryUsage(dictSize, propsByte); 237 if (memoryLimit != -1 && memoryNeeded > memoryLimit) 238 throw new MemoryLimitException(memoryNeeded, memoryLimit); 239 240 initialize(in, uncompSize, propsByte, dictSize, null); 241 } 242 243 /** 244 * Creates a new input stream that decompresses raw LZMA data (no .lzma 245 * header) from <code>in</code>. 246 * <p> 247 * The caller needs to know if the "end of payload marker (EOPM)" alias 248 * "end of stream marker (EOS marker)" alias "end marker" present. 249 * If the end marker isn't used, the caller must know the exact 250 * uncompressed size of the stream. 251 * <p> 252 * The caller also needs to provide the LZMA properties byte that encodes 253 * the number of literal context bits (lc), literal position bits (lp), 254 * and position bits (pb). 255 * <p> 256 * The dictionary size used when compressing is also needed. Specifying 257 * a too small dictionary size will prevent decompressing the stream. 258 * Specifying a too big dictionary is waste of memory but decompression 259 * will work. 260 * <p> 261 * There is no need to specify a dictionary bigger than 262 * the uncompressed size of the data even if a bigger dictionary 263 * was used when compressing. If you know the uncompressed size 264 * of the data, this might allow saving some memory. 265 * 266 * @param in input stream from which compressed 267 * data is read 268 * 269 * @param uncompSize uncompressed size of the LZMA stream or -1 270 * if the end marker is used in the LZMA stream 271 * 272 * @param propsByte LZMA properties byte that has the encoded 273 * values for literal context bits (lc), literal 274 * position bits (lp), and position bits (pb) 275 * 276 * @param dictSize dictionary size as bytes, must be in the range 277 * [<code>0</code>, <code>DICT_SIZE_MAX</code>] 278 * 279 * @throws CorruptedInputException 280 * if <code>propsByte</code> is invalid or 281 * the first input byte is not 0x00 282 * 283 * @throws UnsupportedOptionsException 284 * dictionary size or uncompressed size is too 285 * big for this implementation 286 * 287 * 288 */ 289 public LZMAInputStream(InputStream in, long uncompSize, byte propsByte, 290 int dictSize) throws IOException { 291 initialize(in, uncompSize, propsByte, dictSize, null); 292 } 293 294 /** 295 * Creates a new input stream that decompresses raw LZMA data (no .lzma 296 * header) from <code>in</code> optionally with a preset dictionary. 297 * 298 * @param in input stream from which LZMA-compressed 299 * data is read 300 * 301 * @param uncompSize uncompressed size of the LZMA stream or -1 302 * if the end marker is used in the LZMA stream 303 * 304 * @param propsByte LZMA properties byte that has the encoded 305 * values for literal context bits (lc), literal 306 * position bits (lp), and position bits (pb) 307 * 308 * @param dictSize dictionary size as bytes, must be in the range 309 * [<code>0</code>, <code>DICT_SIZE_MAX</code>] 310 * 311 * @param presetDict preset dictionary or <code>null</code> 312 * to use no preset dictionary 313 * 314 * @throws CorruptedInputException 315 * if <code>propsByte</code> is invalid or 316 * the first input byte is not 0x00 317 * 318 * @throws UnsupportedOptionsException 319 * dictionary size or uncompressed size is too 320 * big for this implementation 321 * 322 * @throws EOFException file is truncated or corrupt 323 * 324 * @throws IOException may be thrown by <code>in</code> 325 */ 326 public LZMAInputStream(InputStream in, long uncompSize, byte propsByte, 327 int dictSize, byte[] presetDict) 328 throws IOException { 329 initialize(in, uncompSize, propsByte, dictSize, presetDict); 330 } 331 332 /** 333 * Creates a new input stream that decompresses raw LZMA data (no .lzma 334 * header) from <code>in</code> optionally with a preset dictionary. 335 * 336 * @param in input stream from which LZMA-compressed 337 * data is read 338 * 339 * @param uncompSize uncompressed size of the LZMA stream or -1 340 * if the end marker is used in the LZMA stream 341 * 342 * @param lc number of literal context bits, must be 343 * in the range [0, 8] 344 * 345 * @param lp number of literal position bits, must be 346 * in the range [0, 4] 347 * 348 * @param pb number position bits, must be 349 * in the range [0, 4] 350 * 351 * @param dictSize dictionary size as bytes, must be in the range 352 * [<code>0</code>, <code>DICT_SIZE_MAX</code>] 353 * 354 * @param presetDict preset dictionary or <code>null</code> 355 * to use no preset dictionary 356 * 357 * @throws CorruptedInputException 358 * if the first input byte is not 0x00 359 * 360 * @throws EOFException file is truncated or corrupt 361 * 362 * @throws IOException may be thrown by <code>in</code> 363 */ 364 public LZMAInputStream(InputStream in, long uncompSize, 365 int lc, int lp, int pb, 366 int dictSize, byte[] presetDict) 367 throws IOException { 368 initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict); 369 } 370 371 private void initialize(InputStream in, long uncompSize, byte propsByte, 372 int dictSize, byte[] presetDict) 373 throws IOException { 374 // Validate the uncompressed size since the other "initialize" throws 375 // IllegalArgumentException if uncompSize < -1. 376 if (uncompSize < -1) 377 throw new UnsupportedOptionsException( 378 "Uncompressed size is too big"); 379 380 // Decode the properties byte. In contrast to LZMA2, there is no 381 // limit of lc + lp <= 4. 382 int props = propsByte & 0xFF; 383 if (props > (4 * 5 + 4) * 9 + 8) 384 throw new CorruptedInputException("Invalid LZMA properties byte"); 385 386 int pb = props / (9 * 5); 387 props -= pb * 9 * 5; 388 int lp = props / 9; 389 int lc = props - lp * 9; 390 391 // Validate the dictionary size since the other "initialize" throws 392 // IllegalArgumentException if dictSize is not supported. 393 if (dictSize < 0 || dictSize > DICT_SIZE_MAX) 394 throw new UnsupportedOptionsException( 395 "LZMA dictionary is too big for this implementation"); 396 397 initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict); 398 } 399 400 private void initialize(InputStream in, long uncompSize, 401 int lc, int lp, int pb, 402 int dictSize, byte[] presetDict) 403 throws IOException { 404 // getDictSize validates dictSize and gives a message in 405 // the exception too, so skip validating dictSize here. 406 if (uncompSize < -1 || lc < 0 || lc > 8 || lp < 0 || lp > 4 407 || pb < 0 || pb > 4) 408 throw new IllegalArgumentException(); 409 410 this.in = in; 411 412 // If uncompressed size is known, use it to avoid wasting memory for 413 // a uselessly large dictionary buffer. 414 dictSize = getDictSize(dictSize); 415 if (uncompSize >= 0 && dictSize > uncompSize) 416 dictSize = getDictSize((int)uncompSize); 417 418 lz = new LZDecoder(getDictSize(dictSize), presetDict); 419 rc = new RangeDecoderFromStream(in); 420 lzma = new LZMADecoder(lz, rc, lc, lp, pb); 421 remainingSize = uncompSize; 422 } 423 424 /** 425 * Decompresses the next byte from this input stream. 426 * <p> 427 * Reading lots of data with <code>read()</code> from this input stream 428 * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code> 429 * if you need to read lots of data one byte at a time. 430 * 431 * @return the next decompressed byte, or <code>-1</code> 432 * to indicate the end of the compressed stream 433 * 434 * @throws CorruptedInputException 435 * 436 * @throws XZIOException if the stream has been closed 437 * 438 * @throws EOFException 439 * compressed input is truncated or corrupt 440 * 441 * @throws IOException may be thrown by <code>in</code> 442 */ 443 public int read() throws IOException { 444 return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF); 445 } 446 447 /** 448 * Decompresses into an array of bytes. 449 * <p> 450 * If <code>len</code> is zero, no bytes are read and <code>0</code> 451 * is returned. Otherwise this will block until <code>len</code> 452 * bytes have been decompressed, the end of the LZMA stream is reached, 453 * or an exception is thrown. 454 * 455 * @param buf target buffer for uncompressed data 456 * @param off start offset in <code>buf</code> 457 * @param len maximum number of uncompressed bytes to read 458 * 459 * @return number of bytes read, or <code>-1</code> to indicate 460 * the end of the compressed stream 461 * 462 * @throws CorruptedInputException 463 * 464 * @throws XZIOException if the stream has been closed 465 * 466 * @throws EOFException compressed input is truncated or corrupt 467 * 468 * @throws IOException may be thrown by <code>in</code> 469 */ 470 public int read(byte[] buf, int off, int len) throws IOException { 471 if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) 472 throw new IndexOutOfBoundsException(); 473 474 if (len == 0) 475 return 0; 476 477 if (in == null) 478 throw new XZIOException("Stream closed"); 479 480 if (exception != null) 481 throw exception; 482 483 if (endReached) 484 return -1; 485 486 try { 487 int size = 0; 488 489 while (len > 0) { 490 // If uncompressed size is known and thus no end marker will 491 // be present, set the limit so that the uncompressed size 492 // won't be exceeded. 493 int copySizeMax = len; 494 if (remainingSize >= 0 && remainingSize < len) 495 copySizeMax = (int)remainingSize; 496 497 lz.setLimit(copySizeMax); 498 499 // Decode into the dictionary buffer. 500 try { 501 lzma.decode(); 502 } catch (CorruptedInputException e) { 503 // The end marker is encoded with a LZMA symbol that 504 // indicates maximum match distance. This is larger 505 // than any supported dictionary and thus causes 506 // CorruptedInputException from LZDecoder.repeat. 507 if (remainingSize != -1 || !lzma.endMarkerDetected()) 508 throw e; 509 510 endReached = true; 511 512 // The exception makes lzma.decode() miss the last range 513 // decoder normalization, so do it here. This might 514 // cause an IOException if it needs to read a byte 515 // from the input stream. 516 rc.normalize(); 517 } 518 519 // Copy from the dictionary to buf. 520 int copiedSize = lz.flush(buf, off); 521 off += copiedSize; 522 len -= copiedSize; 523 size += copiedSize; 524 525 if (remainingSize >= 0) { 526 // Update the number of bytes left to be decompressed. 527 remainingSize -= copiedSize; 528 assert remainingSize >= 0; 529 530 if (remainingSize == 0) 531 endReached = true; 532 } 533 534 if (endReached) { 535 // Checking these helps a lot when catching corrupt 536 // or truncated .lzma files. LZMA Utils doesn't do 537 // the first check and thus it accepts many invalid 538 // files that this implementation and XZ Utils don't. 539 if (!rc.isFinished() || lz.hasPending()) 540 throw new CorruptedInputException(); 541 542 return size == 0 ? -1 : size; 543 } 544 } 545 546 return size; 547 548 } catch (IOException e) { 549 exception = e; 550 throw e; 551 } 552 } 553 554 /** 555 * Closes the stream and calls <code>in.close()</code>. 556 * If the stream was already closed, this does nothing. 557 * 558 * @throws IOException if thrown by <code>in.close()</code> 559 */ 560 public void close() throws IOException { 561 if (in != null) { 562 try { 563 in.close(); 564 } finally { 565 in = null; 566 } 567 } 568 } 569 } 570