Home | History | Annotate | Download | only in xz
      1 /*
      2  * LZMAInputStream
      3  *
      4  * Authors: Lasse Collin <lasse.collin (at) tukaani.org>
      5  *          Igor Pavlov <http://7-zip.org/>
      6  *
      7  * This file has been put into the public domain.
      8  * You can do whatever you want with this file.
      9  */
     10 
     11 package org.tukaani.xz;
     12 
     13 import java.io.InputStream;
     14 import java.io.DataInputStream;
     15 import java.io.IOException;
     16 import org.tukaani.xz.lz.LZDecoder;
     17 import org.tukaani.xz.rangecoder.RangeDecoderFromStream;
     18 import org.tukaani.xz.lzma.LZMADecoder;
     19 
     20 /**
     21  * Decompresses legacy .lzma files and raw LZMA streams (no .lzma header).
     22  * <p>
     23  * <b>IMPORTANT:</b> In contrast to other classes in this package, this class
     24  * reads data from its input stream one byte at a time. If the input stream
     25  * is for example {@link java.io.FileInputStream}, wrapping it into
     26  * {@link java.io.BufferedInputStream} tends to improve performance a lot.
     27  * This is not automatically done by this class because there may be use
     28  * cases where it is desired that this class won't read any bytes past
     29  * the end of the LZMA stream.
     30  * <p>
     31  * Even when using <code>BufferedInputStream</code>, the performance tends
     32  * to be worse (maybe 10-20&nbsp;% slower) than with {@link LZMA2InputStream}
     33  * or {@link XZInputStream} (when the .xz file contains LZMA2-compressed data).
     34  *
     35  * @since 1.4
     36  */
     37 public class LZMAInputStream extends InputStream {
     38     /**
     39      * Largest dictionary size supported by this implementation.
     40      * <p>
     41      * LZMA allows dictionaries up to one byte less than 4 GiB. This
     42      * implementation supports only 16 bytes less than 2 GiB. This
     43      * limitation is due to Java using signed 32-bit integers for array
     44      * indexing. The limitation shouldn't matter much in practice since so
     45      * huge dictionaries are not normally used.
     46      */
     47     public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;
     48 
     49     private InputStream in;
     50     private LZDecoder lz;
     51     private RangeDecoderFromStream rc;
     52     private LZMADecoder lzma;
     53 
     54     private boolean endReached = false;
     55 
     56     private final byte[] tempBuf = new byte[1];
     57 
     58     /**
     59      * Number of uncompressed bytes left to be decompressed, or -1 if
     60      * the end marker is used.
     61      */
     62     private long remainingSize;
     63 
     64     private IOException exception = null;
     65 
     66     /**
     67      * Gets approximate decompressor memory requirements as kibibytes for
     68      * the given dictionary size and LZMA properties byte (lc, lp, and pb).
     69      *
     70      * @param       dictSize    LZMA dictionary size as bytes, should be
     71      *                          in the range [<code>0</code>,
     72      *                          <code>DICT_SIZE_MAX</code>]
     73      *
     74      * @param       propsByte   LZMA properties byte that encodes the values
     75      *                          of lc, lp, and pb
     76      *
     77      * @return      approximate memory requirements as kibibytes (KiB)
     78      *
     79      * @throws      UnsupportedOptionsException
     80      *                          if <code>dictSize</code> is outside
     81      *                          the range [<code>0</code>,
     82      *                          <code>DICT_SIZE_MAX</code>]
     83      *
     84      * @throws      CorruptedInputException
     85      *                          if <code>propsByte</code> is invalid
     86      */
     87     public static int getMemoryUsage(int dictSize, byte propsByte)
     88             throws UnsupportedOptionsException, CorruptedInputException {
     89         if (dictSize < 0 || dictSize > DICT_SIZE_MAX)
     90             throw new UnsupportedOptionsException(
     91                     "LZMA dictionary is too big for this implementation");
     92 
     93         int props = propsByte & 0xFF;
     94         if (props > (4 * 5 + 4) * 9 + 8)
     95             throw new CorruptedInputException("Invalid LZMA properties byte");
     96 
     97         props %= 9 * 5;
     98         int lp = props / 9;
     99         int lc = props - lp * 9;
    100 
    101         return getMemoryUsage(dictSize, lc, lp);
    102     }
    103 
    104     /**
    105      * Gets approximate decompressor memory requirements as kibibytes for
    106      * the given dictionary size, lc, and lp. Note that pb isn't needed.
    107      *
    108      * @param       dictSize    LZMA dictionary size as bytes, must be
    109      *                          in the range [<code>0</code>,
    110      *                          <code>DICT_SIZE_MAX</code>]
    111      *
    112      * @param       lc          number of literal context bits, must be
    113      *                          in the range [0, 8]
    114      *
    115      * @param       lp          number of literal position bits, must be
    116      *                          in the range [0, 4]
    117      *
    118      * @return      approximate memory requirements as kibibytes (KiB)
    119      */
    120     public static int getMemoryUsage(int dictSize, int lc, int lp) {
    121         if (lc < 0 || lc > 8 || lp < 0 || lp > 4)
    122             throw new IllegalArgumentException("Invalid lc or lp");
    123 
    124         // Probability variables have the type "short". There are
    125         // 0x300 (768) probability variables in each literal subcoder.
    126         // The number of literal subcoders is 2^(lc + lp).
    127         //
    128         // Roughly 10 KiB for the base state + LZ decoder's dictionary buffer
    129         // + sizeof(short) * number probability variables per literal subcoder
    130         //   * number of literal subcoders
    131         return 10 + getDictSize(dictSize) / 1024
    132                + ((2 * 0x300) << (lc + lp)) / 1024;
    133     }
    134 
    135     private static int getDictSize(int dictSize) {
    136         if (dictSize < 0 || dictSize > DICT_SIZE_MAX)
    137             throw new IllegalArgumentException(
    138                     "LZMA dictionary is too big for this implementation");
    139 
    140         // For performance reasons, use a 4 KiB dictionary if something
    141         // smaller was requested. It's a rare situation and the performance
    142         // difference isn't huge, and it starts to matter mostly when the
    143         // dictionary is just a few bytes. But we need to handle the special
    144         // case of dictSize == 0 anyway, which is an allowed value but in
    145         // practice means one-byte dictionary.
    146         //
    147         // Note that using a dictionary bigger than specified in the headers
    148         // can hide errors if there is a reference to data beyond the original
    149         // dictionary size but is still within 4 KiB.
    150         if (dictSize < 4096)
    151             dictSize = 4096;
    152 
    153         // Round dictionary size upward to a multiple of 16. This way LZMA
    154         // can use LZDecoder.getPos() for calculating LZMA's posMask.
    155         return (dictSize + 15) & ~15;
    156     }
    157 
    158     /**
    159      * Creates a new .lzma file format decompressor without
    160      * a memory usage limit.
    161      *
    162      * @param       in          input stream from which .lzma data is read;
    163      *                          it might be a good idea to wrap it in
    164      *                          <code>BufferedInputStream</code>, see the
    165      *                          note at the top of this page
    166      *
    167      * @throws      CorruptedInputException
    168      *                          file is corrupt or perhaps not in
    169      *                          the .lzma format at all
    170      *
    171      * @throws      UnsupportedOptionsException
    172      *                          dictionary size or uncompressed size is too
    173      *                          big for this implementation
    174      *
    175      * @throws      EOFException
    176      *                          file is truncated or perhaps not in
    177      *                          the .lzma format at all
    178      *
    179      * @throws      IOException may be thrown by <code>in</code>
    180      */
    181     public LZMAInputStream(InputStream in) throws IOException {
    182         this(in, -1);
    183     }
    184 
    185     /**
    186      * Creates a new .lzma file format decompressor with an optional
    187      * memory usage limit.
    188      *
    189      * @param       in          input stream from which .lzma data is read;
    190      *                          it might be a good idea to wrap it in
    191      *                          <code>BufferedInputStream</code>, see the
    192      *                          note at the top of this page
    193      *
    194      * @param       memoryLimit memory usage limit in kibibytes (KiB)
    195      *                          or <code>-1</code> to impose no
    196      *                          memory usage limit
    197      *
    198      * @throws      CorruptedInputException
    199      *                          file is corrupt or perhaps not in
    200      *                          the .lzma format at all
    201      *
    202      * @throws      UnsupportedOptionsException
    203      *                          dictionary size or uncompressed size is too
    204      *                          big for this implementation
    205      *
    206      * @throws      MemoryLimitException
    207      *                          memory usage limit was exceeded
    208      *
    209      * @throws      EOFException
    210      *                          file is truncated or perhaps not in
    211      *                          the .lzma format at all
    212      *
    213      * @throws      IOException may be thrown by <code>in</code>
    214      */
    215     public LZMAInputStream(InputStream in, int memoryLimit)
    216             throws IOException {
    217         DataInputStream inData = new DataInputStream(in);
    218 
    219         // Properties byte (lc, lp, and pb)
    220         byte propsByte = inData.readByte();
    221 
    222         // Dictionary size is an unsigned 32-bit little endian integer.
    223         int dictSize = 0;
    224         for (int i = 0; i < 4; ++i)
    225             dictSize |= inData.readUnsignedByte() << (8 * i);
    226 
    227         // Uncompressed size is an unsigned 64-bit little endian integer.
    228         // The maximum 64-bit value is a special case (becomes -1 here)
    229         // which indicates that the end marker is used instead of knowing
    230         // the uncompressed size beforehand.
    231         long uncompSize = 0;
    232         for (int i = 0; i < 8; ++i)
    233             uncompSize |= (long)inData.readUnsignedByte() << (8 * i);
    234 
    235         // Check the memory usage limit.
    236         int memoryNeeded = getMemoryUsage(dictSize, propsByte);
    237         if (memoryLimit != -1 && memoryNeeded > memoryLimit)
    238             throw new MemoryLimitException(memoryNeeded, memoryLimit);
    239 
    240         initialize(in, uncompSize, propsByte, dictSize, null);
    241     }
    242 
    243     /**
    244      * Creates a new input stream that decompresses raw LZMA data (no .lzma
    245      * header) from <code>in</code>.
    246      * <p>
    247      * The caller needs to know if the "end of payload marker (EOPM)" alias
    248      * "end of stream marker (EOS marker)" alias "end marker" present.
    249      * If the end marker isn't used, the caller must know the exact
    250      * uncompressed size of the stream.
    251      * <p>
    252      * The caller also needs to provide the LZMA properties byte that encodes
    253      * the number of literal context bits (lc), literal position bits (lp),
    254      * and position bits (pb).
    255      * <p>
    256      * The dictionary size used when compressing is also needed. Specifying
    257      * a too small dictionary size will prevent decompressing the stream.
    258      * Specifying a too big dictionary is waste of memory but decompression
    259      * will work.
    260      * <p>
    261      * There is no need to specify a dictionary bigger than
    262      * the uncompressed size of the data even if a bigger dictionary
    263      * was used when compressing. If you know the uncompressed size
    264      * of the data, this might allow saving some memory.
    265      *
    266      * @param       in          input stream from which compressed
    267      *                          data is read
    268      *
    269      * @param       uncompSize  uncompressed size of the LZMA stream or -1
    270      *                          if the end marker is used in the LZMA stream
    271      *
    272      * @param       propsByte   LZMA properties byte that has the encoded
    273      *                          values for literal context bits (lc), literal
    274      *                          position bits (lp), and position bits (pb)
    275      *
    276      * @param       dictSize    dictionary size as bytes, must be in the range
    277      *                          [<code>0</code>, <code>DICT_SIZE_MAX</code>]
    278      *
    279      * @throws      CorruptedInputException
    280      *                          if <code>propsByte</code> is invalid or
    281      *                          the first input byte is not 0x00
    282      *
    283      * @throws      UnsupportedOptionsException
    284      *                          dictionary size or uncompressed size is too
    285      *                          big for this implementation
    286      *
    287      *
    288      */
    289     public LZMAInputStream(InputStream in, long uncompSize, byte propsByte,
    290                            int dictSize) throws IOException {
    291         initialize(in, uncompSize, propsByte, dictSize, null);
    292     }
    293 
    294     /**
    295      * Creates a new input stream that decompresses raw LZMA data (no .lzma
    296      * header) from <code>in</code> optionally with a preset dictionary.
    297      *
    298      * @param       in          input stream from which LZMA-compressed
    299      *                          data is read
    300      *
    301      * @param       uncompSize  uncompressed size of the LZMA stream or -1
    302      *                          if the end marker is used in the LZMA stream
    303      *
    304      * @param       propsByte   LZMA properties byte that has the encoded
    305      *                          values for literal context bits (lc), literal
    306      *                          position bits (lp), and position bits (pb)
    307      *
    308      * @param       dictSize    dictionary size as bytes, must be in the range
    309      *                          [<code>0</code>, <code>DICT_SIZE_MAX</code>]
    310      *
    311      * @param       presetDict  preset dictionary or <code>null</code>
    312      *                          to use no preset dictionary
    313      *
    314      * @throws      CorruptedInputException
    315      *                          if <code>propsByte</code> is invalid or
    316      *                          the first input byte is not 0x00
    317      *
    318      * @throws      UnsupportedOptionsException
    319      *                          dictionary size or uncompressed size is too
    320      *                          big for this implementation
    321      *
    322      * @throws      EOFException file is truncated or corrupt
    323      *
    324      * @throws      IOException may be thrown by <code>in</code>
    325      */
    326     public LZMAInputStream(InputStream in, long uncompSize, byte propsByte,
    327                            int dictSize, byte[] presetDict)
    328             throws IOException {
    329         initialize(in, uncompSize, propsByte, dictSize, presetDict);
    330     }
    331 
    332     /**
    333      * Creates a new input stream that decompresses raw LZMA data (no .lzma
    334      * header) from <code>in</code> optionally with a preset dictionary.
    335      *
    336      * @param       in          input stream from which LZMA-compressed
    337      *                          data is read
    338      *
    339      * @param       uncompSize  uncompressed size of the LZMA stream or -1
    340      *                          if the end marker is used in the LZMA stream
    341      *
    342      * @param       lc          number of literal context bits, must be
    343      *                          in the range [0, 8]
    344      *
    345      * @param       lp          number of literal position bits, must be
    346      *                          in the range [0, 4]
    347      *
    348      * @param       pb          number position bits, must be
    349      *                          in the range [0, 4]
    350      *
    351      * @param       dictSize    dictionary size as bytes, must be in the range
    352      *                          [<code>0</code>, <code>DICT_SIZE_MAX</code>]
    353      *
    354      * @param       presetDict  preset dictionary or <code>null</code>
    355      *                          to use no preset dictionary
    356      *
    357      * @throws      CorruptedInputException
    358      *                          if the first input byte is not 0x00
    359      *
    360      * @throws      EOFException file is truncated or corrupt
    361      *
    362      * @throws      IOException may be thrown by <code>in</code>
    363      */
    364     public LZMAInputStream(InputStream in, long uncompSize,
    365                            int lc, int lp, int pb,
    366                            int dictSize, byte[] presetDict)
    367             throws IOException {
    368         initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict);
    369     }
    370 
    371     private void initialize(InputStream in, long uncompSize, byte propsByte,
    372                             int dictSize, byte[] presetDict)
    373             throws IOException {
    374         // Validate the uncompressed size since the other "initialize" throws
    375         // IllegalArgumentException if uncompSize < -1.
    376         if (uncompSize < -1)
    377             throw new UnsupportedOptionsException(
    378                     "Uncompressed size is too big");
    379 
    380         // Decode the properties byte. In contrast to LZMA2, there is no
    381         // limit of lc + lp <= 4.
    382         int props = propsByte & 0xFF;
    383         if (props > (4 * 5 + 4) * 9 + 8)
    384             throw new CorruptedInputException("Invalid LZMA properties byte");
    385 
    386         int pb = props / (9 * 5);
    387         props -= pb * 9 * 5;
    388         int lp = props / 9;
    389         int lc = props - lp * 9;
    390 
    391         // Validate the dictionary size since the other "initialize" throws
    392         // IllegalArgumentException if dictSize is not supported.
    393         if (dictSize < 0 || dictSize > DICT_SIZE_MAX)
    394             throw new UnsupportedOptionsException(
    395                     "LZMA dictionary is too big for this implementation");
    396 
    397         initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict);
    398     }
    399 
    400     private void initialize(InputStream in, long uncompSize,
    401                             int lc, int lp, int pb,
    402                             int dictSize, byte[] presetDict)
    403             throws IOException {
    404         // getDictSize validates dictSize and gives a message in
    405         // the exception too, so skip validating dictSize here.
    406         if (uncompSize < -1 || lc < 0 || lc > 8 || lp < 0 || lp > 4
    407                 || pb < 0 || pb > 4)
    408             throw new IllegalArgumentException();
    409 
    410         this.in = in;
    411 
    412         // If uncompressed size is known, use it to avoid wasting memory for
    413         // a uselessly large dictionary buffer.
    414         dictSize = getDictSize(dictSize);
    415         if (uncompSize >= 0 && dictSize > uncompSize)
    416             dictSize = getDictSize((int)uncompSize);
    417 
    418         lz = new LZDecoder(getDictSize(dictSize), presetDict);
    419         rc = new RangeDecoderFromStream(in);
    420         lzma = new LZMADecoder(lz, rc, lc, lp, pb);
    421         remainingSize = uncompSize;
    422     }
    423 
    424     /**
    425      * Decompresses the next byte from this input stream.
    426      * <p>
    427      * Reading lots of data with <code>read()</code> from this input stream
    428      * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
    429      * if you need to read lots of data one byte at a time.
    430      *
    431      * @return      the next decompressed byte, or <code>-1</code>
    432      *              to indicate the end of the compressed stream
    433      *
    434      * @throws      CorruptedInputException
    435      *
    436      * @throws      XZIOException if the stream has been closed
    437      *
    438      * @throws      EOFException
    439      *                          compressed input is truncated or corrupt
    440      *
    441      * @throws      IOException may be thrown by <code>in</code>
    442      */
    443     public int read() throws IOException {
    444         return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
    445     }
    446 
    447     /**
    448      * Decompresses into an array of bytes.
    449      * <p>
    450      * If <code>len</code> is zero, no bytes are read and <code>0</code>
    451      * is returned. Otherwise this will block until <code>len</code>
    452      * bytes have been decompressed, the end of the LZMA stream is reached,
    453      * or an exception is thrown.
    454      *
    455      * @param       buf         target buffer for uncompressed data
    456      * @param       off         start offset in <code>buf</code>
    457      * @param       len         maximum number of uncompressed bytes to read
    458      *
    459      * @return      number of bytes read, or <code>-1</code> to indicate
    460      *              the end of the compressed stream
    461      *
    462      * @throws      CorruptedInputException
    463      *
    464      * @throws      XZIOException if the stream has been closed
    465      *
    466      * @throws      EOFException compressed input is truncated or corrupt
    467      *
    468      * @throws      IOException may be thrown by <code>in</code>
    469      */
    470     public int read(byte[] buf, int off, int len) throws IOException {
    471         if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
    472             throw new IndexOutOfBoundsException();
    473 
    474         if (len == 0)
    475             return 0;
    476 
    477         if (in == null)
    478             throw new XZIOException("Stream closed");
    479 
    480         if (exception != null)
    481             throw exception;
    482 
    483         if (endReached)
    484             return -1;
    485 
    486         try {
    487             int size = 0;
    488 
    489             while (len > 0) {
    490                 // If uncompressed size is known and thus no end marker will
    491                 // be present, set the limit so that the uncompressed size
    492                 // won't be exceeded.
    493                 int copySizeMax = len;
    494                 if (remainingSize >= 0 && remainingSize < len)
    495                     copySizeMax = (int)remainingSize;
    496 
    497                 lz.setLimit(copySizeMax);
    498 
    499                 // Decode into the dictionary buffer.
    500                 try {
    501                     lzma.decode();
    502                 } catch (CorruptedInputException e) {
    503                     // The end marker is encoded with a LZMA symbol that
    504                     // indicates maximum match distance. This is larger
    505                     // than any supported dictionary and thus causes
    506                     // CorruptedInputException from LZDecoder.repeat.
    507                     if (remainingSize != -1 || !lzma.endMarkerDetected())
    508                         throw e;
    509 
    510                     endReached = true;
    511 
    512                     // The exception makes lzma.decode() miss the last range
    513                     // decoder normalization, so do it here. This might
    514                     // cause an IOException if it needs to read a byte
    515                     // from the input stream.
    516                     rc.normalize();
    517                 }
    518 
    519                 // Copy from the dictionary to buf.
    520                 int copiedSize = lz.flush(buf, off);
    521                 off += copiedSize;
    522                 len -= copiedSize;
    523                 size += copiedSize;
    524 
    525                 if (remainingSize >= 0) {
    526                     // Update the number of bytes left to be decompressed.
    527                     remainingSize -= copiedSize;
    528                     assert remainingSize >= 0;
    529 
    530                     if (remainingSize == 0)
    531                         endReached = true;
    532                 }
    533 
    534                 if (endReached) {
    535                     // Checking these helps a lot when catching corrupt
    536                     // or truncated .lzma files. LZMA Utils doesn't do
    537                     // the first check and thus it accepts many invalid
    538                     // files that this implementation and XZ Utils don't.
    539                     if (!rc.isFinished() || lz.hasPending())
    540                         throw new CorruptedInputException();
    541 
    542                     return size == 0 ? -1 : size;
    543                 }
    544             }
    545 
    546             return size;
    547 
    548         } catch (IOException e) {
    549             exception = e;
    550             throw e;
    551         }
    552     }
    553 
    554     /**
    555      * Closes the stream and calls <code>in.close()</code>.
    556      * If the stream was already closed, this does nothing.
    557      *
    558      * @throws  IOException if thrown by <code>in.close()</code>
    559      */
    560     public void close() throws IOException {
    561         if (in != null) {
    562             try {
    563                 in.close();
    564             } finally {
    565                 in = null;
    566             }
    567         }
    568     }
    569 }
    570