Home | History | Annotate | Download | only in xz
      1 /*
      2  * LZMA2InputStream
      3  *
      4  * Authors: Lasse Collin <lasse.collin (at) tukaani.org>
      5  *          Igor Pavlov <http://7-zip.org/>
      6  *
      7  * This file has been put into the public domain.
      8  * You can do whatever you want with this file.
      9  */
     10 
     11 package org.tukaani.xz;
     12 
     13 import java.io.InputStream;
     14 import java.io.DataInputStream;
     15 import java.io.IOException;
     16 import org.tukaani.xz.lz.LZDecoder;
     17 import org.tukaani.xz.rangecoder.RangeDecoderFromBuffer;
     18 import org.tukaani.xz.lzma.LZMADecoder;
     19 
     20 /**
     21  * Decompresses a raw LZMA2 stream (no XZ headers).
     22  */
     23 public class LZMA2InputStream extends InputStream {
     24     /**
     25      * Smallest valid LZMA2 dictionary size.
     26      * <p>
     27      * Very tiny dictionaries would be a performance problem, so
     28      * the minimum is 4 KiB.
     29      */
     30     public static final int DICT_SIZE_MIN = 4096;
     31 
     32     /**
     33      * Largest dictionary size supported by this implementation.
     34      * <p>
     35      * The LZMA2 algorithm allows dictionaries up to one byte less than 4 GiB.
     36      * This implementation supports only 16 bytes less than 2 GiB for raw
     37      * LZMA2 streams, and for .xz files the maximum is 1.5 GiB. This
     38      * limitation is due to Java using signed 32-bit integers for array
     39      * indexing. The limitation shouldn't matter much in practice since so
     40      * huge dictionaries are not normally used.
     41      */
     42     public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;
     43 
     44     private static final int COMPRESSED_SIZE_MAX = 1 << 16;
     45 
     46     private DataInputStream in;
     47 
     48     private final LZDecoder lz;
     49     private final RangeDecoderFromBuffer rc
     50             = new RangeDecoderFromBuffer(COMPRESSED_SIZE_MAX);
     51     private LZMADecoder lzma;
     52 
     53     private int uncompressedSize = 0;
     54     private boolean isLZMAChunk;
     55 
     56     private boolean needDictReset = true;
     57     private boolean needProps = true;
     58     private boolean endReached = false;
     59 
     60     private IOException exception = null;
     61 
     62     private final byte[] tempBuf = new byte[1];
     63 
     64     /**
     65      * Gets approximate decompressor memory requirements as kibibytes for
     66      * the given dictionary size.
     67      *
     68      * @param       dictSize    LZMA2 dictionary size as bytes, must be
     69      *                          in the range [<code>DICT_SIZE_MIN</code>,
     70      *                          <code>DICT_SIZE_MAX</code>]
     71      *
     72      * @return      approximate memory requirements as kibibytes (KiB)
     73      */
     74     public static int getMemoryUsage(int dictSize) {
     75         // The base state is around 30-40 KiB (probabilities etc.),
     76         // range decoder needs COMPRESSED_SIZE_MAX bytes for buffering,
     77         // and LZ decoder needs a dictionary buffer.
     78         return 40 + COMPRESSED_SIZE_MAX / 1024 + getDictSize(dictSize) / 1024;
     79     }
     80 
     81     private static int getDictSize(int dictSize) {
     82         if (dictSize < DICT_SIZE_MIN || dictSize > DICT_SIZE_MAX)
     83             throw new IllegalArgumentException(
     84                     "Unsupported dictionary size " + dictSize);
     85 
     86         // Round dictionary size upward to a multiple of 16. This way LZMA
     87         // can use LZDecoder.getPos() for calculating LZMA's posMask.
     88         // Note that this check is needed only for raw LZMA2 streams; it is
     89         // redundant with .xz.
     90         return (dictSize + 15) & ~15;
     91     }
     92 
     93     /**
     94      * Creates a new input stream that decompresses raw LZMA2 data
     95      * from <code>in</code>.
     96      * <p>
     97      * The caller needs to know the dictionary size used when compressing;
     98      * the dictionary size isn't stored as part of a raw LZMA2 stream.
     99      * <p>
    100      * Specifying a too small dictionary size will prevent decompressing
    101      * the stream. Specifying a too big dictionary is waste of memory but
    102      * decompression will work.
    103      * <p>
    104      * There is no need to specify a dictionary bigger than
    105      * the uncompressed size of the data even if a bigger dictionary
    106      * was used when compressing. If you know the uncompressed size
    107      * of the data, this might allow saving some memory.
    108      *
    109      * @param       in          input stream from which LZMA2-compressed
    110      *                          data is read
    111      *
    112      * @param       dictSize    LZMA2 dictionary size as bytes, must be
    113      *                          in the range [<code>DICT_SIZE_MIN</code>,
    114      *                          <code>DICT_SIZE_MAX</code>]
    115      */
    116     public LZMA2InputStream(InputStream in, int dictSize) {
    117         this(in, dictSize, null);
    118     }
    119 
    120     /**
    121      * Creates a new LZMA2 decompressor using a preset dictionary.
    122      * <p>
    123      * This is like <code>LZMA2InputStream(InputStream, int)</code> except
    124      * that the dictionary may be initialized using a preset dictionary.
    125      * If a preset dictionary was used when compressing the data, the
    126      * same preset dictionary must be provided when decompressing.
    127      *
    128      * @param       in          input stream from which LZMA2-compressed
    129      *                          data is read
    130      *
    131      * @param       dictSize    LZMA2 dictionary size as bytes, must be
    132      *                          in the range [<code>DICT_SIZE_MIN</code>,
    133      *                          <code>DICT_SIZE_MAX</code>]
    134      *
    135      * @param       presetDict  preset dictionary or <code>null</code>
    136      *                          to use no preset dictionary
    137      */
    138     public LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict) {
    139         // Check for null because otherwise null isn't detect
    140         // in this constructor.
    141         if (in == null)
    142             throw new NullPointerException();
    143 
    144         this.in = new DataInputStream(in);
    145         this.lz = new LZDecoder(getDictSize(dictSize), presetDict);
    146 
    147         if (presetDict != null && presetDict.length > 0)
    148             needDictReset = false;
    149     }
    150 
    151     /**
    152      * Decompresses the next byte from this input stream.
    153      * <p>
    154      * Reading lots of data with <code>read()</code> from this input stream
    155      * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
    156      * if you need to read lots of data one byte at a time.
    157      *
    158      * @return      the next decompressed byte, or <code>-1</code>
    159      *              to indicate the end of the compressed stream
    160      *
    161      * @throws      CorruptedInputException
    162      *
    163      * @throws      XZIOException if the stream has been closed
    164      *
    165      * @throws      EOFException
    166      *                          compressed input is truncated or corrupt
    167      *
    168      * @throws      IOException may be thrown by <code>in</code>
    169      */
    170     public int read() throws IOException {
    171         return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
    172     }
    173 
    174     /**
    175      * Decompresses into an array of bytes.
    176      * <p>
    177      * If <code>len</code> is zero, no bytes are read and <code>0</code>
    178      * is returned. Otherwise this will block until <code>len</code>
    179      * bytes have been decompressed, the end of the LZMA2 stream is reached,
    180      * or an exception is thrown.
    181      *
    182      * @param       buf         target buffer for uncompressed data
    183      * @param       off         start offset in <code>buf</code>
    184      * @param       len         maximum number of uncompressed bytes to read
    185      *
    186      * @return      number of bytes read, or <code>-1</code> to indicate
    187      *              the end of the compressed stream
    188      *
    189      * @throws      CorruptedInputException
    190      *
    191      * @throws      XZIOException if the stream has been closed
    192      *
    193      * @throws      EOFException
    194      *                          compressed input is truncated or corrupt
    195      *
    196      * @throws      IOException may be thrown by <code>in</code>
    197      */
    198     public int read(byte[] buf, int off, int len) throws IOException {
    199         if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
    200             throw new IndexOutOfBoundsException();
    201 
    202         if (len == 0)
    203             return 0;
    204 
    205         if (in == null)
    206             throw new XZIOException("Stream closed");
    207 
    208         if (exception != null)
    209             throw exception;
    210 
    211         if (endReached)
    212             return -1;
    213 
    214         try {
    215             int size = 0;
    216 
    217             while (len > 0) {
    218                 if (uncompressedSize == 0) {
    219                     decodeChunkHeader();
    220                     if (endReached)
    221                         return size == 0 ? -1 : size;
    222                 }
    223 
    224                 int copySizeMax = Math.min(uncompressedSize, len);
    225 
    226                 if (!isLZMAChunk) {
    227                     lz.copyUncompressed(in, copySizeMax);
    228                 } else {
    229                     lz.setLimit(copySizeMax);
    230                     lzma.decode();
    231                     if (!rc.isInBufferOK())
    232                         throw new CorruptedInputException();
    233                 }
    234 
    235                 int copiedSize = lz.flush(buf, off);
    236                 off += copiedSize;
    237                 len -= copiedSize;
    238                 size += copiedSize;
    239                 uncompressedSize -= copiedSize;
    240 
    241                 if (uncompressedSize == 0)
    242                     if (!rc.isFinished() || lz.hasPending())
    243                         throw new CorruptedInputException();
    244             }
    245 
    246             return size;
    247 
    248         } catch (IOException e) {
    249             exception = e;
    250             throw e;
    251         }
    252     }
    253 
    254     private void decodeChunkHeader() throws IOException {
    255         int control = in.readUnsignedByte();
    256 
    257         if (control == 0x00) {
    258             endReached = true;
    259             return;
    260         }
    261 
    262         if (control >= 0xE0 || control == 0x01) {
    263             needProps = true;
    264             needDictReset = false;
    265             lz.reset();
    266         } else if (needDictReset) {
    267             throw new CorruptedInputException();
    268         }
    269 
    270         if (control >= 0x80) {
    271             isLZMAChunk = true;
    272 
    273             uncompressedSize = (control & 0x1F) << 16;
    274             uncompressedSize += in.readUnsignedShort() + 1;
    275 
    276             int compressedSize = in.readUnsignedShort() + 1;
    277 
    278             if (control >= 0xC0) {
    279                 needProps = false;
    280                 decodeProps();
    281 
    282             } else if (needProps) {
    283                 throw new CorruptedInputException();
    284 
    285             } else if (control >= 0xA0) {
    286                 lzma.reset();
    287             }
    288 
    289             rc.prepareInputBuffer(in, compressedSize);
    290 
    291         } else if (control > 0x02) {
    292             throw new CorruptedInputException();
    293 
    294         } else {
    295             isLZMAChunk = false;
    296             uncompressedSize = in.readUnsignedShort() + 1;
    297         }
    298     }
    299 
    300     private void decodeProps() throws IOException {
    301         int props = in.readUnsignedByte();
    302 
    303         if (props > (4 * 5 + 4) * 9 + 8)
    304             throw new CorruptedInputException();
    305 
    306         int pb = props / (9 * 5);
    307         props -= pb * 9 * 5;
    308         int lp = props / 9;
    309         int lc = props - lp * 9;
    310 
    311         if (lc + lp > 4)
    312             throw new CorruptedInputException();
    313 
    314         lzma = new LZMADecoder(lz, rc, lc, lp, pb);
    315     }
    316 
    317     /**
    318      * Returns the number of uncompressed bytes that can be read
    319      * without blocking. The value is returned with an assumption
    320      * that the compressed input data will be valid. If the compressed
    321      * data is corrupt, <code>CorruptedInputException</code> may get
    322      * thrown before the number of bytes claimed to be available have
    323      * been read from this input stream.
    324      * <p>
    325      * In LZMA2InputStream, the return value will be non-zero when the
    326      * decompressor is in the middle of an LZMA2 chunk. The return value
    327      * will then be the number of uncompressed bytes remaining from that
    328      * chunk.
    329      *
    330      * @return      the number of uncompressed bytes that can be read
    331      *              without blocking
    332      */
    333     public int available() throws IOException {
    334         if (in == null)
    335             throw new XZIOException("Stream closed");
    336 
    337         if (exception != null)
    338             throw exception;
    339 
    340         return uncompressedSize;
    341     }
    342 
    343     /**
    344      * Closes the stream and calls <code>in.close()</code>.
    345      * If the stream was already closed, this does nothing.
    346      *
    347      * @throws  IOException if thrown by <code>in.close()</code>
    348      */
    349     public void close() throws IOException {
    350         if (in != null) {
    351             try {
    352                 in.close();
    353             } finally {
    354                 in = null;
    355             }
    356         }
    357     }
    358 }
    359