Home | History | Annotate | Download | only in xz
      1 /*
      2  * LZMA2Options
      3  *
      4  * Author: Lasse Collin <lasse.collin (at) tukaani.org>
      5  *
      6  * This file has been put into the public domain.
      7  * You can do whatever you want with this file.
      8  */
      9 
     10 package org.tukaani.xz;
     11 
     12 import java.io.InputStream;
     13 import java.io.IOException;
     14 import org.tukaani.xz.lz.LZEncoder;
     15 import org.tukaani.xz.lzma.LZMAEncoder;
     16 
     17 /**
     18  * LZMA2 compression options.
     19  * <p>
     20  * While this allows setting the LZMA2 compression options in detail,
     21  * often you only need <code>LZMA2Options()</code> or
     22  * <code>LZMA2Options(int)</code>.
     23  */
     24 public class LZMA2Options extends FilterOptions {
     25     /**
     26      * Minimum valid compression preset level is 0.
     27      */
     28     public static final int PRESET_MIN = 0;
     29 
     30     /**
     31      * Maximum valid compression preset level is 9.
     32      */
     33     public static final int PRESET_MAX = 9;
     34 
     35     /**
     36      * Default compression preset level is 6.
     37      */
     38     public static final int PRESET_DEFAULT = 6;
     39 
     40     /**
     41      * Minimum dictionary size is 4 KiB.
     42      */
     43     public static final int DICT_SIZE_MIN = 4096;
     44 
     45     /**
     46      * Maximum dictionary size for compression is 768 MiB.
     47      * <p>
     48      * The decompressor supports bigger dictionaries, up to almost 2 GiB.
     49      * With HC4 the encoder would support dictionaries bigger than 768 MiB.
     50      * The 768 MiB limit comes from the current implementation of BT4 where
     51      * we would otherwise hit the limits of signed ints in array indexing.
     52      * <p>
     53      * If you really need bigger dictionary for decompression,
     54      * use {@link LZMA2InputStream} directly.
     55      */
     56     public static final int DICT_SIZE_MAX = 768 << 20;
     57 
     58     /**
     59      * The default dictionary size is 8 MiB.
     60      */
     61     public static final int DICT_SIZE_DEFAULT = 8 << 20;
     62 
     63     /**
     64      * Maximum value for lc + lp is 4.
     65      */
     66     public static final int LC_LP_MAX = 4;
     67 
     68     /**
     69      * The default number of literal context bits is 3.
     70      */
     71     public static final int LC_DEFAULT = 3;
     72 
     73     /**
     74      * The default number of literal position bits is 0.
     75      */
     76     public static final int LP_DEFAULT = 0;
     77 
     78     /**
     79      * Maximum value for pb is 4.
     80      */
     81     public static final int PB_MAX = 4;
     82 
     83     /**
     84      * The default number of position bits is 2.
     85      */
     86     public static final int PB_DEFAULT = 2;
     87 
     88     /**
     89      * Compression mode: uncompressed.
     90      * The data is wrapped into a LZMA2 stream without compression.
     91      */
     92     public static final int MODE_UNCOMPRESSED = 0;
     93 
     94     /**
     95      * Compression mode: fast.
     96      * This is usually combined with a hash chain match finder.
     97      */
     98     public static final int MODE_FAST = LZMAEncoder.MODE_FAST;
     99 
    100     /**
    101      * Compression mode: normal.
    102      * This is usually combined with a binary tree match finder.
    103      */
    104     public static final int MODE_NORMAL = LZMAEncoder.MODE_NORMAL;
    105 
    106     /**
    107      * Minimum value for <code>niceLen</code> is 8.
    108      */
    109     public static final int NICE_LEN_MIN = 8;
    110 
    111     /**
    112      * Maximum value for <code>niceLen</code> is 273.
    113      */
    114     public static final int NICE_LEN_MAX = 273;
    115 
    116     /**
    117      * Match finder: Hash Chain 2-3-4
    118      */
    119     public static final int MF_HC4 = LZEncoder.MF_HC4;
    120 
    121     /**
    122      * Match finder: Binary tree 2-3-4
    123      */
    124     public static final int MF_BT4 = LZEncoder.MF_BT4;
    125 
    126     private static final int[] presetToDictSize = {
    127             1 << 18, 1 << 20, 1 << 21, 1 << 22, 1 << 22,
    128             1 << 23, 1 << 23, 1 << 24, 1 << 25, 1 << 26 };
    129 
    130     private static final int[] presetToDepthLimit = { 4, 8, 24, 48 };
    131 
    132     private int dictSize;
    133     private byte[] presetDict = null;
    134     private int lc;
    135     private int lp;
    136     private int pb;
    137     private int mode;
    138     private int niceLen;
    139     private int mf;
    140     private int depthLimit;
    141 
    142     /**
    143      * Creates new LZMA2 options and sets them to the default values.
    144      * This is equivalent to <code>LZMA2Options(PRESET_DEFAULT)</code>.
    145      */
    146     public LZMA2Options() {
    147         try {
    148             setPreset(PRESET_DEFAULT);
    149         } catch (UnsupportedOptionsException e) {
    150             assert false;
    151             throw new RuntimeException();
    152         }
    153     }
    154 
    155     /**
    156      * Creates new LZMA2 options and sets them to the given preset.
    157      *
    158      * @throws      UnsupportedOptionsException
    159      *                          <code>preset</code> is not supported
    160      */
    161     public LZMA2Options(int preset) throws UnsupportedOptionsException {
    162         setPreset(preset);
    163     }
    164 
    165     /**
    166      * Creates new LZMA2 options and sets them to the given custom values.
    167      *
    168      * @throws      UnsupportedOptionsException
    169      *                          unsupported options were specified
    170      */
    171     public LZMA2Options(int dictSize, int lc, int lp, int pb, int mode,
    172                         int niceLen, int mf, int depthLimit)
    173             throws UnsupportedOptionsException {
    174         setDictSize(dictSize);
    175         setLcLp(lc, lp);
    176         setPb(pb);
    177         setMode(mode);
    178         setNiceLen(niceLen);
    179         setMatchFinder(mf);
    180         setDepthLimit(depthLimit);
    181     }
    182 
    183     /**
    184      * Sets the compression options to the given preset.
    185      * <p>
    186      * The presets 0-3 are fast presets with medium compression.
    187      * The presets 4-6 are fairly slow presets with high compression.
    188      * The default preset (<code>PRESET_DEFAULT</code>) is 6.
    189      * <p>
    190      * The presets 7-9 are like the preset 6 but use bigger dictionaries
    191      * and have higher compressor and decompressor memory requirements.
    192      * Unless the uncompressed size of the file exceeds 8&nbsp;MiB,
    193      * 16&nbsp;MiB, or 32&nbsp;MiB, it is waste of memory to use the
    194      * presets 7, 8, or 9, respectively.
    195      *
    196      * @throws      UnsupportedOptionsException
    197      *                          <code>preset</code> is not supported
    198      */
    199     public void setPreset(int preset) throws UnsupportedOptionsException {
    200         if (preset < 0 || preset > 9)
    201             throw new UnsupportedOptionsException(
    202                     "Unsupported preset: " + preset);
    203 
    204         lc = LC_DEFAULT;
    205         lp = LP_DEFAULT;
    206         pb = PB_DEFAULT;
    207         dictSize = presetToDictSize[preset];
    208 
    209         if (preset <= 3) {
    210             mode = MODE_FAST;
    211             mf = MF_HC4;
    212             niceLen = preset <= 1 ? 128 : NICE_LEN_MAX;
    213             depthLimit = presetToDepthLimit[preset];
    214         } else {
    215             mode = MODE_NORMAL;
    216             mf = MF_BT4;
    217             niceLen = (preset == 4) ? 16 : (preset == 5) ? 32 : 64;
    218             depthLimit = 0;
    219         }
    220     }
    221 
    222     /**
    223      * Sets the dictionary size in bytes.
    224      * <p>
    225      * The dictionary (or history buffer) holds the most recently seen
    226      * uncompressed data. Bigger dictionary usually means better compression.
    227      * However, using a dictioanary bigger than the size of the uncompressed
    228      * data is waste of memory.
    229      * <p>
    230      * Any value in the range [DICT_SIZE_MIN, DICT_SIZE_MAX] is valid,
    231      * but sizes of 2^n and 2^n&nbsp;+&nbsp;2^(n-1) bytes are somewhat
    232      * recommended.
    233      *
    234      * @throws      UnsupportedOptionsException
    235      *                          <code>dictSize</code> is not supported
    236      */
    237     public void setDictSize(int dictSize) throws UnsupportedOptionsException {
    238         if (dictSize < DICT_SIZE_MIN)
    239             throw new UnsupportedOptionsException(
    240                     "LZMA2 dictionary size must be at least 4 KiB: "
    241                     + dictSize + " B");
    242 
    243         if (dictSize > DICT_SIZE_MAX)
    244             throw new UnsupportedOptionsException(
    245                     "LZMA2 dictionary size must not exceed "
    246                     + (DICT_SIZE_MAX >> 20) + " MiB: " + dictSize + " B");
    247 
    248         this.dictSize = dictSize;
    249     }
    250 
    251     /**
    252      * Gets the dictionary size in bytes.
    253      */
    254     public int getDictSize() {
    255         return dictSize;
    256     }
    257 
    258     /**
    259      * Sets a preset dictionary. Use null to disable the use of
    260      * a preset dictionary. By default there is no preset dictionary.
    261      * <p>
    262      * <b>The .xz format doesn't support a preset dictionary for now.
    263      * Do not set a preset dictionary unless you use raw LZMA2.</b>
    264      * <p>
    265      * Preset dictionary can be useful when compressing many similar,
    266      * relatively small chunks of data independently from each other.
    267      * A preset dictionary should contain typical strings that occur in
    268      * the files being compressed. The most probable strings should be
    269      * near the end of the preset dictionary. The preset dictionary used
    270      * for compression is also needed for decompression.
    271      */
    272     public void setPresetDict(byte[] presetDict) {
    273         this.presetDict = presetDict;
    274     }
    275 
    276     /**
    277      * Gets the preset dictionary.
    278      */
    279     public byte[] getPresetDict() {
    280         return presetDict;
    281     }
    282 
    283     /**
    284      * Sets the number of literal context bits and literal position bits.
    285      * <p>
    286      * The sum of <code>lc</code> and <code>lp</code> is limited to 4.
    287      * Trying to exceed it will throw an exception. This function lets
    288      * you change both at the same time.
    289      *
    290      * @throws      UnsupportedOptionsException
    291      *                          <code>lc</code> and <code>lp</code>
    292      *                          are invalid
    293      */
    294     public void setLcLp(int lc, int lp) throws UnsupportedOptionsException {
    295         if (lc < 0 || lp < 0 || lc > LC_LP_MAX || lp > LC_LP_MAX
    296                 || lc + lp > LC_LP_MAX)
    297             throw new UnsupportedOptionsException(
    298                     "lc + lp must not exceed " + LC_LP_MAX + ": "
    299                     + lc + " + " + lp);
    300 
    301         this.lc = lc;
    302         this.lp = lp;
    303     }
    304 
    305     /**
    306      * Sets the number of literal context bits.
    307      * <p>
    308      * All bytes that cannot be encoded as matches are encoded as literals.
    309      * That is, literals are simply 8-bit bytes that are encoded one at
    310      * a time.
    311      * <p>
    312      * The literal coding makes an assumption that the highest <code>lc</code>
    313      * bits of the previous uncompressed byte correlate with the next byte.
    314      * For example, in typical English text, an upper-case letter is often
    315      * followed by a lower-case letter, and a lower-case letter is usually
    316      * followed by another lower-case letter. In the US-ASCII character set,
    317      * the highest three bits are 010 for upper-case letters and 011 for
    318      * lower-case letters. When <code>lc</code> is at least 3, the literal
    319      * coding can take advantage of this property in the  uncompressed data.
    320      * <p>
    321      * The default value (3) is usually good. If you want maximum compression,
    322      * try <code>setLc(4)</code>. Sometimes it helps a little, and sometimes it
    323      * makes compression worse. If it makes it worse, test for example
    324      * <code>setLc(2)</code> too.
    325      *
    326      * @throws      UnsupportedOptionsException
    327      *                          <code>lc</code> is invalid, or the sum
    328      *                          of <code>lc</code> and <code>lp</code>
    329      *                          exceed LC_LP_MAX
    330      */
    331     public void setLc(int lc) throws UnsupportedOptionsException {
    332         setLcLp(lc, lp);
    333     }
    334 
    335     /**
    336      * Sets the number of literal position bits.
    337      * <p>
    338      * This affets what kind of alignment in the uncompressed data is
    339      * assumed when encoding literals. See {@link #setPb(int) setPb} for
    340      * more information about alignment.
    341      *
    342      * @throws      UnsupportedOptionsException
    343      *                          <code>lp</code> is invalid, or the sum
    344      *                          of <code>lc</code> and <code>lp</code>
    345      *                          exceed LC_LP_MAX
    346      */
    347     public void setLp(int lp) throws UnsupportedOptionsException {
    348         setLcLp(lc, lp);
    349     }
    350 
    351     /**
    352      * Gets the number of literal context bits.
    353      */
    354     public int getLc() {
    355         return lc;
    356     }
    357 
    358     /**
    359      * Gets the number of literal position bits.
    360      */
    361     public int getLp() {
    362         return lp;
    363     }
    364 
    365     /**
    366      * Sets the number of position bits.
    367      * <p>
    368      * This affects what kind of alignment in the uncompressed data is
    369      * assumed in general. The default (2) means four-byte alignment
    370      * (2^<code>pb</code> = 2^2 = 4), which is often a good choice when
    371      * there's no better guess.
    372      * <p>
    373      * When the alignment is known, setting the number of position bits
    374      * accordingly may reduce the file size a little. For example with text
    375      * files having one-byte alignment (US-ASCII, ISO-8859-*, UTF-8), using
    376      * <code>setPb(0)</code> can improve compression slightly. For UTF-16
    377      * text, <code>setPb(1)</code> is a good choice. If the alignment is
    378      * an odd number like 3 bytes, <code>setPb(0)</code> might be the best
    379      * choice.
    380      * <p>
    381      * Even though the assumed alignment can be adjusted with
    382      * <code>setPb</code> and <code>setLp</code>, LZMA2 still slightly favors
    383      * 16-byte alignment. It might be worth taking into account when designing
    384      * file formats that are likely to be often compressed with LZMA2.
    385      *
    386      * @throws      UnsupportedOptionsException
    387      *                          <code>pb</code> is invalid
    388      */
    389     public void setPb(int pb) throws UnsupportedOptionsException {
    390         if (pb < 0 || pb > PB_MAX)
    391             throw new UnsupportedOptionsException(
    392                     "pb must not exceed " + PB_MAX + ": " + pb);
    393 
    394         this.pb = pb;
    395     }
    396 
    397     /**
    398      * Gets the number of position bits.
    399      */
    400     public int getPb() {
    401         return pb;
    402     }
    403 
    404     /**
    405      * Sets the compression mode.
    406      * <p>
    407      * This specifies the method to analyze the data produced by
    408      * a match finder. The default is <code>MODE_FAST</code> for presets
    409      * 0-3 and <code>MODE_NORMAL</code> for presets 4-9.
    410      * <p>
    411      * Usually <code>MODE_FAST</code> is used with Hash Chain match finders
    412      * and <code>MODE_NORMAL</code> with Binary Tree match finders. This is
    413      * also what the presets do.
    414      * <p>
    415      * The special mode <code>MODE_UNCOMPRESSED</code> doesn't try to
    416      * compress the data at all (and doesn't use a match finder) and will
    417      * simply wrap it in uncompressed LZMA2 chunks.
    418      *
    419      * @throws      UnsupportedOptionsException
    420      *                          <code>mode</code> is not supported
    421      */
    422     public void setMode(int mode) throws UnsupportedOptionsException {
    423         if (mode < MODE_UNCOMPRESSED || mode > MODE_NORMAL)
    424             throw new UnsupportedOptionsException(
    425                     "Unsupported compression mode: " + mode);
    426 
    427         this.mode = mode;
    428     }
    429 
    430     /**
    431      * Gets the compression mode.
    432      */
    433     public int getMode() {
    434         return mode;
    435     }
    436 
    437     /**
    438      * Sets the nice length of matches.
    439      * Once a match of at least <code>niceLen</code> bytes is found,
    440      * the algorithm stops looking for better matches. Higher values tend
    441      * to give better compression at the expense of speed. The default
    442      * depends on the preset.
    443      *
    444      * @throws      UnsupportedOptionsException
    445      *                          <code>niceLen</code> is invalid
    446      */
    447     public void setNiceLen(int niceLen) throws UnsupportedOptionsException {
    448         if (niceLen < NICE_LEN_MIN)
    449             throw new UnsupportedOptionsException(
    450                     "Minimum nice length of matches is "
    451                     + NICE_LEN_MIN + " bytes: " + niceLen);
    452 
    453         if (niceLen > NICE_LEN_MAX)
    454             throw new UnsupportedOptionsException(
    455                     "Maximum nice length of matches is " + NICE_LEN_MAX
    456                     + ": " + niceLen);
    457 
    458         this.niceLen = niceLen;
    459     }
    460 
    461     /**
    462      * Gets the nice length of matches.
    463      */
    464     public int getNiceLen() {
    465         return niceLen;
    466     }
    467 
    468     /**
    469      * Sets the match finder type.
    470      * <p>
    471      * Match finder has a major effect on compression speed, memory usage,
    472      * and compression ratio. Usually Hash Chain match finders are faster
    473      * than Binary Tree match finders. The default depends on the preset:
    474      * 0-3 use <code>MF_HC4</code> and 4-9 use <code>MF_BT4</code>.
    475      *
    476      * @throws      UnsupportedOptionsException
    477      *                          <code>mf</code> is not supported
    478      */
    479     public void setMatchFinder(int mf) throws UnsupportedOptionsException {
    480         if (mf != MF_HC4 && mf != MF_BT4)
    481             throw new UnsupportedOptionsException(
    482                     "Unsupported match finder: " + mf);
    483 
    484         this.mf = mf;
    485     }
    486 
    487     /**
    488      * Gets the match finder type.
    489      */
    490     public int getMatchFinder() {
    491         return mf;
    492     }
    493 
    494     /**
    495      * Sets the match finder search depth limit.
    496      * <p>
    497      * The default is a special value of <code>0</code> which indicates that
    498      * the depth limit should be automatically calculated by the selected
    499      * match finder from the nice length of matches.
    500      * <p>
    501      * Reasonable depth limit for Hash Chain match finders is 4-100 and
    502      * 16-1000 for Binary Tree match finders. Using very high values can
    503      * make the compressor extremely slow with some files. Avoid settings
    504      * higher than 1000 unless you are prepared to interrupt the compression
    505      * in case it is taking far too long.
    506      *
    507      * @throws      UnsupportedOptionsException
    508      *                          <code>depthLimit</code> is invalid
    509      */
    510     public void setDepthLimit(int depthLimit)
    511             throws UnsupportedOptionsException {
    512         if (depthLimit < 0)
    513             throw new UnsupportedOptionsException(
    514                     "Depth limit cannot be negative: " + depthLimit);
    515 
    516         this.depthLimit = depthLimit;
    517     }
    518 
    519     /**
    520      * Gets the match finder search depth limit.
    521      */
    522     public int getDepthLimit() {
    523         return depthLimit;
    524     }
    525 
    526     public int getEncoderMemoryUsage() {
    527         return (mode == MODE_UNCOMPRESSED)
    528                ? UncompressedLZMA2OutputStream.getMemoryUsage()
    529                : LZMA2OutputStream.getMemoryUsage(this);
    530     }
    531 
    532     public FinishableOutputStream getOutputStream(FinishableOutputStream out) {
    533         if (mode == MODE_UNCOMPRESSED)
    534             return new UncompressedLZMA2OutputStream(out);
    535 
    536         return new LZMA2OutputStream(out, this);
    537     }
    538 
    539     /**
    540      * Gets how much memory the LZMA2 decoder will need to decompress the data
    541      * that was encoded with these options and stored in a .xz file.
    542      * <p>
    543      * The returned value may bigger than the value returned by a direct call
    544      * to {@link LZMA2InputStream#getMemoryUsage(int)} if the dictionary size
    545      * is not 2^n or 2^n&nbsp;+&nbsp;2^(n-1) bytes. This is because the .xz
    546      * headers store the dictionary size in such a format and other values
    547      * are rounded up to the next such value. Such rounding is harmess except
    548      * it might waste some memory if an unsual dictionary size is used.
    549      * <p>
    550      * If you use raw LZMA2 streams and unusual dictioanary size, call
    551      * {@link LZMA2InputStream#getMemoryUsage} directly to get raw decoder
    552      * memory requirements.
    553      */
    554     public int getDecoderMemoryUsage() {
    555         // Round the dictionary size up to the next 2^n or 2^n + 2^(n-1).
    556         int d = dictSize - 1;
    557         d |= d >>> 2;
    558         d |= d >>> 3;
    559         d |= d >>> 4;
    560         d |= d >>> 8;
    561         d |= d >>> 16;
    562         return LZMA2InputStream.getMemoryUsage(d + 1);
    563     }
    564 
    565     public InputStream getInputStream(InputStream in) throws IOException {
    566         return new LZMA2InputStream(in, dictSize);
    567     }
    568 
    569     FilterEncoder getFilterEncoder() {
    570         return new LZMA2Encoder(this);
    571     }
    572 
    573     public Object clone() {
    574         try {
    575             return super.clone();
    576         } catch (CloneNotSupportedException e) {
    577             assert false;
    578             throw new RuntimeException();
    579         }
    580     }
    581 }
    582