/*
 * LZMA2Options
 *
 * Author: Lasse Collin <lasse.collin (at) tukaani.org>
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 */

package org.tukaani.xz;

import java.io.InputStream;
import java.io.IOException;
import org.tukaani.xz.lz.LZEncoder;
import org.tukaani.xz.lzma.LZMAEncoder;

/**
 * LZMA2 compression options.
 * <p>
 * While this allows setting the LZMA2 compression options in detail,
 * often you only need <code>LZMA2Options()</code> or
 * <code>LZMA2Options(int)</code>.
 */
public class LZMA2Options extends FilterOptions {
    /**
     * Minimum valid compression preset level is 0.
     */
    public static final int PRESET_MIN = 0;

    /**
     * Maximum valid compression preset level is 9.
     */
    public static final int PRESET_MAX = 9;

    /**
     * Default compression preset level is 6.
     */
    public static final int PRESET_DEFAULT = 6;

    /**
     * Minimum dictionary size is 4 KiB.
     */
    public static final int DICT_SIZE_MIN = 4096;

    /**
     * Maximum dictionary size for compression is 768 MiB.
     * <p>
     * The decompressor supports bigger dictionaries, up to almost 2 GiB.
     * With HC4 the encoder would support dictionaries bigger than 768 MiB.
     * The 768 MiB limit comes from the current implementation of BT4 where
     * we would otherwise hit the limits of signed ints in array indexing.
     * <p>
     * If you really need a bigger dictionary for decompression,
     * use {@link LZMA2InputStream} directly.
     */
    public static final int DICT_SIZE_MAX = 768 << 20;

    /**
     * The default dictionary size is 8 MiB.
     */
    public static final int DICT_SIZE_DEFAULT = 8 << 20;

    /**
     * Maximum value for lc + lp is 4.
     */
    public static final int LC_LP_MAX = 4;

    /**
     * The default number of literal context bits is 3.
     */
    public static final int LC_DEFAULT = 3;

    /**
     * The default number of literal position bits is 0.
     */
    public static final int LP_DEFAULT = 0;

    /**
     * Maximum value for pb is 4.
     */
    public static final int PB_MAX = 4;

    /**
     * The default number of position bits is 2.
     */
    public static final int PB_DEFAULT = 2;

    /**
     * Compression mode: uncompressed.
     * The data is wrapped into a LZMA2 stream without compression.
     */
    public static final int MODE_UNCOMPRESSED = 0;

    /**
     * Compression mode: fast.
     * This is usually combined with a hash chain match finder.
     */
    public static final int MODE_FAST = LZMAEncoder.MODE_FAST;

    /**
     * Compression mode: normal.
     * This is usually combined with a binary tree match finder.
     */
    public static final int MODE_NORMAL = LZMAEncoder.MODE_NORMAL;

    /**
     * Minimum value for <code>niceLen</code> is 8.
     */
    public static final int NICE_LEN_MIN = 8;

    /**
     * Maximum value for <code>niceLen</code> is 273.
     */
    public static final int NICE_LEN_MAX = 273;

    /**
     * Match finder: Hash Chain 2-3-4
     */
    public static final int MF_HC4 = LZEncoder.MF_HC4;

    /**
     * Match finder: Binary tree 2-3-4
     */
    public static final int MF_BT4 = LZEncoder.MF_BT4;

    // Dictionary size in bytes for each preset; indexed by preset level
    // (PRESET_MIN..PRESET_MAX).
    private static final int[] presetToDictSize = {
            1 << 18, 1 << 20, 1 << 21, 1 << 22, 1 << 22,
            1 << 23, 1 << 23, 1 << 24, 1 << 25, 1 << 26 };

    // Match finder depth limit for the fast presets only; indexed by
    // preset level 0-3. Presets 4-9 use depth limit 0 instead.
    private static final int[] presetToDepthLimit = { 4, 8, 24, 48 };

    private int dictSize;
    private byte[] presetDict = null;
    private int lc;
    private int lp;
    private int pb;
    private int mode;
    private int niceLen;
    private int mf;
    private int depthLimit;

    /**
     * Creates new LZMA2 options and sets them to the default values.
     * This is equivalent to <code>LZMA2Options(PRESET_DEFAULT)</code>.
     */
    public LZMA2Options() {
        try {
            setPreset(PRESET_DEFAULT);
        } catch (UnsupportedOptionsException e) {
            // PRESET_DEFAULT is within [PRESET_MIN, PRESET_MAX], so this
            // is not expected to happen. Keep the cause for diagnostics
            // in case it ever does.
            assert false;
            throw new RuntimeException(e);
        }
    }

    /**
     * Creates new LZMA2 options and sets them to the given preset.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>preset</code> is not supported
     */
    public LZMA2Options(int preset) throws UnsupportedOptionsException {
        setPreset(preset);
    }

    /**
     * Creates new LZMA2 options and sets them to the given custom values.
     * <p>
     * Each value is validated by the corresponding setter. No preset
     * dictionary is set by this constructor.
     *
     * @throws      UnsupportedOptionsException
     *                          unsupported options were specified
     */
    public LZMA2Options(int dictSize, int lc, int lp, int pb, int mode,
                        int niceLen, int mf, int depthLimit)
            throws UnsupportedOptionsException {
        setDictSize(dictSize);
        setLcLp(lc, lp);
        setPb(pb);
        setMode(mode);
        setNiceLen(niceLen);
        setMatchFinder(mf);
        setDepthLimit(depthLimit);
    }

    /**
     * Sets the compression options to the given preset.
     * <p>
     * The presets 0-3 are fast presets with medium compression.
     * The presets 4-6 are fairly slow presets with high compression.
     * The default preset (<code>PRESET_DEFAULT</code>) is 6.
     * <p>
     * The presets 7-9 are like the preset 6 but use bigger dictionaries
     * and have higher compressor and decompressor memory requirements.
     * Unless the uncompressed size of the file exceeds 8 MiB,
     * 16 MiB, or 32 MiB, it is a waste of memory to use the
     * presets 7, 8, or 9, respectively.
     * <p>
     * This overwrites lc, lp, pb, the dictionary size, the mode, the match
     * finder, the nice length, and the depth limit. The preset dictionary
     * is left untouched.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>preset</code> is not supported
     */
    public void setPreset(int preset) throws UnsupportedOptionsException {
        if (preset < PRESET_MIN || preset > PRESET_MAX)
            throw new UnsupportedOptionsException(
                    "Unsupported preset: " + preset);

        lc = LC_DEFAULT;
        lp = LP_DEFAULT;
        pb = PB_DEFAULT;
        dictSize = presetToDictSize[preset];

        if (preset <= 3) {
            mode = MODE_FAST;
            mf = MF_HC4;
            niceLen = preset <= 1 ? 128 : NICE_LEN_MAX;
            depthLimit = presetToDepthLimit[preset];
        } else {
            mode = MODE_NORMAL;
            mf = MF_BT4;
            niceLen = (preset == 4) ? 16 : (preset == 5) ? 32 : 64;
            // Zero lets the match finder pick the depth limit itself.
            depthLimit = 0;
        }
    }

    /**
     * Sets the dictionary size in bytes.
     * <p>
     * The dictionary (or history buffer) holds the most recently seen
     * uncompressed data. Bigger dictionary usually means better compression.
     * However, using a dictionary bigger than the size of the uncompressed
     * data is a waste of memory.
     * <p>
     * Any value in the range [DICT_SIZE_MIN, DICT_SIZE_MAX] is valid,
     * but sizes of 2^n and 2^n&nbsp;+&nbsp;2^(n-1) bytes are somewhat
     * recommended.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>dictSize</code> is not supported
     */
    public void setDictSize(int dictSize) throws UnsupportedOptionsException {
        if (dictSize < DICT_SIZE_MIN)
            throw new UnsupportedOptionsException(
                    "LZMA2 dictionary size must be at least 4 KiB: "
                    + dictSize + " B");

        if (dictSize > DICT_SIZE_MAX)
            throw new UnsupportedOptionsException(
                    "LZMA2 dictionary size must not exceed "
                    + (DICT_SIZE_MAX >> 20) + " MiB: " + dictSize + " B");

        this.dictSize = dictSize;
    }

    /**
     * Gets the dictionary size in bytes.
     */
    public int getDictSize() {
        return dictSize;
    }

    /**
     * Sets a preset dictionary. Use null to disable the use of
     * a preset dictionary. By default there is no preset dictionary.
     * <p>
     * <b>The .xz format doesn't support a preset dictionary for now.
     * Do not set a preset dictionary unless you use raw LZMA2.</b>
     * <p>
     * Preset dictionary can be useful when compressing many similar,
     * relatively small chunks of data independently from each other.
     * A preset dictionary should contain typical strings that occur in
     * the files being compressed. The most probable strings should be
     * near the end of the preset dictionary. The preset dictionary used
     * for compression is also needed for decompression.
     * <p>
     * The array reference is stored as is; no copy is made.
     */
    public void setPresetDict(byte[] presetDict) {
        this.presetDict = presetDict;
    }

    /**
     * Gets the preset dictionary.
     * <p>
     * This returns the stored array reference (or null), not a copy.
     */
    public byte[] getPresetDict() {
        return presetDict;
    }

    /**
     * Sets the number of literal context bits and literal position bits.
     * <p>
     * The sum of <code>lc</code> and <code>lp</code> is limited to 4.
     * Trying to exceed it will throw an exception. This function lets
     * you change both at the same time.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>lc</code> and <code>lp</code>
     *                          are invalid
     */
    public void setLcLp(int lc, int lp) throws UnsupportedOptionsException {
        if (lc < 0 || lp < 0 || lc > LC_LP_MAX || lp > LC_LP_MAX
                || lc + lp > LC_LP_MAX)
            throw new UnsupportedOptionsException(
                    "lc + lp must not exceed " + LC_LP_MAX + ": "
                    + lc + " + " + lp);

        this.lc = lc;
        this.lp = lp;
    }

    /**
     * Sets the number of literal context bits.
     * <p>
     * All bytes that cannot be encoded as matches are encoded as literals.
     * That is, literals are simply 8-bit bytes that are encoded one at
     * a time.
     * <p>
     * The literal coding makes an assumption that the highest <code>lc</code>
     * bits of the previous uncompressed byte correlate with the next byte.
     * For example, in typical English text, an upper-case letter is often
     * followed by a lower-case letter, and a lower-case letter is usually
     * followed by another lower-case letter. In the US-ASCII character set,
     * the highest three bits are 010 for upper-case letters and 011 for
     * lower-case letters. When <code>lc</code> is at least 3, the literal
     * coding can take advantage of this property in the uncompressed data.
     * <p>
     * The default value (3) is usually good. If you want maximum compression,
     * try <code>setLc(4)</code>. Sometimes it helps a little, and sometimes it
     * makes compression worse. If it makes it worse, test for example
     * <code>setLc(2)</code> too.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>lc</code> is invalid, or the sum
     *                          of <code>lc</code> and <code>lp</code>
     *                          exceeds LC_LP_MAX
     */
    public void setLc(int lc) throws UnsupportedOptionsException {
        // The new lc is validated together with the current lp.
        setLcLp(lc, lp);
    }

    /**
     * Sets the number of literal position bits.
     * <p>
     * This affects what kind of alignment in the uncompressed data is
     * assumed when encoding literals. See {@link #setPb(int) setPb} for
     * more information about alignment.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>lp</code> is invalid, or the sum
     *                          of <code>lc</code> and <code>lp</code>
     *                          exceeds LC_LP_MAX
     */
    public void setLp(int lp) throws UnsupportedOptionsException {
        // The new lp is validated together with the current lc.
        setLcLp(lc, lp);
    }

    /**
     * Gets the number of literal context bits.
     */
    public int getLc() {
        return lc;
    }

    /**
     * Gets the number of literal position bits.
     */
    public int getLp() {
        return lp;
    }

    /**
     * Sets the number of position bits.
     * <p>
     * This affects what kind of alignment in the uncompressed data is
     * assumed in general. The default (2) means four-byte alignment
     * (2^<code>pb</code> = 2^2 = 4), which is often a good choice when
     * there's no better guess.
     * <p>
     * When the alignment is known, setting the number of position bits
     * accordingly may reduce the file size a little. For example with text
     * files having one-byte alignment (US-ASCII, ISO-8859-*, UTF-8), using
     * <code>setPb(0)</code> can improve compression slightly. For UTF-16
     * text, <code>setPb(1)</code> is a good choice. If the alignment is
     * an odd number like 3 bytes, <code>setPb(0)</code> might be the best
     * choice.
     * <p>
     * Even though the assumed alignment can be adjusted with
     * <code>setPb</code> and <code>setLp</code>, LZMA2 still slightly favors
     * 16-byte alignment. It might be worth taking into account when designing
     * file formats that are likely to be often compressed with LZMA2.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>pb</code> is invalid
     */
    public void setPb(int pb) throws UnsupportedOptionsException {
        if (pb < 0 || pb > PB_MAX)
            throw new UnsupportedOptionsException(
                    "pb must not exceed " + PB_MAX + ": " + pb);

        this.pb = pb;
    }

    /**
     * Gets the number of position bits.
     */
    public int getPb() {
        return pb;
    }

    /**
     * Sets the compression mode.
     * <p>
     * This specifies the method to analyze the data produced by
     * a match finder. The default is <code>MODE_FAST</code> for presets
     * 0-3 and <code>MODE_NORMAL</code> for presets 4-9.
     * <p>
     * Usually <code>MODE_FAST</code> is used with Hash Chain match finders
     * and <code>MODE_NORMAL</code> with Binary Tree match finders. This is
     * also what the presets do.
     * <p>
     * The special mode <code>MODE_UNCOMPRESSED</code> doesn't try to
     * compress the data at all (and doesn't use a match finder) and will
     * simply wrap it in uncompressed LZMA2 chunks.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>mode</code> is not supported
     */
    public void setMode(int mode) throws UnsupportedOptionsException {
        if (mode < MODE_UNCOMPRESSED || mode > MODE_NORMAL)
            throw new UnsupportedOptionsException(
                    "Unsupported compression mode: " + mode);

        this.mode = mode;
    }

    /**
     * Gets the compression mode.
     */
    public int getMode() {
        return mode;
    }

    /**
     * Sets the nice length of matches.
     * Once a match of at least <code>niceLen</code> bytes is found,
     * the algorithm stops looking for better matches. Higher values tend
     * to give better compression at the expense of speed. The default
     * depends on the preset.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>niceLen</code> is invalid
     */
    public void setNiceLen(int niceLen) throws UnsupportedOptionsException {
        if (niceLen < NICE_LEN_MIN)
            throw new UnsupportedOptionsException(
                    "Minimum nice length of matches is "
                    + NICE_LEN_MIN + " bytes: " + niceLen);

        if (niceLen > NICE_LEN_MAX)
            throw new UnsupportedOptionsException(
                    "Maximum nice length of matches is " + NICE_LEN_MAX
                    + ": " + niceLen);

        this.niceLen = niceLen;
    }

    /**
     * Gets the nice length of matches.
     */
    public int getNiceLen() {
        return niceLen;
    }

    /**
     * Sets the match finder type.
     * <p>
     * Match finder has a major effect on compression speed, memory usage,
     * and compression ratio. Usually Hash Chain match finders are faster
     * than Binary Tree match finders. The default depends on the preset:
     * 0-3 use <code>MF_HC4</code> and 4-9 use <code>MF_BT4</code>.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>mf</code> is not supported
     */
    public void setMatchFinder(int mf) throws UnsupportedOptionsException {
        if (mf != MF_HC4 && mf != MF_BT4)
            throw new UnsupportedOptionsException(
                    "Unsupported match finder: " + mf);

        this.mf = mf;
    }

    /**
     * Gets the match finder type.
     */
    public int getMatchFinder() {
        return mf;
    }

    /**
     * Sets the match finder search depth limit.
     * <p>
     * The default is a special value of <code>0</code> which indicates that
     * the depth limit should be automatically calculated by the selected
     * match finder from the nice length of matches.
     * <p>
     * Reasonable depth limit for Hash Chain match finders is 4-100 and
     * 16-1000 for Binary Tree match finders. Using very high values can
     * make the compressor extremely slow with some files. Avoid settings
     * higher than 1000 unless you are prepared to interrupt the compression
     * in case it is taking far too long.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>depthLimit</code> is invalid
     */
    public void setDepthLimit(int depthLimit)
            throws UnsupportedOptionsException {
        if (depthLimit < 0)
            throw new UnsupportedOptionsException(
                    "Depth limit cannot be negative: " + depthLimit);

        this.depthLimit = depthLimit;
    }

    /**
     * Gets the match finder search depth limit.
     */
    public int getDepthLimit() {
        return depthLimit;
    }

    /**
     * Gets the encoder memory usage for the current options.
     * <p>
     * In <code>MODE_UNCOMPRESSED</code> the value comes from
     * <code>UncompressedLZMA2OutputStream.getMemoryUsage()</code>;
     * otherwise it comes from
     * <code>LZMA2OutputStream.getMemoryUsage(this)</code>.
     */
    public int getEncoderMemoryUsage() {
        return (mode == MODE_UNCOMPRESSED)
               ? UncompressedLZMA2OutputStream.getMemoryUsage()
               : LZMA2OutputStream.getMemoryUsage(this);
    }

    /**
     * Creates an output stream that writes LZMA2 data to <code>out</code>
     * using the current options.
     * <p>
     * In <code>MODE_UNCOMPRESSED</code> an
     * <code>UncompressedLZMA2OutputStream</code> is returned; otherwise
     * an <code>LZMA2OutputStream</code> configured with these options
     * is returned.
     */
    public FinishableOutputStream getOutputStream(FinishableOutputStream out) {
        if (mode == MODE_UNCOMPRESSED)
            return new UncompressedLZMA2OutputStream(out);

        return new LZMA2OutputStream(out, this);
    }

    /**
     * Gets how much memory the LZMA2 decoder will need to decompress the data
     * that was encoded with these options and stored in a .xz file.
     * <p>
     * The returned value may be bigger than the value returned by a direct
     * call to {@link LZMA2InputStream#getMemoryUsage(int)} if the dictionary
     * size is not 2^n or 2^n&nbsp;+&nbsp;2^(n-1) bytes. This is because the
     * .xz headers store the dictionary size in such a format and other values
     * are rounded up to the next such value. Such rounding is harmless except
     * it might waste some memory if an unusual dictionary size is used.
     * <p>
     * If you use raw LZMA2 streams and an unusual dictionary size, call
     * {@link LZMA2InputStream#getMemoryUsage} directly to get raw decoder
     * memory requirements.
     */
    public int getDecoderMemoryUsage() {
        // Round the dictionary size up to the next 2^n or 2^n + 2^(n-1).
        int d = dictSize - 1;
        d |= d >>> 2;
        d |= d >>> 3;
        d |= d >>> 4;
        d |= d >>> 8;
        d |= d >>> 16;
        return LZMA2InputStream.getMemoryUsage(d + 1);
    }

    /**
     * Creates an <code>LZMA2InputStream</code> that reads from
     * <code>in</code> using the current dictionary size.
     * <p>
     * Only the dictionary size is passed to the decoder here; the preset
     * dictionary, if any, is not.
     */
    public InputStream getInputStream(InputStream in) throws IOException {
        return new LZMA2InputStream(in, dictSize);
    }

    /**
     * Creates a new <code>LZMA2Encoder</code> from these options.
     */
    FilterEncoder getFilterEncoder() {
        return new LZMA2Encoder(this);
    }

    /**
     * Returns a copy of these options as produced by
     * <code>super.clone()</code>.
     * <p>
     * This method does not itself copy the preset dictionary array.
     */
    public Object clone() {
        try {
            return super.clone();
        } catch (CloneNotSupportedException e) {
            // Not expected to happen; keep the cause for diagnostics
            // in case it ever does.
            assert false;
            throw new RuntimeException(e);
        }
    }
}