Home | History | Annotate | Download | only in lz77support
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one
      3  * or more contributor license agreements.  See the NOTICE file
      4  * distributed with this work for additional information
      5  * regarding copyright ownership.  The ASF licenses this file
      6  * to you under the Apache License, Version 2.0 (the
      7  * "License"); you may not use this file except in compliance
      8  * with the License.  You may obtain a copy of the License at
      9  *
     10  * http://www.apache.org/licenses/LICENSE-2.0
     11  *
     12  * Unless required by applicable law or agreed to in writing,
     13  * software distributed under the License is distributed on an
     14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
     15  * KIND, either express or implied.  See the License for the
     16  * specific language governing permissions and limitations
     17  * under the License.
     18  */
     19 package org.apache.commons.compress.compressors.lz77support;
     20 
     21 import java.io.IOException;
     22 import java.util.Arrays;
     23 
     24 /**
     25  * Helper class for compression algorithms that use the ideas of LZ77.
     26  *
     27  * <p>Most LZ77 derived algorithms split input data into blocks of
     28  * uncompressed data (called literal blocks) and back-references
     29  * (pairs of offsets and lengths) that state "add <code>length</code>
     30  * bytes that are the same as those already written starting
     31  * <code>offset</code> bytes before the current position. The details
     32  * of how those blocks and back-references are encoded are quite
     33  * different between the algorithms and some algorithms perform
     34  * additional steps (Huffman encoding in the case of DEFLATE for
     35  * example).</p>
     36  *
     37  * <p>This class attempts to extract the core logic - finding
     38  * back-references - so it can be re-used. It follows the algorithm
     39  * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't
     40  * implement the "lazy match" optimization. The three-byte hash
     41  * function used in this class is the same as the one used by zlib and
     42  * InfoZIP's ZIP implementation of DEFLATE. The whole class is
     43  * strongly inspired by InfoZIP's implementation.</p>
     44  *
     45  * <p>LZ77 is used vaguely here (as well as many other places that
     46  * talk about it :-), LZSS would likely be closer to the truth but
     47  * LZ77 has become the synonym for a whole family of algorithms.</p>
     48  *
     49  * <p>The API consists of a compressor that is fed <code>byte</code>s
     50  * and emits {@link Block}s to a registered callback where the blocks
     51  * represent either {@link LiteralBlock literal blocks}, {@link
     52  * BackReference back-references} or {@link EOD end of data
     53  * markers}. In order to ensure the callback receives all information,
     54  * the {@code #finish} method must be used once all data has been fed
     55  * into the compressor.</p>
     56  *
     57  * <p>Several parameters influence the outcome of the "compression":</p>
     58  * <dl>
     59  *
     60  *  <dt><code>windowSize</code></dt> <dd>the size of the sliding
     61  *  window, must be a power of two - this determines the maximum
     62  *  offset a back-reference can take. The compressor maintains a
     63  *  buffer of twice of <code>windowSize</code> - real world values are
     64  *  in the area of 32k.</dd>
     65  *
     66  *  <dt><code>minBackReferenceLength</code></dt>
     67  *  <dd>Minimal length of a back-reference found. A true minimum of 3 is
     68  *  hard-coded inside of this implemention but bigger lengths can be
     69  *  configured.</dd>
     70  *
     71  *  <dt><code>maxBackReferenceLength</code></dt>
     72  *  <dd>Maximal length of a back-reference found.</dd>
     73  *
     74  *  <dt><code>maxOffset</code></dt>
     75  *  <dd>Maximal offset of a back-reference.</dd>
     76  *
     77  *  <dt><code>maxLiteralLength</code></dt>
     78  *  <dd>Maximal length of a literal block.</dd>
     79  * </dl>
     80  *
     81  * @see "https://tools.ietf.org/html/rfc1951#section-4"
     82  * @since 1.14
     83  * @NotThreadSafe
     84  */
     85 public class LZ77Compressor {
     86 
     87     /**
     88      * Base class representing blocks the compressor may emit.
     89      *
     90      * <p>This class is not supposed to be subclassed by classes
     91      * outside of Commons Compress so it is considered internal and
     92      * changed that would break subclasses may get introduced with
     93      * future releases.</p>
     94      */
     95     public static abstract class Block {
     96         /** Enumeration of the block types the compressor may emit. */
     97         public enum BlockType {
     98             LITERAL, BACK_REFERENCE, EOD
     99         }
    100         public abstract BlockType getType();
    101     }
    102 
    103     /**
    104      * Represents a literal block of data.
    105      *
    106      * <p>For performance reasons this encapsulates the real data, not
    107      * a copy of it. Don't modify the data and process it inside of
    108      * {@link Callback#accept} immediately as it will get overwritten
    109      * sooner or later.</p>
    110      */
    111     public static final class LiteralBlock extends Block {
    112         private final byte[] data;
    113         private final int offset, length;
    114         public LiteralBlock(byte[] data, int offset, int length) {
    115             this.data = data;
    116             this.offset = offset;
    117             this.length = length;
    118         }
    119         /**
    120          * The literal data.
    121          *
    122          * <p>This returns a life view of the actual data in order to
    123          * avoid copying, modify the array at your own risk.</p>
    124          * @return the data
    125          */
    126         public byte[] getData() {
    127             return data;
    128         }
    129         /**
    130          * Offset into data where the literal block starts.
    131          * @return the offset
    132          */
    133         public int getOffset() {
    134             return offset;
    135         }
    136         /**
    137          * Length of literal block.
    138          * @return the length
    139          */
    140         public int getLength() {
    141             return length;
    142         }
    143         @Override
    144         public BlockType getType() {
    145             return BlockType.LITERAL;
    146         }
    147         @Override
    148         public String toString() {
    149             return "LiteralBlock starting at " + offset + " with length " + length;
    150         }
    151     }
    152 
    153     /**
    154      * Represents a back-reference.
    155      */
    156     public static final class BackReference extends Block {
    157         private final int offset, length;
    158         public BackReference(int offset, int length) {
    159             this.offset = offset;
    160             this.length = length;
    161         }
    162         /**
    163          * Provides the offset of the back-reference.
    164          * @return the offset
    165          */
    166         public int getOffset() {
    167             return offset;
    168         }
    169         /**
    170          * Provides the length of the back-reference.
    171          * @return the length
    172          */
    173         public int getLength() {
    174             return length;
    175         }
    176         @Override
    177         public BlockType getType() {
    178             return BlockType.BACK_REFERENCE;
    179         }
    180         @Override
    181         public String toString() {
    182             return "BackReference with offset " + offset + " and length " + length;
    183         }
    184     }
    185 
    186     /** A simple "we are done" marker. */
    187     public static final class EOD extends Block {
    188         @Override
    189         public BlockType getType() {
    190             return BlockType.EOD;
    191         }
    192     }
    193 
    194     private static final Block THE_EOD = new EOD();
    195 
    196     /**
    197      * Callback invoked while the compressor processes data.
    198      *
    199      * <p>The callback is invoked on the same thread that receives the
    200      * bytes to compress and may be invoked multiple times during the
    201      * execution of {@link #compress} or {@link #finish}.</p>
    202      */
    203     public interface Callback {
    204         /**
    205          * Consumes a block.
    206          * @param b the block to consume
    207          * @throws IOException in case of an error
    208          */
    209         void accept(Block b) throws IOException;
    210     }
    211 
    212     static final int NUMBER_OF_BYTES_IN_HASH = 3;
    213     private static final int NO_MATCH = -1;
    214 
    215     private final Parameters params;
    216     private final Callback callback;
    217 
    218     // the sliding window, twice as big as "windowSize" parameter
    219     private final byte[] window;
    220     // the head of hash-chain - indexed by hash-code, points to the
    221     // location inside of window of the latest sequence of bytes with
    222     // the given hash.
    223     private final int[] head;
    224     // for each window-location points to the latest earlier location
    225     // with the same hash. Only stores values for the latest
    226     // "windowSize" elements, the index is "window location modulo
    227     // windowSize".
    228     private final int[] prev;
    229 
    230     // bit mask used when indexing into prev
    231     private final int wMask;
    232 
    233     private boolean initialized = false;
    234     // the position inside of window that shall be encoded right now
    235     private int currentPosition;
    236     // the number of bytes available to compress including the one at
    237     // currentPosition
    238     private int lookahead = 0;
    239     // the hash of the three bytes stating at the current position
    240     private int insertHash = 0;
    241     // the position inside of the window where the current literal
    242     // block starts (in case we are inside of a literal block).
    243     private int blockStart = 0;
    244     // position of the current match
    245     private int matchStart = NO_MATCH;
    246     // number of missed insertString calls for the up to three last
    247     // bytes of the last match that can only be performed once more
    248     // data has been read
    249     private int missedInserts = 0;
    250 
    251     /**
    252      * Initializes a compressor with parameters and a callback.
    253      * @param params the parameters
    254      * @param callback the callback
    255      * @throws NullPointerException if either parameter is <code>null</code>
    256      */
    257     public LZ77Compressor(Parameters params, Callback callback) {
    258         if (params == null) {
    259             throw new NullPointerException("params must not be null");
    260         }
    261         if (callback == null) {
    262             throw new NullPointerException("callback must not be null");
    263         }
    264         this.params = params;
    265         this.callback = callback;
    266 
    267         final int wSize = params.getWindowSize();
    268         window = new byte[wSize * 2];
    269         wMask = wSize - 1;
    270         head = new int[HASH_SIZE];
    271         Arrays.fill(head, NO_MATCH);
    272         prev = new int[wSize];
    273     }
    274 
    275     /**
    276      * Feeds bytes into the compressor which in turn may emit zero or
    277      * more blocks to the callback during the execution of this
    278      * method.
    279      * @param data the data to compress - must not be null
    280      * @throws IOException if the callback throws an exception
    281      */
    282     public void compress(byte[] data) throws IOException {
    283         compress(data, 0, data.length);
    284     }
    285 
    286     /**
    287      * Feeds bytes into the compressor which in turn may emit zero or
    288      * more blocks to the callback during the execution of this
    289      * method.
    290      * @param data the data to compress - must not be null
    291      * @param off the start offset of the data
    292      * @param len the number of bytes to compress
    293      * @throws IOException if the callback throws an exception
    294      */
    295     public void compress(byte[] data, int off, int len) throws IOException {
    296         final int wSize = params.getWindowSize();
    297         while (len > wSize) { // chop into windowSize sized chunks
    298             doCompress(data, off, wSize);
    299             off += wSize;
    300             len -= wSize;
    301         }
    302         if (len > 0) {
    303             doCompress(data, off, len);
    304         }
    305     }
    306 
    307     /**
    308      * Tells the compressor to process all remaining data and signal
    309      * end of data to the callback.
    310      *
    311      * <p>The compressor will in turn emit at least one block ({@link
    312      * EOD}) but potentially multiple blocks to the callback during
    313      * the execution of this method.</p>
    314      * @throws IOException if the callback throws an exception
    315      */
    316     public void finish() throws IOException {
    317         if (blockStart != currentPosition || lookahead > 0) {
    318             currentPosition += lookahead;
    319             flushLiteralBlock();
    320         }
    321         callback.accept(THE_EOD);
    322     }
    323 
    324     /**
    325      * Adds some initial data to fill the window with.
    326      *
    327      * <p>This is used if the stream has been cut into blocks and
    328      * back-references of one block may refer to data of the previous
    329      * block(s). One such example is the LZ4 frame format using block
    330      * dependency.</p>
    331      *
    332      * @param data the data to fill the window with.
    333      * @throws IllegalStateException if the compressor has already started to accept data
    334      */
    335     public void prefill(byte[] data) {
    336         if (currentPosition != 0 || lookahead != 0) {
    337             throw new IllegalStateException("the compressor has already started to accept data, can't prefill anymore");
    338         }
    339 
    340         // don't need more than windowSize for back-references
    341         final int len = Math.min(params.getWindowSize(), data.length);
    342         System.arraycopy(data, data.length - len, window, 0, len);
    343 
    344         if (len >= NUMBER_OF_BYTES_IN_HASH) {
    345             initialize();
    346             final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
    347             for (int i = 0; i < stop; i++) {
    348                 insertString(i);
    349             }
    350             missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
    351         } else { // not enough data to hash anything
    352             missedInserts = len;
    353         }
    354         blockStart = currentPosition = len;
    355     }
    356 
    357     // we use a 15 bit hashcode as calculated in updateHash
    358     private static final int HASH_SIZE = 1 << 15;
    359     private static final int HASH_MASK = HASH_SIZE - 1;
    360     private static final int H_SHIFT = 5;
    361 
    362     /**
    363      * Assumes we are calculating the hash for three consecutive bytes
    364      * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC
    365      * the new hash for BCD is nextHash(H, D).
    366      *
    367      * <p>The hash is shifted by five bits on each update so all
    368      * effects of A have been swapped after the third update.</p>
    369      */
    370     private int nextHash(int oldHash, byte nextByte) {
    371         final int nextVal = nextByte & 0xFF;
    372         return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK;
    373     }
    374 
    375     // performs the actual algorithm with the pre-condition len <= windowSize
    376     private void doCompress(byte[] data, int off, int len) throws IOException {
    377         int spaceLeft = window.length - currentPosition - lookahead;
    378         if (len > spaceLeft) {
    379             slide();
    380         }
    381         System.arraycopy(data, off, window, currentPosition + lookahead, len);
    382         lookahead += len;
    383         if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
    384             initialize();
    385         }
    386         if (initialized) {
    387             compress();
    388         }
    389     }
    390 
    391     private void slide() throws IOException {
    392         final int wSize = params.getWindowSize();
    393         if (blockStart != currentPosition && blockStart < wSize) {
    394             flushLiteralBlock();
    395             blockStart = currentPosition;
    396         }
    397         System.arraycopy(window, wSize, window, 0, wSize);
    398         currentPosition -= wSize;
    399         matchStart -= wSize;
    400         blockStart -= wSize;
    401         for (int i = 0; i < HASH_SIZE; i++) {
    402             int h = head[i];
    403             head[i] = h >= wSize ? h - wSize : NO_MATCH;
    404         }
    405         for (int i = 0; i < wSize; i++) {
    406             int p = prev[i];
    407             prev[i] = p >= wSize ? p - wSize : NO_MATCH;
    408         }
    409     }
    410 
    411     private void initialize() {
    412         for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
    413             insertHash = nextHash(insertHash, window[i]);
    414         }
    415         initialized = true;
    416     }
    417 
    418     private void compress() throws IOException {
    419         final int minMatch = params.getMinBackReferenceLength();
    420         final boolean lazy = params.getLazyMatching();
    421         final int lazyThreshold = params.getLazyMatchingThreshold();
    422 
    423         while (lookahead >= minMatch) {
    424             catchUpMissedInserts();
    425             int matchLength = 0;
    426             int hashHead = insertString(currentPosition);
    427             if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
    428                 // sets matchStart as a side effect
    429                 matchLength = longestMatch(hashHead);
    430 
    431                 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
    432                     // try to find a longer match using the next position
    433                     matchLength = longestMatchForNextPosition(matchLength);
    434                 }
    435             }
    436             if (matchLength >= minMatch) {
    437                 if (blockStart != currentPosition) {
    438                     // emit preceeding literal block
    439                     flushLiteralBlock();
    440                     blockStart = NO_MATCH;
    441                 }
    442                 flushBackReference(matchLength);
    443                 insertStringsInMatch(matchLength);
    444                 lookahead -= matchLength;
    445                 currentPosition += matchLength;
    446                 blockStart = currentPosition;
    447             } else {
    448                 // no match, append to current or start a new literal
    449                 lookahead--;
    450                 currentPosition++;
    451                 if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
    452                     flushLiteralBlock();
    453                     blockStart = currentPosition;
    454                 }
    455             }
    456         }
    457     }
    458 
    459     /**
    460      * Inserts the current three byte sequence into the dictionary and
    461      * returns the previous head of the hash-chain.
    462      *
    463      * <p>Updates <code>insertHash</code> and <code>prev</code> as a
    464      * side effect.</p>
    465      */
    466     private int insertString(int pos) {
    467         insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
    468         int hashHead = head[insertHash];
    469         prev[pos & wMask] = hashHead;
    470         head[insertHash] = pos;
    471         return hashHead;
    472     }
    473 
    474     private int longestMatchForNextPosition(final int prevMatchLength) {
    475         // save a bunch of values to restore them if the next match isn't better than the current one
    476         final int prevMatchStart = matchStart;
    477         final int prevInsertHash = insertHash;
    478 
    479         lookahead--;
    480         currentPosition++;
    481         int hashHead = insertString(currentPosition);
    482         final int prevHashHead = prev[currentPosition & wMask];
    483         int matchLength = longestMatch(hashHead);
    484 
    485         if (matchLength <= prevMatchLength) {
    486             // use the first match, as the next one isn't any better
    487             matchLength = prevMatchLength;
    488             matchStart = prevMatchStart;
    489 
    490             // restore modified values
    491             head[insertHash] = prevHashHead;
    492             insertHash = prevInsertHash;
    493             currentPosition--;
    494             lookahead++;
    495         }
    496         return matchLength;
    497     }
    498 
    499     private void insertStringsInMatch(int matchLength) {
    500         // inserts strings contained in current match
    501         // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
    502         final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
    503         // currentPosition has been inserted already
    504         for (int i = 1; i <= stop; i++) {
    505             insertString(currentPosition + i);
    506         }
    507         missedInserts = matchLength - stop - 1;
    508     }
    509 
    510     private void catchUpMissedInserts() {
    511         while (missedInserts > 0) {
    512             insertString(currentPosition - missedInserts--);
    513         }
    514     }
    515 
    516     private void flushBackReference(int matchLength) throws IOException {
    517         callback.accept(new BackReference(currentPosition - matchStart, matchLength));
    518     }
    519 
    520     private void flushLiteralBlock() throws IOException {
    521         callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
    522     }
    523 
    524     /**
    525      * Searches the hash chain for real matches and returns the length
    526      * of the longest match (0 if none were found) that isn't too far
    527      * away (WRT maxOffset).
    528      *
    529      * <p>Sets matchStart to the index of the start position of the
    530      * longest match as a side effect.</p>
    531      */
    532     private int longestMatch(int matchHead) {
    533         final int minLength = params.getMinBackReferenceLength();
    534         int longestMatchLength = minLength - 1;
    535         final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
    536         final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
    537         final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
    538         final int maxCandidates = params.getMaxCandidates();
    539         for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
    540             int currentLength = 0;
    541             for (int i = 0; i < maxPossibleLength; i++) {
    542                 if (window[matchHead + i] != window[currentPosition + i]) {
    543                     break;
    544                 }
    545                 currentLength++;
    546             }
    547             if (currentLength > longestMatchLength) {
    548                 longestMatchLength = currentLength;
    549                 matchStart = matchHead;
    550                 if (currentLength >= niceBackReferenceLength) {
    551                     // no need to search any further
    552                     break;
    553                 }
    554             }
    555             matchHead = prev[matchHead & wMask];
    556         }
    557         return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
    558     }
    559 }
    560