Home | History | Annotate | Download | only in src
      1 // Copyright 2006 Google Inc.
      2 // Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 //
     16 // Implementation of the Bentley/McIlroy algorithm for finding differences.
     17 // Bentley, McIlroy.  DCC 1999.  Data Compression Using Long Common Strings.
     18 // http://citeseer.ist.psu.edu/555557.html
     19 
     20 #ifndef OPEN_VCDIFF_BLOCKHASH_H_
     21 #define OPEN_VCDIFF_BLOCKHASH_H_
     22 
     23 #include <config.h>
     24 #include <stddef.h>  // size_t
     25 #include <stdint.h>  // uint32_t
     26 #include <vector>
     27 
     28 namespace open_vcdiff {
     29 
     30 // A generic hash table which will be used to keep track of byte runs
     31 // of size kBlockSize in both the incrementally processed target data
     32 // and the preprocessed source dictionary.
     33 //
     34 // A custom hash table implementation is used instead of the standard
     35 // hash_map template because we know that there will be exactly one
     36 // entry in the BlockHash corresponding to each kBlockSize bytes
     37 // in the source data, which makes certain optimizations possible:
     38 // * The memory for the hash table and for all hash entries can be allocated
     39 //   in one step rather than incrementally for each insert operation.
     40 // * A single integer can be used to represent both
     41 //   the index of the next hash entry in the chain
     42 //   and the position of the entry within the source data
     43 //   (== kBlockSize * block_number).  This greatly reduces the size
     44 //   of a hash entry.
     45 //
     46 class BlockHash {
     47  public:
     48   // Block size as per Bentley/McIlroy; must be a power of two.
     49   //
     50   // Using (for example) kBlockSize = 4 guarantees that no match smaller
     51   // than size 4 will be identified, that some matches having sizes
     52   // 4, 5, or 6 may be identified, and that all matches
     53   // having size 7 or greater will be identified (because any string of
     54   // 7 bytes must contain a complete aligned block of 4 bytes.)
     55   //
     56   // Increasing kBlockSize by a factor of two will halve the amount of
     57   // memory needed for the next block table, and will halve the setup time
     58   // for a new BlockHash.  However, it also doubles the minimum
     59   // match length that is guaranteed to be found in FindBestMatch(),
     60   // so that function will be less effective in finding matches.
     61   //
     62   // Computational effort in FindBestMatch (which is the inner loop of
     63   // the encoding algorithm) will be proportional to the number of
     64   // matches found, and a low value of kBlockSize will waste time
     65   // tracking down small matches.  On the other hand, if this value
     66   // is set too high, no matches will be found at all.
     67   //
     68   // It is suggested that different values of kBlockSize be tried against
     69   // a representative data set to find the best tradeoff between
     70   // memory/CPU and the effectiveness of FindBestMatch().
     71   //
     72   // If you change kBlockSize to a smaller value, please increase
     73   // kMaxMatchesToCheck accordingly.
     74   static const int kBlockSize = 16;
     75 
     76   // This class is used to store the best match found by FindBestMatch()
     77   // and return it to the caller.
     78   class Match {
     79    public:
     80     Match() : size_(0), source_offset_(-1), target_offset_(-1) { }
     81 
     82     void ReplaceIfBetterMatch(size_t candidate_size,
     83                               int candidate_source_offset,
     84                               int candidate_target_offset) {
     85       if (candidate_size > size_) {
     86         size_ = candidate_size;
     87         source_offset_ = candidate_source_offset;
     88         target_offset_ = candidate_target_offset;
     89       }
     90     }
     91 
     92     size_t size() const { return size_; }
     93     int source_offset() const { return source_offset_; }
     94     int target_offset() const { return target_offset_; }
     95 
     96    private:
     97      // The size of the best (longest) match passed to ReplaceIfBetterMatch().
     98     size_t size_;
     99 
    100     // The source offset of the match, including the starting_offset_
    101     // of the BlockHash for which the match was found.
    102     int source_offset_;
    103 
    104     // The target offset of the match.  An offset of 0 corresponds to the
    105     // data at target_start, which is an argument of FindBestMatch().
    106     int target_offset_;
    107 
    108     // Making these private avoids implicit copy constructor
    109     // & assignment operator
    110     Match(const Match&);  // NOLINT
    111     void operator=(const Match&);
    112   };
    113 
    114   // A BlockHash is created using a buffer of source data.  The hash table
    115   // will contain one entry for each kBlockSize-byte block in the
    116   // source data.
    117   //
    118   // See the comments for starting_offset_, below, for a description of
    119   // the starting_offset argument.  For a hash of source (dictionary) data,
    120   // starting_offset_ will be zero; for a hash of previously encoded
    121   // target data, starting_offset_ will be equal to the dictionary size.
    122   //
    123   BlockHash(const char* source_data, size_t source_size, int starting_offset);
    124 
    125   ~BlockHash();
    126 
    127   // Initializes the object before use.
    128   // This method must be called after constructing a BlockHash object,
    129   // and before any other method may be called.  This is because
    130   // Init() dynamically allocates hash_table_ and next_block_table_.
    131   // Returns true if initialization succeeded, or false if an error occurred,
    132   // in which case no other method except the destructor may then be used
    133   // on the object.
    134   //
    135   // If populate_hash_table is true, then AddAllBlocks() will be called
    136   // to populate the hash table.  If populate_hash_table is false, then
    137   // classes that inherit from BlockHash are expected to call AddBlock()
    138   // to incrementally populate individual blocks of data.
    139   //
    140   bool Init(bool populate_hash_table);
    141 
    142   // In the context of the open-vcdiff encoder, BlockHash is used for two
    143   // purposes: to hash the source (dictionary) data, and to hash
    144   // the previously encoded target data.  The main differences between
    145   // a dictionary BlockHash and a target BlockHash are as follows:
    146   //
    147   //   1. The best_match->source_offset() returned from FindBestMatch()
    148   //      for a target BlockHash is computed in the following manner:
    149   //      the starting offset of the first byte in the target data
    150   //      is equal to the dictionary size.  FindBestMatch() will add
    151   //      starting_offset_ to any best_match->source_offset() value it returns,
    152   //      in order to produce the correct offset value for a target BlockHash.
    153   //   2. For a dictionary BlockHash, the entire data set is hashed at once
    154   //      when Init() is called with the parameter populate_hash_table = true.
    155   //      For a target BlockHash, because the previously encoded target data
    156   //      includes only the data seen up to the current encoding position,
    157   //      the data blocks are hashed incrementally as the encoding position
    158   //      advances, using AddOneIndexHash() and AddAllBlocksThroughIndex().
    159   //
    160   // The following two factory functions can be used to create BlockHash
    161   // objects for each of these two purposes.  Each factory function calls
    162   // the object constructor and also calls Init().  If an error occurs,
    163   // NULL is returned; otherwise a valid BlockHash object is returned.
    164   // Since a dictionary BlockHash is not expected to be modified after
    165   // initialization, a const object is returned.
    166   // The caller is responsible for deleting the returned object
    167   // (using the C++ delete operator) once it is no longer needed.
    168   static const BlockHash* CreateDictionaryHash(const char* dictionary_data,
    169                                                size_t dictionary_size);
    170   static BlockHash* CreateTargetHash(const char* target_data,
    171                                      size_t target_size,
    172                                      size_t dictionary_size);
    173 
    174   // This function will be called to add blocks incrementally to the target hash
    175   // as the encoding position advances through the target data.  It will be
    176   // called for every kBlockSize-byte block in the target data, regardless
    177   // of whether the block is aligned evenly on a block boundary.  The
    178   // BlockHash will only store hash entries for the evenly-aligned blocks.
    179   //
    180   void AddOneIndexHash(int index, uint32_t hash_value) {
    181     if (index == NextIndexToAdd()) {
    182       AddBlock(hash_value);
    183     }
    184   }
    185 
    186   // Calls AddBlock() for each kBlockSize-byte block in the range
    187   // (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints.
    188   // If end_index <= the last index added (last_block_added_ * kBlockSize),
    189   // this function does nothing.
    190   //
    191   // A partial block beginning anywhere up to (end_index - 1) is also added,
    192   // unless it extends outside the end of the source data.  Like AddAllBlocks(),
    193   // this function computes the hash value for each of the blocks in question
    194   // from scratch, so it is not a good option if the hash values have already
    195   // been computed for some other purpose.
    196   //
    197   // Example: assume kBlockSize = 4, last_block_added_ = 1, and there are
    198   // 14 bytes of source data.
    199   // If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock()
    200   // only for block number 2 (at index 8).
    201   // If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call
    202   // AddBlock() at all, because block 3 (beginning at index 12) would
    203   // fall outside the range of source data.
    204   //
    205   // VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to
    206   // add a whole range of data to a target hash when a COPY instruction
    207   // is generated.
    208   void AddAllBlocksThroughIndex(int end_index);
    209 
    210   // FindBestMatch takes a position within the unencoded target data
    211   // (target_candidate_start) and the hash value of the kBlockSize bytes
    212   // beginning at that position (hash_value).  It attempts to find a matching
    213   // set of bytes within the source (== dictionary) data, expanding
    214   // the match both below and above the target block.  It cannot expand
    215   // the match outside the bounds of the source data, or below
    216   // target_start within the target data, or past
    217   // the end limit of (target_start + target_length).
    218   //
    219   // target_candidate_start is the start of the candidate block within the
    220   // target data for which a match will be sought, while
    221   // target_start (which is <= target_candidate_start)
    222   // is the start of the target data that has yet to be encoded.
    223   //
    224   // If a match is found whose size is greater than the size
    225   // of best_match, this function populates *best_match with the
    226   // size, source_offset, and target_offset of the match found.
    227   // best_match->source_offset() will contain the index of the start of the
    228   // matching source data, plus starting_offset_
    229   // (see description of starting_offset_ for details);
    230   // best_match->target_offset() will contain the offset of the match
    231   // beginning with target_start = offset 0, such that
    232   //     0 <= best_match->target_offset()
    233   //              <= (target_candidate_start - target_start);
    234   // and best_match->size() will contain the size of the match.
    235   // If no such match is found, this function leaves *best_match unmodified.
    236   //
    237   // On calling FindBestMatch(), best_match must
    238   // point to a valid Match object, and cannot be NULL.
    239   // The same Match object can be passed
    240   // when calling FindBestMatch() on a different BlockHash object
    241   // for the same candidate data block, in order to find
    242   // the best match possible across both objects.  For example:
    243   //
    244   //     open_vcdiff::BlockHash::Match best_match;
    245   //     uint32_t hash_value =
    246   //         RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start);
    247   //     bh1.FindBestMatch(hash_value,
    248   //                       target_candidate_start,
    249   //                       target_start,
    250   //                       target_length,
    251   //                       &best_match);
    252   //     bh2.FindBestMatch(hash_value,
    253   //                       target_candidate_start,
    254   //                       target_start,
    255   //                       target_length,
    256   //                       &best_match);
    257   //     if (best_size >= 0) {
    258   //       // a match was found; its size, source offset, and target offset
    259   //       // can be found in best_match
    260   //     }
    261   //
    262   // hash_value is passed as a separate parameter from target_candidate_start,
    263   // (rather than calculated within FindBestMatch) in order to take
    264   // advantage of the rolling hash, which quickly calculates the hash value
    265   // of the block starting at target_candidate_start based on
    266   // the known hash value of the block starting at (target_candidate_start - 1).
    267   // See vcdiffengine.cc for more details.
    268   //
    269   // Example:
    270   //    kBlockSize: 4
    271   //    target text: "ANDREW LLOYD WEBBER"
    272   //                 1^    5^2^         3^
    273   //    dictionary: "INSURANCE : LLOYDS OF LONDON"
    274   //                           4^
    275   //    hashed dictionary blocks:
    276   //        "INSU", "RANC", "E : ", "LLOY", "DS O", "F LON"
    277   //
    278   //    1: target_start (beginning of unencoded data)
    279   //    2: target_candidate_start (for the block "LLOY")
    280   //    3: target_length (points one byte beyond the last byte of data.)
    281   //    4: best_match->source_offset() (after calling FindBestMatch)
    282   //    5: best_match->target_offset() (after calling FindBestMatch)
    283   //
    284   //    Under these conditions, FindBestMatch will find a matching
    285   //    hashed dictionary block for "LLOY", and will extend the beginning of
    286   //    this match backwards by one byte, and the end of the match forwards
    287   //    by one byte, finding that the best match is " LLOYD"
    288   //    with best_match->source_offset() = 10
    289   //                                  (offset of " LLOYD" in the source string),
    290   //         best_match->target_offset() = 6
    291   //                                  (offset of " LLOYD" in the target string),
    292   //     and best_match->size() = 6.
    293   //
    294   void FindBestMatch(uint32_t hash_value,
    295                      const char* target_candidate_start,
    296                      const char* target_start,
    297                      size_t target_size,
    298                      Match* best_match) const;
    299 
    300  protected:
    301   // FindBestMatch() will not process more than this number
    302   // of matching hash entries.
    303   //
    304   // It is necessary to have a limit on the maximum number of matches
    305   // that will be checked in order to avoid the worst-case performance
    306   // possible if, for example, all the blocks in the dictionary have
    307   // the same hash value.  See the unit test SearchStringFindsTooManyMatches
    308   // for an example of such a case.  The encoder uses a loop in
    309   // VCDiffEngine::Encode over each target byte, containing a loop in
    310   // BlockHash::FindBestMatch over the number of matches (up to a maximum
    311   // of the number of source blocks), containing two loops that extend
    312   // the match forwards and backwards up to the number of source bytes.
    313   // Total complexity in the worst case is
    314   //     O([target size] * source_size_ * source_size_)
    315   // Placing a limit on the possible number of matches checked changes this to
    316   //     O([target size] * source_size_ * kMaxMatchesToCheck)
    317   //
    318   // In empirical testing on real HTML text, using a block size of 4,
    319   // the number of true matches per call to FindBestMatch() did not exceed 78;
    320   // with a block size of 32, the number of matches did not exceed 3.
    321   //
    322   // The expected number of true matches scales super-linearly
    323   // with the inverse of kBlockSize, but here a linear scale is used
    324   // for block sizes smaller than 32.
    325   static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 32 :
    326                                             (32 * (32 / kBlockSize));
    327 
    328   // Do not skip more than this number of non-matching hash collisions
    329   // to find the next matching entry in the hash chain.
    330   static const int kMaxProbes = 16;
    331 
    332   // Internal routine which calculates a hash table size based on kBlockSize and
    333   // the dictionary_size.  Will return a power of two if successful, or 0 if an
    334   // internal error occurs.  Some calculations (such as GetHashTableIndex())
    335   // depend on the table size being a power of two.
    336   static size_t CalcTableSize(const size_t dictionary_size);
    337 
    338   size_t GetNumberOfBlocks() const {
    339     return source_size_ / kBlockSize;
    340   }
    341 
    342   // Use the lowest-order bits of the hash value
    343   // as the index into the hash table.
    344   uint32_t GetHashTableIndex(uint32_t hash_value) const {
    345     return hash_value & hash_table_mask_;
    346   }
    347 
    348   // The index within source_data_ of the next block
    349   // for which AddBlock() should be called.
    350   int NextIndexToAdd() const {
    351     return (last_block_added_ + 1) * kBlockSize;
    352   }
    353 
    354   static inline bool TooManyMatches(int* match_counter);
    355 
    356   const char* source_data() { return source_data_; }
    357   size_t source_size() { return source_size_; }
    358 
    359   // Adds an entry to the hash table for one block of source data of length
    360   // kBlockSize, starting at source_data_[block_number * kBlockSize],
    361   // where block_number is always (last_block_added_ + 1).  That is,
    362   // AddBlock() must be called once for each block in source_data_
    363   // in increasing order.
    364   void AddBlock(uint32_t hash_value);
    365 
    366   // Calls AddBlock() for each complete kBlockSize-byte block between
    367   // source_data_ and (source_data_ + source_size_).  It is equivalent
    368   // to calling AddAllBlocksThroughIndex(source_data + source_size).
    369   // This function is called when Init(true) is invoked.
    370   void AddAllBlocks();
    371 
    372   // Returns true if the contents of the kBlockSize-byte block
    373   // beginning at block1 are identical to the contents of
    374   // the block beginning at block2; false otherwise.
    375   static bool BlockContentsMatch(const char* block1, const char* block2);
    376 
    377   // Compares each machine word of the two (possibly unaligned) blocks, rather
    378   // than each byte, thus reducing the number of test-and-branch instructions
    379   // executed.  Returns a boolean (do the blocks match?) rather than
    380   // the signed byte difference returned by memcmp.
    381   //
    382   // BlockContentsMatch will use either this function or memcmp to do its work,
    383   // depending on which is faster for a particular architecture.
    384   //
    385   // For gcc on x86-based architectures, this function has been shown to run
    386   // about twice as fast as the library function memcmp(), and between five and
    387   // nine times faster than the assembly instructions (repz and cmpsb) that gcc
    388   // uses by default for builtin memcmp.  On other architectures, or using
    389   // other compilers, this function has not shown to be faster than memcmp.
    390   static bool BlockCompareWords(const char* block1, const char* block2);
    391 
    392   // Finds the first block number within the hashed data
    393   // that represents a match for the given hash value.
    394   // Returns -1 if no match was found.
    395   //
    396   // Init() must have been called and returned true before using
    397   // FirstMatchingBlock or NextMatchingBlock.  No check is performed
    398   // for this condition; the code will crash if this condition is violated.
    399   //
    400   // The hash table is initially populated with -1 (not found) values,
    401   // so if this function is called before the hash table has been populated
    402   // using AddAllBlocks() or AddBlock(), it will simply return -1
    403   // for any value of hash_value.
    404   int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const;
    405 
    406   // Given a block number returned by FirstMatchingBlock()
    407   // or by a previous call to NextMatchingBlock(), returns
    408   // the next block number that matches the same hash value.
    409   // Returns -1 if no match was found.
    410   int NextMatchingBlock(int block_number, const char* block_ptr) const;
    411 
    412   // Inline version of FirstMatchingBlock.  This saves the cost of a function
    413   // call when this routine is called from within the module.  The external
    414   // (non-inlined) version is called only by unit tests.
    415   inline int FirstMatchingBlockInline(uint32_t hash_value,
    416                                       const char* block_ptr) const;
    417 
    418   // Walk through the hash entry chain, skipping over any false matches
    419   // (for which the lowest bits of the fingerprints match,
    420   // but the actual block data does not.)  Returns the block number of
    421   // the first true match found, or -1 if no true match was found.
    422   // If block_number is a matching block, the function will return block_number
    423   // without skipping to the next block.
    424   int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const;
    425 
    426   // Returns the number of bytes to the left of source_match_start
    427   // that match the corresponding bytes to the left of target_match_start.
    428   // Will not examine more than max_bytes bytes, which is to say that
    429   // the return value will be in the range [0, max_bytes] inclusive.
    430   static int MatchingBytesToLeft(const char* source_match_start,
    431                                  const char* target_match_start,
    432                                  int max_bytes);
    433 
    434   // Returns the number of bytes starting at source_match_end
    435   // that match the corresponding bytes starting at target_match_end.
    436   // Will not examine more than max_bytes bytes, which is to say that
    437   // the return value will be in the range [0, max_bytes] inclusive.
    438   static int MatchingBytesToRight(const char* source_match_end,
    439                                   const char* target_match_end,
    440                                   int max_bytes);
    441 
    442   // The protected functions BlockContentsMatch, FirstMatchingBlock,
    443   // NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight
    444   // should be made accessible to unit tests.
    445   friend class BlockHashTest;
    446 
    447  private:
    448   const char* const  source_data_;
    449   const size_t       source_size_;
    450 
    451   // The size of this array is determined using CalcTableSize().  It has at
    452   // least one element for each kBlockSize-byte block in the source data.
    453   // GetHashTableIndex() returns an index into this table for a given hash
    454   // value.  The value of each element of hash_table_ is the lowest block
    455   // number in the source data whose hash value would return the same value from
    456   // GetHashTableIndex(), or -1 if there is no matching block.  This value can
    457   // then be used as an index into next_block_table_ to retrieve the entire set
    458   // of matching block numbers.
    459   std::vector<int> hash_table_;
    460 
    461   // An array containing one element for each source block.  Each element is
    462   // either -1 (== not found) or the index of the next block whose hash value
    463   // would produce a matching result from GetHashTableIndex().
    464   std::vector<int> next_block_table_;
    465 
    466   // This vector has the same size as next_block_table_.  For every block number
    467   // B that is referenced in hash_table_, last_block_table_[B] will contain
    468   // the maximum block number that has the same GetHashTableIndex() value
    469   // as block B.  This number may be B itself.  For a block number B' that
    470   // is not referenced in hash_table_, the value of last_block_table_[B'] is -1.
    471   // This table is used only while populating the hash table, not while looking
    472   // up hash values in the table.  Keeping track of the last block number in the
    473   // chain allows us to construct the block chains as FIFO rather than LIFO
    474   // lists, so that the match with the lowest index is returned first.  This
    475   // should result in a more compact encoding because the VCDIFF format favors
    476   // smaller index values and repeated index values.
    477   std::vector<int> last_block_table_;
    478 
    479   // Performing a bitwise AND with hash_table_mask_ will produce a value ranging
    480   // from 0 to the number of elements in hash_table_.
    481   uint32_t hash_table_mask_;
    482 
    483   // The offset of the first byte of source data (the data at source_data_[0]).
    484   // For the purpose of computing offsets, the source data and target data
    485   // are considered to be concatenated -- not literally in a single memory
    486   // buffer, but conceptually as described in the RFC.
    487   // The first byte of the previously encoded target data
    488   // has an offset that is equal to dictionary_size, i.e., just after
    489   // the last byte of source data.
    490   // For a hash of source (dictionary) data, starting_offset_ will be zero;
    491   // for a hash of previously encoded target data, starting_offset_ will be
    492   // equal to the dictionary size.
    493   const int starting_offset_;
    494 
    495   // The last index added by AddBlock().  This determines the block number
    496   // for successive calls to AddBlock(), and is also
    497   // used to determine the starting block for AddAllBlocksThroughIndex().
    498   int last_block_added_;
    499 
    500   // Making these private avoids implicit copy constructor & assignment operator
    501   BlockHash(const BlockHash&);  // NOLINT
    502   void operator=(const BlockHash&);
    503 };
    504 
    505 }  // namespace open_vcdiff
    506 
    507 #endif  // OPEN_VCDIFF_BLOCKHASH_H_
    508