open-vcdiff/src/vcdiffengine.cc

// Copyright 2006, 2008 Google Inc.
// Authors: Chandra Chereddi, Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <config.h>
#include "vcdiffengine.h"
#include <stdint.h>  // uint32_t
#include <string.h>  // memcpy
#include "blockhash.h"
#include "codetablewriter_interface.h"
#include "logging.h"
#include "rolling_hash.h"

namespace open_vcdiff {

VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
    // If dictionary_size == 0, then dictionary could be NULL.  Guard against
    // using a NULL value.
    : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
      dictionary_size_(dictionary_size),
      hashed_dictionary_(NULL) {
  if (dictionary_size > 0) {
    memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
  }
}

VCDiffEngine::~VCDiffEngine() {
  delete hashed_dictionary_;
  if (dictionary_size_ > 0) {
    delete[] dictionary_;
  }
}

bool VCDiffEngine::Init() {
  if (hashed_dictionary_) {
    LOG(DFATAL) << "Init() called twice for same VCDiffEngine object"
                << LOG_ENDL;
    return false;
  }
  hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
                                                       dictionary_size());
  if (!hashed_dictionary_) {
    LOG(DFATAL) << "Creation of dictionary hash failed" << LOG_ENDL;
    return false;
  }
  if (!RollingHash<BlockHash::kBlockSize>::Init()) {
    LOG(DFATAL) << "RollingHash initialization failed" << LOG_ENDL;
    return false;
  }
  return true;
}

// This helper function tries to find an appropriate match within
// hashed_dictionary_ for the block starting at the current target position.
// If target_hash is not NULL, this function will also look for a match
// within the previously encoded target data.
//
// If a match is found, this function will generate an ADD instruction
// for all unencoded data that precedes the match,
// and a COPY instruction for the match itself; then it returns
// the number of bytes processed by both instructions,
// which is guaranteed to be > 0.
// If no appropriate match is found, the function returns 0.
//
// The first four parameters are input parameters which are passed
// directly to BlockHash::FindBestMatch; please see that function
// for a description of their allowable values.
template<bool look_for_target_matches>
inline size_t VCDiffEngine::EncodeCopyForBestMatch(
    uint32_t hash_value,
    const char* target_candidate_start,
    const char* unencoded_target_start,
    size_t unencoded_target_size,
    const BlockHash* target_hash,
    CodeTableWriterInterface* coder) const {
  // When FindBestMatch() comes up with a match for a candidate block,
  // it will populate best_match with the size, source offset,
  // and target offset of the match.
  BlockHash::Match best_match;

  // First look for a match in the dictionary.
  hashed_dictionary_->FindBestMatch(hash_value,
                                    target_candidate_start,
                                    unencoded_target_start,
                                    unencoded_target_size,
                                    &best_match);
  // If target matching is enabled, then see if there is a better match
  // within the target data that has been encoded so far.
  if (look_for_target_matches) {
    target_hash->FindBestMatch(hash_value,
                               target_candidate_start,
                               unencoded_target_start,
                               unencoded_target_size,
                               &best_match);
  }
  if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
    return 0;
  }
  if (best_match.target_offset() > 0) {
    // Create an ADD instruction to encode all target bytes
    // from the end of the last COPY match, if any, up to
    // the beginning of this COPY match.
    coder->Add(unencoded_target_start, best_match.target_offset());
  }
  coder->Copy(best_match.source_offset(), best_match.size());
  return best_match.target_offset()  // ADD size
       + best_match.size();          // + COPY size
}

// Once the encoder loop has finished checking for matches in the target data,
// this function creates an ADD instruction to encode all target bytes
// from the end of the last COPY match, if any, through the end of
// the target data.  In the worst case, if no matches were found at all,
// this function will create one big ADD instruction
// for the entire buffer of target data.
inline void VCDiffEngine::AddUnmatchedRemainder(
    const char* unencoded_target_start,
    size_t unencoded_target_size,
    CodeTableWriterInterface* coder) const {
  if (unencoded_target_size > 0) {
    coder->Add(unencoded_target_start, unencoded_target_size);
  }
}

// This helper function tells the coder to finish the encoding and write
// the results into the output string "diff".
inline void VCDiffEngine::FinishEncoding(
    size_t target_size,
    OutputStringInterface* diff,
    CodeTableWriterInterface* coder) const {
  if (target_size != static_cast<size_t>(coder->target_length())) {
    LOG(DFATAL) << "Internal error in VCDiffEngine::Encode: "
                   "original target size (" << target_size
                << ") does not match number of bytes processed ("
                << coder->target_length() << ")" << LOG_ENDL;
  }
  coder->Output(diff);
}

template<bool look_for_target_matches>
void VCDiffEngine::EncodeInternal(const char* target_data,
                                  size_t target_size,
                                  OutputStringInterface* diff,
                                  CodeTableWriterInterface* coder) const {
  if (!hashed_dictionary_) {
    LOG(DFATAL) << "Internal error: VCDiffEngine::Encode() "
                   "called before VCDiffEngine::Init()" << LOG_ENDL;
    return;
  }
  if (target_size == 0) {
    return;  // Do nothing for empty target
  }
  // Special case for really small input
  if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
    AddUnmatchedRemainder(target_data, target_size, coder);
    FinishEncoding(target_size, diff, coder);
    return;
  }
  RollingHash<BlockHash::kBlockSize> hasher;
  BlockHash* target_hash = NULL;
  if (look_for_target_matches) {
    // Check matches against previously encoded target data
    // in this same target window, as well as against the dictionary
    target_hash = BlockHash::CreateTargetHash(target_data,
                                              target_size,
                                              dictionary_size());
    if (!target_hash) {
      LOG(DFATAL) << "Instantiation of target hash failed" << LOG_ENDL;
      return;
    }
  }
  const char* const target_end = target_data + target_size;
  const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
  // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
  // dictionary)
  const char* next_encode = target_data;
  // candidate_pos points to the start of the kBlockSize-byte block that may
  // begin a match with the dictionary or previously encoded target data.
  const char* candidate_pos = target_data;
  uint32_t hash_value = hasher.Hash(candidate_pos);
  while (1) {
    const size_t bytes_encoded =
        EncodeCopyForBestMatch<look_for_target_matches>(
            hash_value,
            candidate_pos,
            next_encode,
            (target_end - next_encode),
            target_hash,
            coder);
    if (bytes_encoded > 0) {
      next_encode += bytes_encoded;  // Advance past COPYed data
      candidate_pos = next_encode;
      if (candidate_pos > start_of_last_block) {
        break;  // Reached end of target data
      }
      // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
      // can't be used to calculate the hash value at its new position.
      hash_value = hasher.Hash(candidate_pos);
      if (look_for_target_matches) {
        // Update the target hash for the ADDed and COPYed data
        target_hash->AddAllBlocksThroughIndex(
            static_cast<int>(next_encode - target_data));
      }
    } else {
      // No match, or match is too small to be worth a COPY instruction.
      // Move to the next position in the target data.
      if ((candidate_pos + 1) > start_of_last_block) {
        break;  // Reached end of target data
      }
      if (look_for_target_matches) {
        target_hash->AddOneIndexHash(
            static_cast<int>(candidate_pos - target_data),
            hash_value);
      }
      hash_value = hasher.UpdateHash(hash_value,
                                     candidate_pos[0],
                                     candidate_pos[BlockHash::kBlockSize]);
      ++candidate_pos;
    }
  }
  AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
  FinishEncoding(target_size, diff, coder);
  delete target_hash;
}

void VCDiffEngine::Encode(const char* target_data,
                          size_t target_size,
                          bool look_for_target_matches,
                          OutputStringInterface* diff,
                          CodeTableWriterInterface* coder) const {
  if (look_for_target_matches) {
    EncodeInternal<true>(target_data, target_size, diff, coder);
  } else {
    EncodeInternal<false>(target_data, target_size, diff, coder);
  }
}

}  // namespace open_vcdiff