Home | History | Annotate | Download | only in courgette
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // We want to measure the similarity of two sequences of bytes as a surrogate
      6 // for measuring how well a second sequence will compress differentially to the
      7 // first sequence.
      8 
      9 #include "courgette/difference_estimator.h"
     10 
     11 #include "base/containers/hash_tables.h"
     12 
     13 namespace courgette {
     14 
     15 // Our difference measure is the number of k-tuples that occur in Subject that
     16 // don't occur in Base.
     17 const int kTupleSize = 4;
     18 
     19 namespace {
     20 
     21 COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8);
     22 
     23 size_t HashTuple(const uint8* source) {
     24   size_t hash1 = *reinterpret_cast<const uint32*>(source);
     25   size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4);
     26   size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23);
     27   return hash;
     28 }
     29 
     30 bool RegionsEqual(const Region& a, const Region& b) {
     31   if (a.length() != b.length())
     32     return false;
     33   return memcmp(a.start(), b.start(), a.length()) == 0;
     34 }
     35 
     36 }  // anonymous namepace
     37 
     38 class DifferenceEstimator::Base {
     39  public:
     40   explicit Base(const Region& region) : region_(region) { }
     41 
     42   void Init() {
     43     const uint8* start = region_.start();
     44     const uint8* end = region_.end() - (kTupleSize - 1);
     45     for (const uint8* p = start;  p < end;  ++p) {
     46       size_t hash = HashTuple(p);
     47       hashes_.insert(hash);
     48     }
     49   }
     50 
     51   const Region& region() const { return region_; }
     52 
     53  private:
     54   Region region_;
     55   base::hash_set<size_t> hashes_;
     56 
     57   friend class DifferenceEstimator;
     58   DISALLOW_COPY_AND_ASSIGN(Base);
     59 };
     60 
     61 class DifferenceEstimator::Subject {
     62  public:
     63   explicit Subject(const Region& region) : region_(region) {}
     64 
     65   const Region& region() const { return region_; }
     66 
     67  private:
     68   Region region_;
     69 
     70   DISALLOW_COPY_AND_ASSIGN(Subject);
     71 };
     72 
     73 DifferenceEstimator::DifferenceEstimator() {}
     74 
     75 DifferenceEstimator::~DifferenceEstimator() {
     76   for (size_t i = 0;  i < owned_bases_.size();  ++i)
     77     delete owned_bases_[i];
     78   for (size_t i = 0;  i < owned_subjects_.size();  ++i)
     79     delete owned_subjects_[i];
     80 }
     81 
     82 DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) {
     83   Base* base = new Base(region);
     84   base->Init();
     85   owned_bases_.push_back(base);
     86   return base;
     87 }
     88 
     89 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject(
     90     const Region& region) {
     91   Subject* subject = new Subject(region);
     92   owned_subjects_.push_back(subject);
     93   return subject;
     94 }
     95 
     96 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) {
     97   size_t mismatches = 0;
     98   const uint8* start = subject->region().start();
     99   const uint8* end = subject->region().end() - (kTupleSize - 1);
    100 
    101   const uint8* p = start;
    102   while (p < end) {
    103     size_t hash = HashTuple(p);
    104     if (base->hashes_.find(hash) == base->hashes_.end()) {
    105       ++mismatches;
    106     }
    107     p += 1;
    108   }
    109 
    110   if (mismatches == 0) {
    111     if (RegionsEqual(base->region(), subject->region()))
    112       return 0;
    113   }
    114   ++mismatches;  // Guarantee not zero.
    115   return mismatches;
    116 }
    117 
    118 }  // namespace
    119