1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // We want to measure the similarity of two sequences of bytes as a surrogate 6 // for measuring how well a second sequence will compress differentially to the 7 // first sequence. 8 9 #include "courgette/difference_estimator.h" 10 11 #include "base/containers/hash_tables.h" 12 13 namespace courgette { 14 15 // Our difference measure is the number of k-tuples that occur in Subject that 16 // don't occur in Base. 17 const int kTupleSize = 4; 18 19 namespace { 20 21 COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8); 22 23 size_t HashTuple(const uint8* source) { 24 size_t hash1 = *reinterpret_cast<const uint32*>(source); 25 size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4); 26 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); 27 return hash; 28 } 29 30 bool RegionsEqual(const Region& a, const Region& b) { 31 if (a.length() != b.length()) 32 return false; 33 return memcmp(a.start(), b.start(), a.length()) == 0; 34 } 35 36 } // anonymous namepace 37 38 class DifferenceEstimator::Base { 39 public: 40 explicit Base(const Region& region) : region_(region) { } 41 42 void Init() { 43 const uint8* start = region_.start(); 44 const uint8* end = region_.end() - (kTupleSize - 1); 45 for (const uint8* p = start; p < end; ++p) { 46 size_t hash = HashTuple(p); 47 hashes_.insert(hash); 48 } 49 } 50 51 const Region& region() const { return region_; } 52 53 private: 54 Region region_; 55 base::hash_set<size_t> hashes_; 56 57 friend class DifferenceEstimator; 58 DISALLOW_COPY_AND_ASSIGN(Base); 59 }; 60 61 class DifferenceEstimator::Subject { 62 public: 63 explicit Subject(const Region& region) : region_(region) {} 64 65 const Region& region() const { return region_; } 66 67 private: 68 Region region_; 69 70 DISALLOW_COPY_AND_ASSIGN(Subject); 71 }; 72 73 DifferenceEstimator::DifferenceEstimator() {} 74 75 DifferenceEstimator::~DifferenceEstimator() { 76 for (size_t i = 0; i < owned_bases_.size(); ++i) 77 delete owned_bases_[i]; 78 for (size_t i = 0; i < owned_subjects_.size(); ++i) 79 delete owned_subjects_[i]; 80 } 81 82 DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) { 83 Base* base = new Base(region); 84 base->Init(); 85 owned_bases_.push_back(base); 86 return base; 87 } 88 89 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( 90 const Region& region) { 91 Subject* subject = new Subject(region); 92 owned_subjects_.push_back(subject); 93 return subject; 94 } 95 96 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { 97 size_t mismatches = 0; 98 const uint8* start = subject->region().start(); 99 const uint8* end = subject->region().end() - (kTupleSize - 1); 100 101 const uint8* p = start; 102 while (p < end) { 103 size_t hash = HashTuple(p); 104 if (base->hashes_.find(hash) == base->hashes_.end()) { 105 ++mismatches; 106 } 107 p += 1; 108 } 109 110 if (mismatches == 0) { 111 if (RegionsEqual(base->region(), subject->region())) 112 return 0; 113 } 114 ++mismatches; // Guarantee not zero. 115 return mismatches; 116 } 117 118 } // namespace 119