Home | History | Annotate | Download | only in courgette
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "courgette/adjustment_method.h"
      6 
      7 #include <algorithm>
      8 #include <limits>
      9 #include <list>
     10 #include <map>
     11 #include <set>
     12 #include <string>
     13 #include <vector>
     14 
     15 #include "base/basictypes.h"
     16 #include "base/format_macros.h"
     17 #include "base/logging.h"
     18 #include "base/strings/stringprintf.h"
     19 #include "base/time/time.h"
     20 #include "courgette/assembly_program.h"
     21 #include "courgette/courgette.h"
     22 #include "courgette/encoded_program.h"
     23 
     24 /*
     25 
     26 Shingle weighting matching.
     27 
     28 We have a sequence S1 of symbols from alphabet A1={A,B,C,...} called the 'model'
     29 and a second sequence of S2 of symbols from alphabet A2={U,V,W,....} called the
     30 'program'.  Each symbol in A1 has a unique numerical name or index.  We can
     31 transcribe the sequence S1 to a sequence T1 of indexes of the symbols. We wish
     32 to assign indexes to the symbols in A2 so that when we transcribe S2 into T2, T2
     33 has long subsequences that occur in T1.  This will ensure that the sequence
     34 T1;T2 compresses to be only slightly larger than the compressed T1.
     35 
     36 The algorithm for matching members of S2 with members of S1 is eager - it makes
     37 matches without backtracking, until no more matches can be made.  Each variable
     38 (symbol) U,V,... in A2 has a set of candidates from A1, each candidate with a
     39 weight summarizing the evidence for the match.  We keep a VariableQueue of
     40 U,V,... sorted by how much the evidence for the best choice outweighs the
     41 evidence for the second choice, i.e. prioritized by how 'clear cut' the best
     42 assignment is.  We pick the variable with the most clear-cut candidate, make the
     43 assignment, adjust the evidence and repeat.
     44 
     45 What has not been described so far is how the evidence is gathered and
     46 maintained.  We are working under the assumption that S1 and S2 are largely
     47 similar.  (A different assumption might be that S1 and S2 are dissimilar except
     48 for many long subsequences.)
     49 
     50 A naive algorithm would consider all pairs (A,U) and for each pair assess the
     51 benefit, or score, the assignment U:=A.  The score might count the number of
     52 occurrences of U in S2 which appear in similar contexts to A in S1.
     53 
     54 To distinguish contexts we view S1 and S2 as a sequence of overlapping k-length
     55 substrings or 'shingles'.  Two shingles are compatible if the symbols in one
     56 shingle could be matched with the symbols in the other symbol.  For example, ABC
     57 is *not* compatible with UVU because it would require conflicting matches A=U
     58 and C=U.  ABC is compatible with UVW, UWV, WUV, VUW etc.  We can't tell which
     59 until we make an assignment - the compatible shingles form an equivalence class.
     60 After assigning U:=A then only UVW and UWV (equivalently AVW, AWV) are
     61 compatible.  As we make assignments the number of equivalence classes of
     62 shingles increases and the number of members of each equivalence class
     63 decreases.  The compatibility test becomes more restrictive.
     64 
     65 We gather evidence for the potential assignment U:=A by counting how many
     66 shingles containing U are compatible with shingles containing A.  Thus symbols
     67 occurring a large number of times in compatible contexts will be assigned first.
     68 
     69 Finding the 'most clear-cut' assignment by considering all pairs symbols and for
     70 each pair comparing the contexts of each pair of occurrences of the symbols is
     71 computationally infeasible.  We get the job done in a reasonable time by
     72 approaching it 'backwards' and making incremental changes as we make
     73 assignments.
     74 
     75 First the shingles are partitioned according to compatibility.  In S1=ABCDD and
     76 S2=UVWXX we have a total of 6 shingles, each occuring once. (ABC:1 BCD:1 CDD:1;
     77 UVW:1 VWX: WXX:1) all fit the pattern <V0 V1 V2> or the pattern <V0 V1 V1>.  The
     78 first pattern indicates that each position matches a different symbol, the
     79 second pattern indicates that the second symbol is repeated.
     80 
     81   pattern      S1 members      S2 members
     82   <V0 V1 V2>:  {ABC:1, BCD:1}; {UVW:1, VWX:1}
     83   <V0 V1 V1>:  {CDD:1}         {WXX:1}
     84 
     85 The second pattern appears to have a unique assignment but we don't make the
     86 assignment on such scant evidence.  If S1 and S2 do not match exactly, there
     87 will be numerous spurious low-score matches like this.  Instead we must see what
     88 assignments are indicated by considering all of the evidence.
     89 
     90 First pattern has 2 x 2 = 4 shingle pairs.  For each pair we count the number
     91 of symbol assignments.  For ABC:a * UVW:b accumulate min(a,b) to each of
     92   {U:=A, V:=B, W:=C}.
     93 After accumulating over all 2 x 2 pairs:
     94   U: {A:1  B:1}
     95   V: {A:1  B:2  C:1}
     96   W: {B:1  C:2  D:1 }
     97   X: {C:1  D:1}
     98 The second pattern contributes:
     99   W: {C:1}
    100   X: {D:2}
    101 Sum:
    102   U: {A:1  B:1}
    103   V: {A:1  B:2  C:1}
    104   W: {B:1  C:3  D:1}
    105   X: {C:1  D:3}
    106 
    107 From this we decide to assign X:=D (because this assignment has both the largest
    108 difference above the next candidate (X:=C) and this is also the largest
    109 proportionately over the sum of alternatives).
    110 
    111 Lets assume D has numerical 'name' 77.  The assignment X:=D sets X to 77 too.
    112 Next we repartition all the shingles containing X or D:
    113 
    114   pattern      S1 members      S2 members
    115   <V0 V1 V2>:  {ABC:1};        {UVW:1}
    116   <V0 V1 77>:  {BCD:1};        {VWX:1}
    117   <V0 77 77>:  {CDD:1}         {WXX:1}
    118 As we repartition, we recalculate the contributions to the scores:
    119   U: {A:1}
    120   V: {B:2}
    121   W: {C:3}
    122 All the remaining assignments are now fixed.
    123 
    124 There is one step in the incremental algorithm that is still infeasibly
    125 expensive: the contributions due to the cross product of large equivalence
    126 classes.  We settle for making an approximation by computing the contribution of
    127 the cross product of only the most common shingles.  The hope is that the noise
    128 from the long tail of uncounted shingles is well below the scores being used to
    129 pick assignments.  The second hope is that as assignment are made, the large
    130 equivalence class will be partitioned into smaller equivalence classes, reducing
    131 the noise over time.
    132 
    133 In the code below the shingles are bigger (Shingle::kWidth = 5).
    134 Class ShinglePattern holds the data for one pattern.
    135 
    136 There is an optimization for this case:
    137   <V0 V1 V1>:  {CDD:1}         {WXX:1}
    138 
    139 Above we said that we don't make an assignment on this "scant evidence".  There
    140 is an exception: if there is only one variable unassigned (more like the <V0 77
    141 77> pattern) AND there are no occurrences of C and W other than those counted in
    142 this pattern, then there is no competing evidence and we go ahead with the
    143 assignment immediately.  This produces slightly better results because these
    144 cases tend to be low-scoring and susceptible to small mistakes made in
    145 low-scoring assignments in the approximation for large equivalence classes.
    146 
    147 */
    148 
    149 namespace courgette {
    150 namespace adjustment_method_2 {
    151 
    152 ////////////////////////////////////////////////////////////////////////////////
    153 
    154 class AssignmentCandidates;
    155 class LabelInfoMaker;
    156 class Shingle;
    157 class ShinglePattern;
    158 
    159 // The purpose of adjustment is to assign indexes to Labels of a program 'p' to
    160 // make the sequence of indexes similar to a 'model' program 'm'.  Labels
    161 // themselves don't have enough information to do this job, so we work with a
    162 // LabelInfo surrogate for each label.
    163 //
    164 class LabelInfo {
    165  public:
    166   // Just a no-argument constructor and copy constructor.  Actual LabelInfo
    167   // objects are allocated in std::pair structs in a std::map.
    168   LabelInfo()
    169       : label_(NULL), is_model_(false), debug_index_(0), refs_(0),
    170         assignment_(NULL), candidates_(NULL)
    171   {}
    172 
    173   ~LabelInfo();
    174 
    175   AssignmentCandidates* candidates();
    176 
    177   Label* label_;             // The label that this info a surrogate for.
    178 
    179   uint32 is_model_ : 1;      // Is the label in the model?
    180   uint32 debug_index_ : 31;  // A small number for naming the label in debug
    181                              // output. The pair (is_model_, debug_index_) is
    182                              // unique.
    183 
    184   int refs_;                 // Number of times this Label is referenced.
    185 
    186   LabelInfo* assignment_;    // Label from other program corresponding to this.
    187 
    188   std::vector<uint32> positions_;  // Offsets into the trace of references.
    189 
    190  private:
    191   AssignmentCandidates* candidates_;
    192 
    193   void operator=(const LabelInfo*);  // Disallow assignment only.
    194   // Public compiler generated copy constructor is needed to constuct
    195   // std::pair<Label*, LabelInfo> so that fresh LabelInfos can be allocated
    196   // inside a std::map.
    197 };
    198 
    199 typedef std::vector<LabelInfo*> Trace;
    200 
    201 std::string ToString(const LabelInfo* info) {
    202   std::string s;
    203   base::StringAppendF(&s, "%c%d", "pm"[info->is_model_], info->debug_index_);
    204   if (info->label_->index_ != Label::kNoIndex)
    205     base::StringAppendF(&s, " (%d)", info->label_->index_);
    206 
    207   base::StringAppendF(&s, " #%u", info->refs_);
    208   return s;
    209 }
    210 
    211 // LabelInfoMaker maps labels to their surrogate LabelInfo objects.
    212 class LabelInfoMaker {
    213  public:
    214   LabelInfoMaker() : debug_label_index_gen_(0) {}
    215 
    216   LabelInfo* MakeLabelInfo(Label* label, bool is_model, uint32 position) {
    217     LabelInfo& slot = label_infos_[label];
    218     if (slot.label_ == NULL) {
    219       slot.label_ = label;
    220       slot.is_model_ = is_model;
    221       slot.debug_index_ = ++debug_label_index_gen_;
    222     }
    223     slot.positions_.push_back(position);
    224     ++slot.refs_;
    225     return &slot;
    226   }
    227 
    228   void ResetDebugLabel() { debug_label_index_gen_ = 0; }
    229 
    230  private:
    231   int debug_label_index_gen_;
    232 
    233   // Note LabelInfo is allocated 'flat' inside map::value_type, so the LabelInfo
    234   // lifetimes are managed by the map.
    235   std::map<Label*, LabelInfo> label_infos_;
    236 
    237   DISALLOW_COPY_AND_ASSIGN(LabelInfoMaker);
    238 };
    239 
    240 struct OrderLabelInfo {
    241   bool operator()(const LabelInfo* a, const LabelInfo* b) const {
    242     if (a->label_->rva_ < b->label_->rva_) return true;
    243     if (a->label_->rva_ > b->label_->rva_) return false;
    244     if (a == b) return false;
    245     return a->positions_ < b->positions_;  // Lexicographic ordering of vector.
    246   }
    247 };
    248 
    249 // AssignmentCandidates is a priority queue of candidate assignments to
    250 // a single program LabelInfo, |program_info_|.
    251 class AssignmentCandidates {
    252  public:
    253   explicit AssignmentCandidates(LabelInfo* program_info)
    254       : program_info_(program_info) {}
    255 
    256   LabelInfo* program_info() const { return program_info_; }
    257 
    258   bool empty() const { return label_to_score_.empty(); }
    259 
    260   LabelInfo* top_candidate() const { return queue_.begin()->second; }
    261 
    262   void Update(LabelInfo* model_info, int delta_score) {
    263     LOG_ASSERT(delta_score != 0);
    264     int old_score = 0;
    265     int new_score = 0;
    266     LabelToScore::iterator p = label_to_score_.find(model_info);
    267     if (p != label_to_score_.end()) {
    268       old_score = p->second;
    269       new_score = old_score + delta_score;
    270       queue_.erase(ScoreAndLabel(old_score, p->first));
    271       if (new_score == 0) {
    272         label_to_score_.erase(p);
    273       } else {
    274         p->second = new_score;
    275         queue_.insert(ScoreAndLabel(new_score, model_info));
    276       }
    277     } else {
    278       new_score = delta_score;
    279       label_to_score_.insert(std::make_pair(model_info, new_score));
    280       queue_.insert(ScoreAndLabel(new_score, model_info));
    281     }
    282     LOG_ASSERT(queue_.size() == label_to_score_.size());
    283   }
    284 
    285   int TopScore() const {
    286     int first_value = 0;
    287     int second_value = 0;
    288     Queue::const_iterator p = queue_.begin();
    289     if (p != queue_.end()) {
    290       first_value = p->first;
    291       ++p;
    292       if (p != queue_.end()) {
    293         second_value = p->first;
    294       }
    295     }
    296     return first_value - second_value;
    297   }
    298 
    299   bool HasPendingUpdates() { return !pending_updates_.empty(); }
    300 
    301   void AddPendingUpdate(LabelInfo* model_info, int delta_score) {
    302     LOG_ASSERT(delta_score != 0);
    303     pending_updates_[model_info] += delta_score;
    304   }
    305 
    306   void ApplyPendingUpdates() {
    307     // TODO(sra): try to walk |pending_updates_| and |label_to_score_| in
    308     // lockstep.  Try to batch updates to |queue_|.
    309     size_t zeroes = 0;
    310     for (LabelToScore::iterator p = pending_updates_.begin();
    311          p != pending_updates_.end();
    312          ++p) {
    313       if (p->second != 0)
    314         Update(p->first, p->second);
    315       else
    316         ++zeroes;
    317     }
    318     pending_updates_.clear();
    319   }
    320 
    321   void Print(int max) {
    322     VLOG(2) << "score "  << TopScore() << "  " << ToString(program_info_)
    323             << " := ?";
    324     if (!pending_updates_.empty())
    325       VLOG(2) << pending_updates_.size() << " pending";
    326     int count = 0;
    327     for (Queue::iterator q = queue_.begin();  q != queue_.end();  ++q) {
    328       if (++count > max) break;
    329       VLOG(2) << "   " << q->first << "  " << ToString(q->second);
    330     }
    331   }
    332 
    333  private:
    334   typedef std::map<LabelInfo*, int, OrderLabelInfo> LabelToScore;
    335   typedef std::pair<int, LabelInfo*> ScoreAndLabel;
    336   struct OrderScoreAndLabelByScoreDecreasing {
    337     OrderLabelInfo tie_breaker;
    338     bool operator()(const ScoreAndLabel& a, const ScoreAndLabel& b) const {
    339       if (a.first > b.first) return true;
    340       if (a.first < b.first) return false;
    341       return tie_breaker(a.second, b.second);
    342     }
    343   };
    344   typedef std::set<ScoreAndLabel, OrderScoreAndLabelByScoreDecreasing> Queue;
    345 
    346   LabelInfo* program_info_;
    347   LabelToScore label_to_score_;
    348   LabelToScore pending_updates_;
    349   Queue queue_;
    350 };
    351 
    352 AssignmentCandidates* LabelInfo::candidates() {
    353   if (candidates_ == NULL)
    354     candidates_ = new AssignmentCandidates(this);
    355   return candidates_;
    356 }
    357 
    358 LabelInfo::~LabelInfo() {
    359   delete candidates_;
    360 }
    361 
    362 // A Shingle is a short fixed-length string of LabelInfos that actually occurs
    363 // in a Trace.  A Shingle may occur many times.  We repesent the Shingle by the
    364 // position of one of the occurrences in the Trace.
    365 class Shingle {
    366  public:
    367   static const size_t kWidth = 5;
    368 
    369   struct InterningLess {
    370     bool operator()(const Shingle& a, const Shingle& b) const;
    371   };
    372 
    373   typedef std::set<Shingle, InterningLess> OwningSet;
    374 
    375   static Shingle* Find(const Trace& trace, size_t position,
    376                        OwningSet* owning_set) {
    377     std::pair<OwningSet::iterator, bool> pair =
    378         owning_set->insert(Shingle(trace, position));
    379     // pair.first iterator 'points' to the newly inserted Shingle or the
    380     // previouly inserted one that looks the same according to the comparator.
    381 
    382     // const_cast required because key is const.  We modify the Shingle
    383     // extensively but not in a way that affects InterningLess.
    384     Shingle* shingle = const_cast<Shingle*>(&*pair.first);
    385     shingle->add_position(position);
    386     return shingle;
    387   }
    388 
    389   LabelInfo* at(size_t i) const { return trace_[exemplar_position_ + i]; }
    390   void add_position(size_t position) {
    391     positions_.push_back(static_cast<uint32>(position));
    392   }
    393   int position_count() const { return static_cast<int>(positions_.size()); }
    394 
    395   bool InModel() const { return at(0)->is_model_; }
    396 
    397   ShinglePattern* pattern() const { return pattern_; }
    398   void set_pattern(ShinglePattern* pattern) { pattern_ = pattern; }
    399 
    400   struct PointerLess {
    401     bool operator()(const Shingle* a, const Shingle* b) const {
    402       // Arbitrary but repeatable (memory-address) independent ordering:
    403       return a->exemplar_position_ < b->exemplar_position_;
    404       // return InterningLess()(*a, *b);
    405     }
    406   };
    407 
    408  private:
    409   Shingle(const Trace& trace, size_t exemplar_position)
    410       : trace_(trace),
    411         exemplar_position_(exemplar_position),
    412         pattern_(NULL) {
    413   }
    414 
    415   const Trace& trace_;             // The shingle lives inside trace_.
    416   size_t exemplar_position_;       // At this position (and other positions).
    417   std::vector<uint32> positions_;  // Includes exemplar_position_.
    418 
    419   ShinglePattern* pattern_;       // Pattern changes as LabelInfos are assigned.
    420 
    421   friend std::string ToString(const Shingle* instance);
    422 
    423   // We can't disallow the copy constructor because we use std::set<Shingle> and
    424   // VS2005's implementation of std::set<T>::set() requires T to have a copy
    425   // constructor.
    426   //   DISALLOW_COPY_AND_ASSIGN(Shingle);
    427   void operator=(const Shingle&);  // Disallow assignment only.
    428 };
    429 
    430 std::string ToString(const Shingle* instance) {
    431   std::string s;
    432   const char* sep = "<";
    433   for (size_t i = 0; i < Shingle::kWidth; ++i) {
    434     // base::StringAppendF(&s, "%s%x ", sep, instance.at(i)->label_->rva_);
    435     s += sep;
    436     s += ToString(instance->at(i));
    437     sep = ", ";
    438   }
    439   base::StringAppendF(&s, ">(%" PRIuS ")@{%d}",
    440                       instance->exemplar_position_,
    441                       instance->position_count());
    442   return s;
    443 }
    444 
    445 
    446 bool Shingle::InterningLess::operator()(
    447     const Shingle& a,
    448     const Shingle& b) const {
    449   for (size_t i = 0;  i < kWidth;  ++i) {
    450     LabelInfo* info_a = a.at(i);
    451     LabelInfo* info_b = b.at(i);
    452     if (info_a->label_->rva_ < info_b->label_->rva_)
    453       return true;
    454     if (info_a->label_->rva_ > info_b->label_->rva_)
    455       return false;
    456     if (info_a->is_model_ < info_b->is_model_)
    457       return true;
    458     if (info_a->is_model_ > info_b->is_model_)
    459       return false;
    460     if (info_a != info_b) {
    461       NOTREACHED();
    462     }
    463   }
    464   return false;
    465 }
    466 
    467 class ShinglePattern {
    468  public:
    469   enum { kOffsetMask = 7,  // Offset lives in low bits.
    470          kFixed    = 0,    // kind & kVariable == 0  => fixed.
    471          kVariable = 8     // kind & kVariable == 1  => variable.
    472          };
    473   // sequence[position + (kinds_[i] & kOffsetMask)] gives LabelInfo for position
    474   // i of shingle.  Below, second 'A' is duplicate of position 1, second '102'
    475   // is duplicate of position 0.
    476   //
    477   //   <102, A, 103, A , 102>
    478   //      --> <kFixed+0, kVariable+1, kFixed+2, kVariable+1, kFixed+0>
    479   struct Index {
    480     explicit Index(const Shingle* instance);
    481     uint8 kinds_[Shingle::kWidth];
    482     uint8 variables_;
    483     uint8 unique_variables_;
    484     uint8 first_variable_index_;
    485     uint32 hash_;
    486     int assigned_indexes_[Shingle::kWidth];
    487   };
    488 
    489   // ShinglePattern keeps histograms of member Shingle instances, ordered by
    490   // decreasing number of occurrences.  We don't have a pair (occurrence count,
    491   // Shingle instance), so we use a FreqView adapter to make the instance
    492   // pointer look like the pair.
    493   class FreqView {
    494    public:
    495     explicit FreqView(const Shingle* instance) : instance_(instance) {}
    496     int count() const { return instance_->position_count(); }
    497     const Shingle* instance() const { return instance_; }
    498     struct Greater {
    499       bool operator()(const FreqView& a, const FreqView& b) const {
    500         if (a.count() > b.count()) return true;
    501         if (a.count() < b.count()) return false;
    502         return resolve_ties(a.instance(), b.instance());
    503       }
    504      private:
    505       Shingle::PointerLess resolve_ties;
    506     };
    507    private:
    508     const Shingle* instance_;
    509   };
    510 
    511   typedef std::set<FreqView, FreqView::Greater> Histogram;
    512 
    513   ShinglePattern() : index_(NULL), model_coverage_(0), program_coverage_(0) {}
    514 
    515   const Index* index_;  // Points to the key in the owning map value_type.
    516   Histogram model_histogram_;
    517   Histogram program_histogram_;
    518   int model_coverage_;
    519   int program_coverage_;
    520 };
    521 
    522 std::string ToString(const ShinglePattern::Index* index) {
    523   std::string s;
    524   if (index == NULL) {
    525     s = "<null>";
    526   } else {
    527     base::StringAppendF(&s, "<%d: ", index->variables_);
    528     const char* sep = "";
    529     for (size_t i = 0;  i < Shingle::kWidth;  ++i) {
    530       s += sep;
    531       sep = ", ";
    532       uint32 kind = index->kinds_[i];
    533       int offset = kind & ShinglePattern::kOffsetMask;
    534       if (kind & ShinglePattern::kVariable)
    535         base::StringAppendF(&s, "V%d", offset);
    536       else
    537         base::StringAppendF(&s, "%d", index->assigned_indexes_[offset]);
    538      }
    539     base::StringAppendF(&s, " %x", index->hash_);
    540     s += ">";
    541   }
    542   return s;
    543 }
    544 
    545 std::string HistogramToString(const ShinglePattern::Histogram& histogram,
    546                               size_t snippet_max) {
    547   std::string s;
    548   size_t histogram_size = histogram.size();
    549   size_t snippet_size = 0;
    550   for (ShinglePattern::Histogram::const_iterator p = histogram.begin();
    551        p != histogram.end();
    552        ++p) {
    553     if (++snippet_size > snippet_max && snippet_size != histogram_size) {
    554       s += " ...";
    555       break;
    556     }
    557     base::StringAppendF(&s, " %d", p->count());
    558   }
    559   return s;
    560 }
    561 
    562 std::string HistogramToStringFull(const ShinglePattern::Histogram& histogram,
    563                                   const char* indent,
    564                                   size_t snippet_max) {
    565   std::string s;
    566 
    567   size_t histogram_size = histogram.size();
    568   size_t snippet_size = 0;
    569   for (ShinglePattern::Histogram::const_iterator p = histogram.begin();
    570        p != histogram.end();
    571        ++p) {
    572     s += indent;
    573     if (++snippet_size > snippet_max && snippet_size != histogram_size) {
    574       s += "...\n";
    575       break;
    576     }
    577     base::StringAppendF(&s, "(%d) ", p->count());
    578     s += ToString(&(*p->instance()));
    579     s += "\n";
    580   }
    581   return s;
    582 }
    583 
    584 std::string ToString(const ShinglePattern* pattern, size_t snippet_max = 3) {
    585   std::string s;
    586   if (pattern == NULL) {
    587     s = "<null>";
    588   } else {
    589     s = "{";
    590     s += ToString(pattern->index_);
    591     base::StringAppendF(&s, ";  %d(%d):",
    592                         static_cast<int>(pattern->model_histogram_.size()),
    593                         pattern->model_coverage_);
    594 
    595     s += HistogramToString(pattern->model_histogram_, snippet_max);
    596     base::StringAppendF(&s, ";  %d(%d):",
    597                         static_cast<int>(pattern->program_histogram_.size()),
    598                         pattern->program_coverage_);
    599     s += HistogramToString(pattern->program_histogram_, snippet_max);
    600     s += "}";
    601   }
    602   return s;
    603 }
    604 
    605 std::string ShinglePatternToStringFull(const ShinglePattern* pattern,
    606                                        size_t max) {
    607   std::string s;
    608   s += ToString(pattern->index_);
    609   s += "\n";
    610   size_t model_size = pattern->model_histogram_.size();
    611   size_t program_size = pattern->program_histogram_.size();
    612   base::StringAppendF(&s, "  model shingles %" PRIuS "\n", model_size);
    613   s += HistogramToStringFull(pattern->model_histogram_, "    ", max);
    614   base::StringAppendF(&s, "  program shingles %" PRIuS "\n", program_size);
    615   s += HistogramToStringFull(pattern->program_histogram_, "    ", max);
    616   return s;
    617 }
    618 
    619 struct ShinglePatternIndexLess {
    620   bool operator()(const ShinglePattern::Index& a,
    621                   const ShinglePattern::Index& b) const {
    622     if (a.hash_ < b.hash_) return true;
    623     if (a.hash_ > b.hash_) return false;
    624 
    625     for (size_t i = 0;  i < Shingle::kWidth;  ++i) {
    626       if (a.kinds_[i] < b.kinds_[i]) return true;
    627       if (a.kinds_[i] > b.kinds_[i]) return false;
    628       if ((a.kinds_[i] & ShinglePattern::kVariable) == 0) {
    629         if (a.assigned_indexes_[i] < b.assigned_indexes_[i])
    630           return true;
    631         if (a.assigned_indexes_[i] > b.assigned_indexes_[i])
    632           return false;
    633       }
    634     }
    635     return false;
    636   }
    637 };
    638 
    639 static uint32 hash_combine(uint32 h, uint32 v) {
    640   h += v;
    641   return (h * (37 + 0x0000d100)) ^ (h >> 13);
    642 }
    643 
    644 ShinglePattern::Index::Index(const Shingle* instance) {
    645   uint32 hash = 0;
    646   variables_ = 0;
    647   unique_variables_ = 0;
    648   first_variable_index_ = 255;
    649 
    650   for (uint32 i = 0; i < Shingle::kWidth; ++i) {
    651     LabelInfo* info = instance->at(i);
    652     uint32 kind = 0;
    653     int code = -1;
    654     size_t j = 0;
    655     for ( ; j < i; ++j) {
    656       if (info == instance->at(j)) {  // Duplicate LabelInfo
    657         kind = kinds_[j];
    658         break;
    659       }
    660     }
    661     if (j == i) {  // Not found above.
    662       if (info->assignment_) {
    663         code = info->label_->index_;
    664         assigned_indexes_[i] = code;
    665         kind = kFixed + i;
    666       } else {
    667         kind = kVariable + i;
    668         ++unique_variables_;
    669         if (i < first_variable_index_)
    670           first_variable_index_ = i;
    671       }
    672     }
    673     if (kind & kVariable) ++variables_;
    674     hash = hash_combine(hash, code);
    675     hash = hash_combine(hash, kind);
    676     kinds_[i] = kind;
    677     assigned_indexes_[i] = code;
    678   }
    679   hash_ = hash;
    680 }
    681 
    682 struct ShinglePatternLess {
    683   bool operator()(const ShinglePattern& a, const ShinglePattern& b) const {
    684     return index_less(*a.index_, *b.index_);
    685   }
    686   ShinglePatternIndexLess index_less;
    687 };
    688 
    689 struct ShinglePatternPointerLess {
    690   bool operator()(const ShinglePattern* a, const ShinglePattern* b) const {
    691     return pattern_less(*a, *b);
    692   }
    693   ShinglePatternLess pattern_less;
    694 };
    695 
    696 template<int (*Scorer)(const ShinglePattern*)>
    697 struct OrderShinglePatternByScoreDescending {
    698   bool operator()(const ShinglePattern* a, const ShinglePattern* b) const {
    699     int score_a = Scorer(a);
    700     int score_b = Scorer(b);
    701     if (score_a > score_b) return true;
    702     if (score_a < score_b) return false;
    703     return break_ties(a, b);
    704   }
    705   ShinglePatternPointerLess break_ties;
    706 };
    707 
    708 // Returns a score for a 'Single Use' rule.  Returns -1 if the rule is not
    709 // applicable.
    710 int SingleUseScore(const ShinglePattern* pattern) {
    711   if (pattern->index_->variables_ != 1)
    712     return -1;
    713 
    714   if (pattern->model_histogram_.size() != 1 ||
    715       pattern->program_histogram_.size() != 1)
    716     return -1;
    717 
    718   // Does this pattern account for all uses of the variable?
    719   const ShinglePattern::FreqView& program_freq =
    720       *pattern->program_histogram_.begin();
    721   const ShinglePattern::FreqView& model_freq =
    722       *pattern->model_histogram_.begin();
    723   int p1 = program_freq.count();
    724   int m1 = model_freq.count();
    725   if (p1 == m1) {
    726     const Shingle* program_instance = program_freq.instance();
    727     const Shingle* model_instance = model_freq.instance();
    728     size_t variable_index = pattern->index_->first_variable_index_;
    729     LabelInfo* program_info = program_instance->at(variable_index);
    730     LabelInfo* model_info = model_instance->at(variable_index);
    731     if (!program_info->assignment_) {
    732       if (program_info->refs_ == p1 && model_info->refs_ == m1) {
    733         return p1;
    734       }
    735     }
    736   }
    737   return -1;
    738 }
    739 
    740 // The VariableQueue is a priority queue of unassigned LabelInfos from
    741 // the 'program' (the 'variables') and their AssignmentCandidates.
    742 class VariableQueue {
    743  public:
    744   typedef std::pair<int, LabelInfo*> ScoreAndLabel;
    745 
    746   VariableQueue() {}
    747 
    748   bool empty() const { return queue_.empty(); }
    749 
    750   const ScoreAndLabel& first() const { return *queue_.begin(); }
    751 
    752   // For debugging only.
    753   void Print() const {
    754     for (Queue::const_iterator p = queue_.begin();  p != queue_.end();  ++p) {
    755       AssignmentCandidates* candidates = p->second->candidates();
    756       candidates->Print(std::numeric_limits<int>::max());
    757     }
    758   }
    759 
    760   void AddPendingUpdate(LabelInfo* program_info, LabelInfo* model_info,
    761                         int delta_score) {
    762     AssignmentCandidates* candidates = program_info->candidates();
    763     if (!candidates->HasPendingUpdates()) {
    764       pending_update_candidates_.push_back(candidates);
    765     }
    766     candidates->AddPendingUpdate(model_info, delta_score);
    767   }
    768 
    769   void ApplyPendingUpdates() {
    770     for (size_t i = 0;  i < pending_update_candidates_.size();  ++i) {
    771       AssignmentCandidates* candidates = pending_update_candidates_[i];
    772       int old_score = candidates->TopScore();
    773       queue_.erase(ScoreAndLabel(old_score, candidates->program_info()));
    774       candidates->ApplyPendingUpdates();
    775       if (!candidates->empty()) {
    776         int new_score = candidates->TopScore();
    777         queue_.insert(ScoreAndLabel(new_score, candidates->program_info()));
    778       }
    779     }
    780     pending_update_candidates_.clear();
    781   }
    782 
    783  private:
    784   struct OrderScoreAndLabelByScoreDecreasing {
    785     bool operator()(const ScoreAndLabel& a, const ScoreAndLabel& b) const {
    786       if (a.first > b.first) return true;
    787       if (a.first < b.first) return false;
    788       return OrderLabelInfo()(a.second, b.second);
    789     }
    790   };
    791   typedef std::set<ScoreAndLabel, OrderScoreAndLabelByScoreDecreasing> Queue;
    792 
    793   Queue queue_;
    794   std::vector<AssignmentCandidates*> pending_update_candidates_;
    795 
    796   DISALLOW_COPY_AND_ASSIGN(VariableQueue);
    797 };
    798 
    799 
    800 class AssignmentProblem {
    801  public:
    802   AssignmentProblem(const Trace& trace, size_t model_end)
    803       : trace_(trace),
    804         model_end_(model_end) {
    805     VLOG(2) << "AssignmentProblem::AssignmentProblem  " << model_end << ", "
    806             << trace.size();
    807   }
    808 
    809   bool Solve() {
    810     if (model_end_ < Shingle::kWidth ||
    811         trace_.size() - model_end_ < Shingle::kWidth) {
    812       // Nothing much we can do with such a short problem.
    813       return true;
    814     }
    815     instances_.resize(trace_.size() - Shingle::kWidth + 1, NULL);
    816     AddShingles(0, model_end_);
    817     AddShingles(model_end_, trace_.size());
    818     InitialClassify();
    819     AddPatternsNeedingUpdatesToQueues();
    820 
    821     patterns_needing_updates_.clear();
    822     while (FindAndAssignBestLeader())
    823       patterns_needing_updates_.clear();
    824     PrintActivePatterns();
    825 
    826     return true;
    827   }
    828 
    829  private:
    830   typedef std::set<Shingle*, Shingle::PointerLess> ShingleSet;
    831 
    832   typedef std::set<const ShinglePattern*, ShinglePatternPointerLess>
    833       ShinglePatternSet;
    834 
    835   // Patterns are partitioned into the following sets:
    836 
    837   // * Retired patterns (not stored).  No shingles exist for this pattern (they
    838   //   all now match more specialized patterns).
    839   // * Useless patterns (not stored).  There are no 'program' shingles for this
    840   //   pattern (they all now match more specialized patterns).
    841   // * Single-use patterns - single_use_pattern_queue_.
    842   // * Other patterns - active_non_single_use_patterns_ / variable_queue_.
    843 
    844   typedef std::set<const ShinglePattern*,
    845                    OrderShinglePatternByScoreDescending<&SingleUseScore> >
    846       SingleUsePatternQueue;
    847 
    848   void PrintPatternsHeader() const {
    849     VLOG(2) << shingle_instances_.size() << " instances  "
    850             << trace_.size() << " trace length  "
    851             << patterns_.size() << " shingle indexes  "
    852             << single_use_pattern_queue_.size() << " single use patterns  "
    853             << active_non_single_use_patterns_.size() << " active patterns";
    854   }
    855 
    856   void PrintActivePatterns() const {
    857     for (ShinglePatternSet::const_iterator p =
    858              active_non_single_use_patterns_.begin();
    859          p != active_non_single_use_patterns_.end();
    860          ++p) {
    861       const ShinglePattern* pattern = *p;
    862       VLOG(2) << ToString(pattern, 10);
    863     }
    864   }
    865 
    866   void PrintPatterns() const {
    867     PrintAllPatterns();
    868     PrintActivePatterns();
    869     PrintAllShingles();
    870   }
    871 
    872   void PrintAllPatterns() const {
    873     for (IndexToPattern::const_iterator p = patterns_.begin();
    874          p != patterns_.end();
    875          ++p) {
    876       const ShinglePattern& pattern = p->second;
    877       VLOG(2) << ToString(&pattern, 10);
    878     }
    879   }
    880 
    881   void PrintAllShingles() const {
    882     for (Shingle::OwningSet::const_iterator p = shingle_instances_.begin();
    883          p != shingle_instances_.end();
    884          ++p) {
    885       const Shingle& instance = *p;
    886       VLOG(2) << ToString(&instance) << "   " << ToString(instance.pattern());
    887     }
    888   }
    889 
    890 
    891   void AddShingles(size_t begin, size_t end) {
    892     for (size_t i = begin;  i + Shingle::kWidth - 1 < end;  ++i) {
    893       instances_[i] = Shingle::Find(trace_, i, &shingle_instances_);
    894     }
    895   }
    896 
    897   void Declassify(Shingle* shingle) {
    898     ShinglePattern* pattern = shingle->pattern();
    899     if (shingle->InModel()) {
    900       pattern->model_histogram_.erase(ShinglePattern::FreqView(shingle));
    901       pattern->model_coverage_ -= shingle->position_count();
    902     } else {
    903       pattern->program_histogram_.erase(ShinglePattern::FreqView(shingle));
    904       pattern->program_coverage_ -= shingle->position_count();
    905     }
    906     shingle->set_pattern(NULL);
    907   }
    908 
    909   void Reclassify(Shingle* shingle) {
    910     ShinglePattern* pattern = shingle->pattern();
    911     LOG_ASSERT(pattern == NULL);
    912 
    913     ShinglePattern::Index index(shingle);
    914     if (index.variables_ == 0)
    915       return;
    916 
    917     std::pair<IndexToPattern::iterator, bool> inserted =
    918         patterns_.insert(std::make_pair(index, ShinglePattern()));
    919 
    920     pattern = &inserted.first->second;
    921     pattern->index_ = &inserted.first->first;
    922     shingle->set_pattern(pattern);
    923     patterns_needing_updates_.insert(pattern);
    924 
    925     if (shingle->InModel()) {
    926       pattern->model_histogram_.insert(ShinglePattern::FreqView(shingle));
    927       pattern->model_coverage_ += shingle->position_count();
    928     } else {
    929       pattern->program_histogram_.insert(ShinglePattern::FreqView(shingle));
    930       pattern->program_coverage_ += shingle->position_count();
    931     }
    932   }
    933 
    934   void InitialClassify() {
    935     for (Shingle::OwningSet::iterator p = shingle_instances_.begin();
    936          p != shingle_instances_.end();
    937          ++p) {
    938       // GCC's set<T>::iterator::operator *() returns a const object.
    939       Reclassify(const_cast<Shingle*>(&*p));
    940     }
    941   }
    942 
    943   // For the positions in |info|, find the shingles that overlap that position.
    944   void AddAffectedPositions(LabelInfo* info, ShingleSet* affected_shingles) {
    945     const size_t kWidth = Shingle::kWidth;
    946     for (size_t i = 0;  i < info->positions_.size();  ++i) {
    947       size_t position = info->positions_[i];
    948       // Find bounds to the subrange of |trace_| we are in.
    949       size_t start = position < model_end_ ? 0 : model_end_;
    950       size_t end = position < model_end_ ? model_end_ : trace_.size();
    951 
    952       // Clip [position-kWidth+1, position+1)
    953       size_t low = position > start + kWidth - 1
    954           ? position - kWidth + 1
    955           : start;
    956       size_t high = position + kWidth < end ? position + 1 : end - kWidth + 1;
    957 
    958       for (size_t shingle_position = low;
    959            shingle_position < high;
    960            ++shingle_position) {
    961         Shingle* overlapping_shingle = instances_.at(shingle_position);
    962         affected_shingles->insert(overlapping_shingle);
    963       }
    964     }
    965   }
    966 
    967   void RemovePatternsNeedingUpdatesFromQueues() {
    968     for (ShinglePatternSet::iterator p = patterns_needing_updates_.begin();
    969          p != patterns_needing_updates_.end();
    970          ++p) {
    971       RemovePatternFromQueues(*p);
    972     }
    973   }
    974 
    975   void AddPatternsNeedingUpdatesToQueues() {
    976     for (ShinglePatternSet::iterator p = patterns_needing_updates_.begin();
    977          p != patterns_needing_updates_.end();
    978          ++p) {
    979       AddPatternToQueues(*p);
    980     }
    981     variable_queue_.ApplyPendingUpdates();
    982   }
    983 
    984   void RemovePatternFromQueues(const ShinglePattern* pattern) {
    985     int single_use_score = SingleUseScore(pattern);
    986     if (single_use_score > 0) {
    987       size_t n = single_use_pattern_queue_.erase(pattern);
    988       LOG_ASSERT(n == 1);
    989     } else if (pattern->program_histogram_.empty() &&
    990                pattern->model_histogram_.empty()) {
    991       NOTREACHED();  // Should not come back to life.
    992     } else if (pattern->program_histogram_.empty()) {
    993       // Useless pattern.
    994     } else {
    995       active_non_single_use_patterns_.erase(pattern);
    996       AddPatternToLabelQueue(pattern, -1);
    997     }
    998   }
    999 
   1000   void AddPatternToQueues(const ShinglePattern* pattern) {
   1001     int single_use_score = SingleUseScore(pattern);
   1002     if (single_use_score > 0) {
   1003       single_use_pattern_queue_.insert(pattern);
   1004     } else if (pattern->program_histogram_.empty() &&
   1005                pattern->model_histogram_.empty()) {
   1006     } else if (pattern->program_histogram_.empty()) {
   1007       // Useless pattern.
   1008     } else {
   1009       active_non_single_use_patterns_.insert(pattern);
   1010       AddPatternToLabelQueue(pattern, +1);
   1011     }
   1012   }
   1013 
   1014   void AddPatternToLabelQueue(const ShinglePattern* pattern, int sign) {
   1015     // For each possible assignment in this pattern, update the potential
   1016     // contributions to the LabelInfo queues.
   1017 
   1018     // We want to find for each symbol (LabelInfo) the maximum contribution that
   1019     // could be achieved by making shingle-wise assignments between shingles in
   1020     // the model and shingles in the program.
   1021     //
   1022     // If the shingles in the histograms are independent (no two shingles have a
   1023     // symbol in common) then any permutation of the assignments is possible,
   1024     // and the maximum contribution can be found by taking the maximum over all
   1025     // the pairs.
   1026     //
   1027     // If the shingles are dependent two things happen.  The maximum
   1028     // contribution to any given symbol is a sum because the symbol has
   1029     // contributions from all the shingles containing it.  Second, some
   1030     // assignments are blocked by previous incompatible assignments.  We want to
   1031     // avoid a combinatorial search, so we ignore the blocking.
   1032 
   1033     const size_t kUnwieldy = 5;
   1034 
   1035     typedef std::map<LabelInfo*, int> LabelToScore;
   1036     typedef std::map<LabelInfo*, LabelToScore > ScoreSet;
   1037     ScoreSet maxima;
   1038 
   1039     size_t n_model_samples = 0;
   1040     for (ShinglePattern::Histogram::const_iterator model_iter =
   1041              pattern->model_histogram_.begin();
   1042          model_iter != pattern->model_histogram_.end();
   1043          ++model_iter) {
   1044       if (++n_model_samples > kUnwieldy) break;
   1045       const ShinglePattern::FreqView& model_freq = *model_iter;
   1046       int m1 = model_freq.count();
   1047       const Shingle* model_instance = model_freq.instance();
   1048 
   1049       ScoreSet sums;
   1050       size_t n_program_samples = 0;
   1051       for (ShinglePattern::Histogram::const_iterator program_iter =
   1052                pattern->program_histogram_.begin();
   1053            program_iter != pattern->program_histogram_.end();
   1054            ++program_iter) {
   1055         if (++n_program_samples > kUnwieldy) break;
   1056         const ShinglePattern::FreqView& program_freq = *program_iter;
   1057         int p1 = program_freq.count();
   1058         const Shingle* program_instance = program_freq.instance();
   1059 
   1060         // int score = p1;  // ? weigh all equally??
   1061         int score = std::min(p1, m1);
   1062 
   1063         for (size_t i = 0;  i < Shingle::kWidth;  ++i) {
   1064           LabelInfo* program_info = program_instance->at(i);
   1065           LabelInfo* model_info = model_instance->at(i);
   1066           if ((model_info->assignment_ == NULL) !=
   1067               (program_info->assignment_ == NULL)) {
   1068             VLOG(2) << "ERROR " << i
   1069                     << "\n\t"  << ToString(pattern, 10)
   1070                     << "\n\t" << ToString(program_instance)
   1071                     << "\n\t" << ToString(model_instance);
   1072           }
   1073           if (!program_info->assignment_ && !model_info->assignment_) {
   1074             sums[program_info][model_info] += score;
   1075           }
   1076         }
   1077 
   1078         for (ScoreSet::iterator assignee_iterator = sums.begin();
   1079              assignee_iterator != sums.end();
   1080              ++assignee_iterator) {
   1081           LabelInfo* program_info = assignee_iterator->first;
   1082           for (LabelToScore::iterator p = assignee_iterator->second.begin();
   1083                p != assignee_iterator->second.end();
   1084                ++p) {
   1085             LabelInfo* model_info = p->first;
   1086             int score = p->second;
   1087             int* slot = &maxima[program_info][model_info];
   1088             *slot = std::max(*slot, score);
   1089           }
   1090         }
   1091       }
   1092     }
   1093 
   1094     for (ScoreSet::iterator assignee_iterator = maxima.begin();
   1095          assignee_iterator != maxima.end();
   1096          ++assignee_iterator) {
   1097       LabelInfo* program_info = assignee_iterator->first;
   1098       for (LabelToScore::iterator p = assignee_iterator->second.begin();
   1099            p != assignee_iterator->second.end();
   1100            ++p) {
   1101         LabelInfo* model_info = p->first;
   1102         int score = sign * p->second;
   1103         variable_queue_.AddPendingUpdate(program_info, model_info, score);
   1104       }
   1105     }
   1106   }
   1107 
   1108   void AssignOne(LabelInfo* model_info, LabelInfo* program_info) {
   1109     LOG_ASSERT(!model_info->assignment_);
   1110     LOG_ASSERT(!program_info->assignment_);
   1111     LOG_ASSERT(model_info->is_model_);
   1112     LOG_ASSERT(!program_info->is_model_);
   1113 
   1114     VLOG(3) << "Assign " << ToString(program_info)
   1115             << " := " << ToString(model_info);
   1116 
   1117     ShingleSet affected_shingles;
   1118     AddAffectedPositions(model_info, &affected_shingles);
   1119     AddAffectedPositions(program_info, &affected_shingles);
   1120 
   1121     for (ShingleSet::iterator p = affected_shingles.begin();
   1122          p != affected_shingles.end();
   1123          ++p) {
   1124       patterns_needing_updates_.insert((*p)->pattern());
   1125     }
   1126 
   1127     RemovePatternsNeedingUpdatesFromQueues();
   1128 
   1129     for (ShingleSet::iterator p = affected_shingles.begin();
   1130          p != affected_shingles.end();
   1131          ++p) {
   1132       Declassify(*p);
   1133     }
   1134 
   1135     program_info->label_->index_ = model_info->label_->index_;
   1136     // Mark as assigned
   1137     model_info->assignment_ = program_info;
   1138     program_info->assignment_ = model_info;
   1139 
   1140     for (ShingleSet::iterator p = affected_shingles.begin();
   1141          p != affected_shingles.end();
   1142          ++p) {
   1143       Reclassify(*p);
   1144     }
   1145 
   1146     AddPatternsNeedingUpdatesToQueues();
   1147   }
   1148 
   1149   bool AssignFirstVariableOfHistogramHead(const ShinglePattern& pattern) {
   1150     const ShinglePattern::FreqView& program_1 =
   1151         *pattern.program_histogram_.begin();
   1152     const ShinglePattern::FreqView& model_1 = *pattern.model_histogram_.begin();
   1153     const Shingle* program_instance = program_1.instance();
   1154     const Shingle* model_instance = model_1.instance();
   1155     size_t variable_index = pattern.index_->first_variable_index_;
   1156     LabelInfo* program_info = program_instance->at(variable_index);
   1157     LabelInfo* model_info = model_instance->at(variable_index);
   1158     AssignOne(model_info, program_info);
   1159     return true;
   1160   }
   1161 
   1162   bool FindAndAssignBestLeader() {
   1163     LOG_ASSERT(patterns_needing_updates_.empty());
   1164 
   1165     if (!single_use_pattern_queue_.empty()) {
   1166       const ShinglePattern& pattern = **single_use_pattern_queue_.begin();
   1167       return AssignFirstVariableOfHistogramHead(pattern);
   1168     }
   1169 
   1170     if (variable_queue_.empty())
   1171       return false;
   1172 
   1173     const VariableQueue::ScoreAndLabel best = variable_queue_.first();
   1174     int score = best.first;
   1175     LabelInfo* assignee = best.second;
   1176 
   1177     // TODO(sra): score (best.first) can be zero.  A zero score means we are
   1178     // blindly picking between two (or more) alternatives which look the same.
   1179     // If we exit on the first zero-score we sometimes get 3-4% better total
   1180     // compression.  This indicates that 'infill' is doing a better job than
   1181     // picking blindly.  Perhaps we can use an extended region around the
   1182     // undistinguished competing alternatives to break the tie.
   1183     if (score == 0) {
   1184       variable_queue_.Print();
   1185       return false;
   1186     }
   1187 
   1188     AssignmentCandidates* candidates = assignee->candidates();
   1189     if (candidates->empty())
   1190       return false;  // Should not happen.
   1191 
   1192     AssignOne(candidates->top_candidate(), assignee);
   1193     return true;
   1194   }
   1195 
   1196  private:
   1197   // The trace vector contains the model sequence [0, model_end_) followed by
   1198   // the program sequence [model_end_, trace.end())
   1199   const Trace& trace_;
   1200   size_t model_end_;
   1201 
   1202   // |shingle_instances_| is the set of 'interned' shingles.
   1203   Shingle::OwningSet shingle_instances_;
   1204 
   1205   // |instances_| maps from position in |trace_| to Shingle at that position.
   1206   std::vector<Shingle*> instances_;
   1207 
   1208   SingleUsePatternQueue single_use_pattern_queue_;
   1209   ShinglePatternSet active_non_single_use_patterns_;
   1210   VariableQueue variable_queue_;
   1211 
   1212   // Transient information: when we make an assignment, we need to recompute
   1213   // priority queue information derived from these ShinglePatterns.
   1214   ShinglePatternSet patterns_needing_updates_;
   1215 
   1216   typedef std::map<ShinglePattern::Index,
   1217                    ShinglePattern, ShinglePatternIndexLess> IndexToPattern;
   1218   IndexToPattern patterns_;
   1219 
   1220   DISALLOW_COPY_AND_ASSIGN(AssignmentProblem);
   1221 };
   1222 
   1223 class Adjuster : public AdjustmentMethod {
   1224  public:
   1225   Adjuster() : prog_(NULL), model_(NULL) {}
   1226   ~Adjuster() {}
   1227 
   1228   bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) {
   1229     VLOG(1) << "Adjuster::Adjust";
   1230     prog_ = program;
   1231     model_ = &model;
   1232     return Finish();
   1233   }
   1234 
   1235   bool Finish() {
   1236     prog_->UnassignIndexes();
   1237     Trace abs32_trace_;
   1238     Trace rel32_trace_;
   1239     CollectTraces(model_, &abs32_trace_, &rel32_trace_, true);
   1240     size_t abs32_model_end = abs32_trace_.size();
   1241     size_t rel32_model_end = rel32_trace_.size();
   1242     CollectTraces(prog_,  &abs32_trace_,  &rel32_trace_,  false);
   1243     Solve(abs32_trace_, abs32_model_end);
   1244     Solve(rel32_trace_, rel32_model_end);
   1245     prog_->AssignRemainingIndexes();
   1246     return true;
   1247   }
   1248 
   1249  private:
   1250   void CollectTraces(const AssemblyProgram* program, Trace* abs32, Trace* rel32,
   1251                      bool is_model) {
   1252     label_info_maker_.ResetDebugLabel();
   1253     const InstructionVector& instructions = program->instructions();
   1254     for (size_t i = 0;  i < instructions.size();  ++i) {
   1255       Instruction* instruction = instructions[i];
   1256       if (Label* label = program->InstructionAbs32Label(instruction))
   1257         ReferenceLabel(abs32, label, is_model);
   1258       if (Label* label = program->InstructionRel32Label(instruction))
   1259         ReferenceLabel(rel32, label, is_model);
   1260     }
   1261     // TODO(sra): we could simply append all the labels in index order to
   1262     // incorporate some costing for entropy (bigger deltas) that will be
   1263     // introduced into the label address table by non-monotonic ordering.  This
   1264     // would have some knock-on effects to parts of the algorithm that work on
   1265     // single-occurrence labels.
   1266   }
   1267 
   1268   void Solve(const Trace& model, size_t model_end) {
   1269     base::Time start_time = base::Time::Now();
   1270     AssignmentProblem a(model, model_end);
   1271     a.Solve();
   1272     VLOG(1) << " Adjuster::Solve "
   1273             << (base::Time::Now() - start_time).InSecondsF();
   1274   }
   1275 
   1276   void ReferenceLabel(Trace* trace, Label* label, bool is_model) {
   1277     trace->push_back(
   1278         label_info_maker_.MakeLabelInfo(label, is_model,
   1279                                         static_cast<uint32>(trace->size())));
   1280   }
   1281 
   1282   AssemblyProgram* prog_;         // Program to be adjusted, owned by caller.
   1283   const AssemblyProgram* model_;  // Program to be mimicked, owned by caller.
   1284 
   1285   LabelInfoMaker label_info_maker_;
   1286 
   1287  private:
   1288   DISALLOW_COPY_AND_ASSIGN(Adjuster);
   1289 };
   1290 
   1291 ////////////////////////////////////////////////////////////////////////////////
   1292 
   1293 }  // namespace adjustment_method_2
   1294 
   1295 AdjustmentMethod* AdjustmentMethod::MakeShingleAdjustmentMethod() {
   1296   return new adjustment_method_2::Adjuster();
   1297 }
   1298 
   1299 }  // namespace courgette
   1300