1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "courgette/adjustment_method.h" 6 7 #include <algorithm> 8 #include <list> 9 #include <map> 10 #include <set> 11 #include <string> 12 #include <vector> 13 14 #include "base/basictypes.h" 15 #include "base/logging.h" 16 #include "base/strings/string_number_conversions.h" 17 #include "base/strings/stringprintf.h" 18 #include "courgette/assembly_program.h" 19 #include "courgette/courgette.h" 20 #include "courgette/encoded_program.h" 21 22 namespace courgette { 23 24 //////////////////////////////////////////////////////////////////////////////// 25 26 class NullAdjustmentMethod : public AdjustmentMethod { 27 bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) { 28 return true; 29 } 30 }; 31 32 //////////////////////////////////////////////////////////////////////////////// 33 34 // The purpose of adjustment is to assign indexes to Labels of a program 'p' to 35 // make the sequence of indexes similar to a 'model' program 'm'. Labels 36 // themselves don't have enough information to do this job, so we work with a 37 // LabelInfo surrogate for each label. 38 // 39 class LabelInfo { 40 public: 41 Label* label_; // The label that this info a surrogate for. 42 43 // Information used only in debugging messages. 44 uint32 is_model_ : 1; // Is the label in the model? 45 uint32 debug_index_ : 31; // An unique small number for naming the label. 46 47 uint32 refs_; // Number of times this Label is referenced. 48 49 LabelInfo* assignment_; // Label from other program corresponding to this. 50 51 // LabelInfos are in a doubly linked list ordered by address (label_->rva_) so 52 // we can quickly find Labels adjacent in address order. 53 LabelInfo* next_addr_; // Label(Info) at next highest address. 54 LabelInfo* prev_addr_; // Label(Info) at next lowest address. 55 56 std::vector<uint32> positions_; // Offsets into the trace of references. 57 58 // Just a no-argument constructor and copy constructor. Actual LabelInfo 59 // objects are allocated in std::pair structs in a std::map. 60 LabelInfo() 61 : label_(NULL), is_model_(false), debug_index_(0), refs_(0), 62 assignment_(NULL), 63 next_addr_(NULL), 64 prev_addr_(NULL) {} 65 66 private: 67 void operator=(const LabelInfo*); // Disallow assignment only. 68 69 // Public compiler generated copy constructor is needed to constuct 70 // std::pair<Label*, LabelInfo> so that fresh LabelInfos can be allocated 71 // inside a std::map. 72 }; 73 74 struct OrderLabelInfoByAddressAscending { 75 bool operator()(const LabelInfo* a, const LabelInfo* b) const { 76 return a->label_->rva_ < b->label_->rva_; 77 } 78 }; 79 80 static std::string ToString(LabelInfo* info) { 81 std::string s; 82 base::StringAppendF(&s, "%c%d", "pm"[info->is_model_], info->debug_index_); 83 if (info->label_->index_ != Label::kNoIndex) 84 base::StringAppendF(&s, " (%d)", info->label_->index_); 85 86 base::StringAppendF(&s, " #%u", info->refs_); 87 return s; 88 } 89 90 // General graph matching is exponential, essentially trying all permutations. 91 // The exponential algorithm can be made faster by avoiding consideration of 92 // impossible or unlikely matches. We can make the matching practical by eager 93 // matching - by looking for likely matches and commiting to them, and using the 94 // committed assignment as the basis for further matching. 95 // 96 // The basic eager graph-matching assignment is based on several ideas: 97 // 98 // * The strongest match will be for parts of the program that have not 99 // changed. If part of a program has not changed, then the number of 100 // references to a label will be the same, and corresponding pairs of 101 // adjacent labels will have the same RVA difference. 102 // 103 // * Some assignments are 'obvious' if you look at the distribution. Example: 104 // if both the program and the model have a label that is referred to much 105 // more often than the next most refered-to label, it is likely the two 106 // labels correspond. 107 // 108 // * If a label from the program corresponds to a label in the model, it is 109 // likely that the labels near the corresponding labels also match. A 110 // conservative way of extending the match is to assign only those labels 111 // which have exactly the same address offset and reference count. 112 // 113 // * If two labels correspond, then we can try to match up the references 114 // before and after the labels in the reference stream. For this to be 115 // practical, the number of references has to be small, e.g. each label has 116 // exactly one reference. 117 // 118 119 // Note: we also tried a completely different approach: random assignment 120 // followed by simulated annealing. This produced similar results. The results 121 // were not as good for very small differences because the simulated annealing 122 // never quite hit the groove. And simulated annealing was several orders of 123 // magnitude slower. 124 125 126 // TRIE node for suffix strings in the label reference sequence. 127 // 128 // We dynamically build a trie for both the program and model, growing the trie 129 // as necessary. The trie node for a (possibly) empty string of label 130 // references contains the distribution of labels following the string. The 131 // roots node (for the empty string) thus contains the simple distribution of 132 // labels within the label reference stream. 133 134 struct Node { 135 Node(LabelInfo* in_edge, Node* prev) 136 : in_edge_(in_edge), prev_(prev), count_(0), 137 in_queue_(false) { 138 length_ = 1 + (prev_ ? prev_->length_ : 0); 139 } 140 LabelInfo* in_edge_; // 141 Node* prev_; // Node at shorter length. 142 int count_; // Frequency of this path in Trie. 143 int length_; 144 typedef std::map<LabelInfo*, Node*> Edges; 145 Edges edges_; 146 std::vector<int> places_; // Indexes into sequence of this item. 147 std::list<Node*> edges_in_frequency_order; 148 149 bool in_queue_; 150 bool Extended() const { return !edges_.empty(); } 151 152 uint32 Weight() const { 153 return edges_in_frequency_order.front()->count_; 154 } 155 }; 156 157 static std::string ToString(Node* node) { 158 std::vector<std::string> prefix; 159 for (Node* n = node; n->prev_; n = n->prev_) 160 prefix.push_back(ToString(n->in_edge_)); 161 162 std::string s; 163 s += "{"; 164 const char* sep = ""; 165 while (!prefix.empty()) { 166 s += sep; 167 sep = ","; 168 s += prefix.back(); 169 prefix.pop_back(); 170 } 171 172 s += base::StringPrintf("%u", node->count_); 173 s += " @"; 174 s += base::Uint64ToString(node->edges_in_frequency_order.size()); 175 s += "}"; 176 return s; 177 } 178 179 typedef std::vector<LabelInfo*> Trace; 180 181 struct OrderNodeByCountDecreasing { 182 bool operator()(Node* a, Node* b) const { 183 if (a->count_ != b->count_) 184 return (a->count_) > (b->count_); 185 return a->places_.at(0) < b->places_.at(0); // Prefer first occuring. 186 } 187 }; 188 189 struct OrderNodeByWeightDecreasing { 190 bool operator()(Node* a, Node* b) const { 191 // (Maybe tie-break on total count, followed by lowest assigned node indexes 192 // in path.) 193 uint32 a_weight = a->Weight(); 194 uint32 b_weight = b->Weight(); 195 if (a_weight != b_weight) 196 return a_weight > b_weight; 197 if (a->length_ != b->length_) 198 return a->length_ > b->length_; // Prefer longer. 199 return a->places_.at(0) < b->places_.at(0); // Prefer first occuring. 200 } 201 }; 202 203 typedef std::set<Node*, OrderNodeByWeightDecreasing> NodeQueue; 204 205 class AssignmentProblem { 206 public: 207 AssignmentProblem(const Trace& model, 208 const Trace& problem) 209 : m_trace_(model), 210 p_trace_(problem), 211 m_root_(NULL), 212 p_root_(NULL) { 213 } 214 215 ~AssignmentProblem() { 216 for (size_t i = 0; i < all_nodes_.size(); ++i) 217 delete all_nodes_[i]; 218 } 219 220 bool Solve() { 221 m_root_ = MakeRootNode(m_trace_); 222 p_root_ = MakeRootNode(p_trace_); 223 AddToQueue(p_root_); 224 225 while (!worklist_.empty()) { 226 Node* node = *worklist_.begin(); 227 node->in_queue_ = false; 228 worklist_.erase(node); 229 TrySolveNode(node); 230 } 231 232 VLOG(2) << unsolved_.size() << " unsolved items"; 233 return true; 234 } 235 236 private: 237 void AddToQueue(Node* node) { 238 if (node->length_ >= 10) { 239 VLOG(4) << "Length clipped " << ToString(node->prev_); 240 return; 241 } 242 if (node->in_queue_) { 243 LOG(ERROR) << "Double add " << ToString(node); 244 return; 245 } 246 // just to be sure data for prioritizing is available 247 ExtendNode(node, p_trace_); 248 // SkipCommittedLabels(node); 249 if (node->edges_in_frequency_order.empty()) 250 return; 251 node->in_queue_ = true; 252 worklist_.insert(node); 253 } 254 255 void SkipCommittedLabels(Node* node) { 256 ExtendNode(node, p_trace_); 257 uint32 skipped = 0; 258 while (!node->edges_in_frequency_order.empty() && 259 node->edges_in_frequency_order.front()->in_edge_->assignment_) { 260 ++skipped; 261 node->edges_in_frequency_order.pop_front(); 262 } 263 if (skipped > 0) 264 VLOG(4) << "Skipped " << skipped << " at " << ToString(node); 265 } 266 267 void TrySolveNode(Node* p_node) { 268 Node* front = p_node->edges_in_frequency_order.front(); 269 if (front->in_edge_->assignment_) { 270 p_node->edges_in_frequency_order.pop_front(); 271 AddToQueue(front); 272 AddToQueue(p_node); 273 return; 274 } 275 276 // Compare frequencies of unassigned edges, and either make 277 // assignment(s) or move node to unsolved list 278 279 Node* m_node = FindModelNode(p_node); 280 281 if (m_node == NULL) { 282 VLOG(2) << "Can't find model node"; 283 unsolved_.insert(p_node); 284 return; 285 } 286 ExtendNode(m_node, m_trace_); 287 288 // Lets just try greedy 289 290 SkipCommittedLabels(m_node); 291 if (m_node->edges_in_frequency_order.empty()) { 292 VLOG(4) << "Punting, no elements left in model vs " 293 << p_node->edges_in_frequency_order.size(); 294 unsolved_.insert(p_node); 295 return; 296 } 297 Node* m_match = m_node->edges_in_frequency_order.front(); 298 Node* p_match = p_node->edges_in_frequency_order.front(); 299 300 if (p_match->count_ > 1.1 * m_match->count_ || 301 m_match->count_ > 1.1 * p_match->count_) { 302 VLOG(3) << "Tricky distribution " 303 << p_match->count_ << ":" << m_match->count_ << " " 304 << ToString(p_match) << " vs " << ToString(m_match); 305 return; 306 } 307 308 m_node->edges_in_frequency_order.pop_front(); 309 p_node->edges_in_frequency_order.pop_front(); 310 311 LabelInfo* p_label_info = p_match->in_edge_; 312 LabelInfo* m_label_info = m_match->in_edge_; 313 int m_index = p_label_info->label_->index_; 314 if (m_index != Label::kNoIndex) { 315 VLOG(2) << "Cant use unassigned label from model " << m_index; 316 unsolved_.insert(p_node); 317 return; 318 } 319 320 Assign(p_label_info, m_label_info); 321 322 AddToQueue(p_match); // find matches within new match 323 AddToQueue(p_node); // and more matches within this node 324 } 325 326 void Assign(LabelInfo* p_info, LabelInfo* m_info) { 327 AssignOne(p_info, m_info); 328 VLOG(4) << "Assign " << ToString(p_info) << " := " << ToString(m_info); 329 // Now consider unassigned adjacent addresses 330 TryExtendAssignment(p_info, m_info); 331 } 332 333 void AssignOne(LabelInfo* p_info, LabelInfo* m_info) { 334 p_info->label_->index_ = m_info->label_->index_; 335 336 // Mark as assigned 337 m_info->assignment_ = p_info; 338 p_info->assignment_ = m_info; 339 } 340 341 void TryExtendAssignment(LabelInfo* p_info, LabelInfo* m_info) { 342 RVA m_rva_base = m_info->label_->rva_; 343 RVA p_rva_base = p_info->label_->rva_; 344 345 LabelInfo* m_info_next = m_info->next_addr_; 346 LabelInfo* p_info_next = p_info->next_addr_; 347 for ( ; m_info_next && p_info_next; ) { 348 if (m_info_next->assignment_) 349 break; 350 351 RVA m_rva = m_info_next->label_->rva_; 352 RVA p_rva = p_info_next->label_->rva_; 353 354 if (m_rva - m_rva_base != p_rva - p_rva_base) { 355 // previous label was pointing to something that is different size 356 break; 357 } 358 LabelInfo* m_info_next_next = m_info_next->next_addr_; 359 LabelInfo* p_info_next_next = p_info_next->next_addr_; 360 if (m_info_next_next && p_info_next_next) { 361 RVA m_rva_next = m_info_next_next->label_->rva_; 362 RVA p_rva_next = p_info_next_next->label_->rva_; 363 if (m_rva_next - m_rva != p_rva_next - p_rva) { 364 // Since following labels are no longer in address lockstep, assume 365 // this address has a difference. 366 break; 367 } 368 } 369 370 // The label has inconsistent numbers of references, it is probably not 371 // the same thing. 372 if (m_info_next->refs_ != p_info_next->refs_) { 373 break; 374 } 375 376 VLOG(4) << " Extending assignment -> " 377 << ToString(p_info_next) << " := " << ToString(m_info_next); 378 379 AssignOne(p_info_next, m_info_next); 380 381 if (p_info_next->refs_ == m_info_next->refs_ && 382 p_info_next->refs_ == 1) { 383 TryExtendSequence(p_info_next->positions_[0], 384 m_info_next->positions_[0]); 385 TryExtendSequenceBackwards(p_info_next->positions_[0], 386 m_info_next->positions_[0]); 387 } 388 389 p_info_next = p_info_next_next; 390 m_info_next = m_info_next_next; 391 } 392 393 LabelInfo* m_info_prev = m_info->prev_addr_; 394 LabelInfo* p_info_prev = p_info->prev_addr_; 395 for ( ; m_info_prev && p_info_prev; ) { 396 if (m_info_prev->assignment_) 397 break; 398 399 RVA m_rva = m_info_prev->label_->rva_; 400 RVA p_rva = p_info_prev->label_->rva_; 401 402 if (m_rva - m_rva_base != p_rva - p_rva_base) { 403 // previous label was pointing to something that is different size 404 break; 405 } 406 LabelInfo* m_info_prev_prev = m_info_prev->prev_addr_; 407 LabelInfo* p_info_prev_prev = p_info_prev->prev_addr_; 408 409 // The the label has inconsistent numbers of references, it is 410 // probably not the same thing 411 if (m_info_prev->refs_ != p_info_prev->refs_) { 412 break; 413 } 414 415 AssignOne(p_info_prev, m_info_prev); 416 VLOG(4) << " Extending assignment <- " << ToString(p_info_prev) << " := " 417 << ToString(m_info_prev); 418 419 p_info_prev = p_info_prev_prev; 420 m_info_prev = m_info_prev_prev; 421 } 422 } 423 424 uint32 TryExtendSequence(uint32 p_pos_start, uint32 m_pos_start) { 425 uint32 p_pos = p_pos_start + 1; 426 uint32 m_pos = m_pos_start + 1; 427 428 while (p_pos < p_trace_.size() && m_pos < m_trace_.size()) { 429 LabelInfo* p_info = p_trace_[p_pos]; 430 LabelInfo* m_info = m_trace_[m_pos]; 431 432 // To match, either (1) both are assigned or (2) both are unassigned. 433 if ((p_info->assignment_ == NULL) != (m_info->assignment_ == NULL)) 434 break; 435 436 // If they are assigned, it needs to be consistent (same index). 437 if (p_info->assignment_ && m_info->assignment_) { 438 if (p_info->label_->index_ != m_info->label_->index_) 439 break; 440 ++p_pos; 441 ++m_pos; 442 continue; 443 } 444 445 if (p_info->refs_ != m_info->refs_) 446 break; 447 448 AssignOne(p_info, m_info); 449 VLOG(4) << " Extending assignment seq[+" << p_pos - p_pos_start 450 << "] -> " << ToString(p_info) << " := " << ToString(m_info); 451 452 ++p_pos; 453 ++m_pos; 454 } 455 456 return p_pos - p_pos_start; 457 } 458 459 uint32 TryExtendSequenceBackwards(uint32 p_pos_start, uint32 m_pos_start) { 460 if (p_pos_start == 0 || m_pos_start == 0) 461 return 0; 462 463 uint32 p_pos = p_pos_start - 1; 464 uint32 m_pos = m_pos_start - 1; 465 466 while (p_pos > 0 && m_pos > 0) { 467 LabelInfo* p_info = p_trace_[p_pos]; 468 LabelInfo* m_info = m_trace_[m_pos]; 469 470 if ((p_info->assignment_ == NULL) != (m_info->assignment_ == NULL)) 471 break; 472 473 if (p_info->assignment_ && m_info->assignment_) { 474 if (p_info->label_->index_ != m_info->label_->index_) 475 break; 476 --p_pos; 477 --m_pos; 478 continue; 479 } 480 481 if (p_info->refs_ != m_info->refs_) 482 break; 483 484 AssignOne(p_info, m_info); 485 VLOG(4) << " Extending assignment seq[-" << p_pos_start - p_pos 486 << "] <- " << ToString(p_info) << " := " << ToString(m_info); 487 488 --p_pos; 489 --m_pos; 490 } 491 492 return p_pos - p_pos_start; 493 } 494 495 Node* FindModelNode(Node* node) { 496 if (node->prev_ == NULL) 497 return m_root_; 498 499 Node* m_parent = FindModelNode(node->prev_); 500 if (m_parent == NULL) { 501 return NULL; 502 } 503 504 ExtendNode(m_parent, m_trace_); 505 506 LabelInfo* p_label = node->in_edge_; 507 LabelInfo* m_label = p_label->assignment_; 508 if (m_label == NULL) { 509 VLOG(2) << "Expected assigned prefix"; 510 return NULL; 511 } 512 513 Node::Edges::iterator e = m_parent->edges_.find(m_label); 514 if (e == m_parent->edges_.end()) { 515 VLOG(3) << "Expected defined edge in parent"; 516 return NULL; 517 } 518 519 return e->second; 520 } 521 522 Node* MakeRootNode(const Trace& trace) { 523 Node* node = new Node(NULL, NULL); 524 all_nodes_.push_back(node); 525 for (uint32 i = 0; i < trace.size(); ++i) { 526 ++node->count_; 527 node->places_.push_back(i); 528 } 529 return node; 530 } 531 532 void ExtendNode(Node* node, const Trace& trace) { 533 // Make sure trie is filled in at this node. 534 if (node->Extended()) 535 return; 536 for (size_t i = 0; i < node->places_.size(); ++i) { 537 uint32 index = node->places_.at(i); 538 if (index < trace.size()) { 539 LabelInfo* item = trace.at(index); 540 Node*& slot = node->edges_[item]; 541 if (slot == NULL) { 542 slot = new Node(item, node); 543 all_nodes_.push_back(slot); 544 node->edges_in_frequency_order.push_back(slot); 545 } 546 slot->places_.push_back(index + 1); 547 ++slot->count_; 548 } 549 } 550 node->edges_in_frequency_order.sort(OrderNodeByCountDecreasing()); 551 } 552 553 const Trace& m_trace_; 554 const Trace& p_trace_; 555 Node* m_root_; 556 Node* p_root_; 557 558 NodeQueue worklist_; 559 NodeQueue unsolved_; 560 561 std::vector<Node*> all_nodes_; 562 563 DISALLOW_COPY_AND_ASSIGN(AssignmentProblem); 564 }; 565 566 class GraphAdjuster : public AdjustmentMethod { 567 public: 568 GraphAdjuster() 569 : prog_(NULL), 570 model_(NULL), 571 debug_label_index_gen_(0) {} 572 ~GraphAdjuster() {} 573 574 bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) { 575 VLOG(1) << "GraphAdjuster::Adjust"; 576 prog_ = program; 577 model_ = &model; 578 debug_label_index_gen_ = 0; 579 return Finish(); 580 } 581 582 bool Finish() { 583 prog_->UnassignIndexes(); 584 CollectTraces(model_, &model_abs32_, &model_rel32_, true); 585 CollectTraces(prog_, &prog_abs32_, &prog_rel32_, false); 586 Solve(model_abs32_, prog_abs32_); 587 Solve(model_rel32_, prog_rel32_); 588 prog_->AssignRemainingIndexes(); 589 return true; 590 } 591 592 private: 593 594 void CollectTraces(const AssemblyProgram* program, Trace* abs32, Trace* rel32, 595 bool is_model) { 596 const InstructionVector& instructions = program->instructions(); 597 for (size_t i = 0; i < instructions.size(); ++i) { 598 Instruction* instruction = instructions[i]; 599 if (Label* label = program->InstructionAbs32Label(instruction)) 600 ReferenceLabel(abs32, label, is_model); 601 if (Label* label = program->InstructionRel32Label(instruction)) 602 ReferenceLabel(rel32, label, is_model); 603 } 604 // TODO(sra): we could simply append all the labels in index order to 605 // incorporate some costing for entropy (bigger deltas) that will be 606 // introduced into the label address table by non-monotonic ordering. This 607 // would have some knock-on effects to parts of the algorithm that work on 608 // single-occurrence labels. 609 } 610 611 void Solve(const Trace& model, const Trace& problem) { 612 LinkLabelInfos(model); 613 LinkLabelInfos(problem); 614 AssignmentProblem a(model, problem); 615 a.Solve(); 616 } 617 618 void LinkLabelInfos(const Trace& trace) { 619 typedef std::set<LabelInfo*, OrderLabelInfoByAddressAscending> Ordered; 620 Ordered ordered; 621 for (Trace::const_iterator p = trace.begin(); p != trace.end(); ++p) 622 ordered.insert(*p); 623 LabelInfo* prev = NULL; 624 for (Ordered::iterator p = ordered.begin(); p != ordered.end(); ++p) { 625 LabelInfo* curr = *p; 626 if (prev) prev->next_addr_ = curr; 627 curr->prev_addr_ = prev; 628 prev = curr; 629 630 if (curr->positions_.size() != curr->refs_) 631 NOTREACHED(); 632 } 633 } 634 635 void ReferenceLabel(Trace* trace, Label* label, bool is_model) { 636 trace->push_back(MakeLabelInfo(label, is_model, 637 static_cast<uint32>(trace->size()))); 638 } 639 640 LabelInfo* MakeLabelInfo(Label* label, bool is_model, uint32 position) { 641 LabelInfo& slot = label_infos_[label]; 642 if (slot.label_ == NULL) { 643 slot.label_ = label; 644 slot.is_model_ = is_model; 645 slot.debug_index_ = ++debug_label_index_gen_; 646 } 647 slot.positions_.push_back(position); 648 ++slot.refs_; 649 return &slot; 650 } 651 652 AssemblyProgram* prog_; // Program to be adjusted, owned by caller. 653 const AssemblyProgram* model_; // Program to be mimicked, owned by caller. 654 655 Trace model_abs32_; 656 Trace model_rel32_; 657 Trace prog_abs32_; 658 Trace prog_rel32_; 659 660 int debug_label_index_gen_; 661 662 // Note LabelInfo is allocated inside map, so the LabelInfo lifetimes are 663 // managed by the map. 664 std::map<Label*, LabelInfo> label_infos_; 665 666 private: 667 DISALLOW_COPY_AND_ASSIGN(GraphAdjuster); 668 }; 669 670 671 //////////////////////////////////////////////////////////////////////////////// 672 673 void AdjustmentMethod::Destroy() { delete this; } 674 675 AdjustmentMethod* AdjustmentMethod::MakeNullAdjustmentMethod() { 676 return new NullAdjustmentMethod(); 677 } 678 679 AdjustmentMethod* AdjustmentMethod::MakeTrieAdjustmentMethod() { 680 return new GraphAdjuster(); 681 } 682 683 Status Adjust(const AssemblyProgram& model, AssemblyProgram* program) { 684 AdjustmentMethod* method = AdjustmentMethod::MakeProductionAdjustmentMethod(); 685 bool ok = method->Adjust(model, program); 686 method->Destroy(); 687 if (ok) 688 return C_OK; 689 else 690 return C_ADJUSTMENT_FAILED; 691 } 692 693 } // namespace courgette 694