Home | History | Annotate | Download | only in optimizing
      1 /*
      2  * Copyright (C) 2016 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_H_
     18 #define ART_COMPILER_OPTIMIZING_SCHEDULER_H_
     19 
     20 #include <fstream>
     21 
     22 #include "base/time_utils.h"
     23 #include "driver/compiler_driver.h"
     24 #include "load_store_analysis.h"
     25 #include "nodes.h"
     26 #include "optimization.h"
     27 #include "code_generator.h"
     28 
     29 namespace art {
     30 
     31 // General description of instruction scheduling.
     32 //
     33 // This pass tries to improve the quality of the generated code by reordering
     34 // instructions in the graph to avoid execution delays caused by execution
     35 // dependencies.
     36 // Currently, scheduling is performed at the block level, so no `HInstruction`
     37 // ever leaves its block in this pass.
     38 //
     39 // The scheduling process iterates through blocks in the graph. For blocks that
     40 // we can and want to schedule:
     41 // 1) Build a dependency graph for instructions.
     42 //    It includes data dependencies (inputs/uses), but also environment
     43 //    dependencies and side-effect dependencies.
     44 // 2) Schedule the dependency graph.
     45 //    This is a topological sort of the dependency graph, using heuristics to
//    decide what node to schedule first when there are multiple candidates.
     47 //
     48 // A few factors impacting the quality of the scheduling are:
     49 // - The heuristics used to decide what node to schedule in the topological sort
     50 //   when there are multiple valid candidates. There is a wide range of
     51 //   complexity possible here, going from a simple model only considering
     52 //   latencies, to a super detailed CPU pipeline model.
     53 // - Fewer dependencies in the dependency graph give more freedom for the
     54 //   scheduling heuristics. For example de-aliasing can allow possibilities for
     55 //   reordering of memory accesses.
     56 // - The level of abstraction of the IR. It is easier to evaluate scheduling for
     57 //   IRs that translate to a single assembly instruction than for IRs
     58 //   that generate multiple assembly instructions or generate different code
     59 //   depending on properties of the IR.
     60 // - Scheduling is performed before register allocation, it is not aware of the
     61 //   impact of moving instructions on register allocation.
     62 //
     63 //
     64 // The scheduling code uses the terms predecessors, successors, and dependencies.
     65 // This can be confusing at times, so here are clarifications.
     66 // These terms are used from the point of view of the program dependency graph. So
// the inputs of an instruction are part of its dependencies, and hence part of its
     68 // predecessors. So the uses of an instruction are (part of) its successors.
     69 // (Side-effect dependencies can yield predecessors or successors that are not
     70 // inputs or uses.)
     71 //
     72 // Here is a trivial example. For the Java code:
     73 //
     74 //    int a = 1 + 2;
     75 //
     76 // we would have the instructions
     77 //
     78 //    i1 HIntConstant 1
     79 //    i2 HIntConstant 2
     80 //    i3 HAdd [i1,i2]
     81 //
     82 // `i1` and `i2` are predecessors of `i3`.
     83 // `i3` is a successor of `i1` and a successor of `i2`.
     84 // In a scheduling graph for this code we would have three nodes `n1`, `n2`,
// and `n3` (respectively for instructions `i1`, `i2`, and `i3`).
     86 // Conceptually the program dependency graph for this would contain two edges
     87 //
     88 //    n1 -> n3
     89 //    n2 -> n3
     90 //
     91 // Since we schedule backwards (starting from the last instruction in each basic
// block), the implementation of nodes keeps a list of pointers to their
     93 // predecessors. So `n3` would keep pointers to its predecessors `n1` and `n2`.
     94 //
     95 // Node dependencies are also referred to from the program dependency graph
     96 // point of view: we say that node `B` immediately depends on `A` if there is an
     97 // edge from `A` to `B` in the program dependency graph. `A` is a predecessor of
     98 // `B`, `B` is a successor of `A`. In the example above `n3` depends on `n1` and
     99 // `n2`.
    100 // Since nodes in the scheduling graph keep a list of their predecessors, node
    101 // `B` will have a pointer to its predecessor `A`.
    102 // As we schedule backwards, `B` will be selected for scheduling before `A` is.
    103 //
// So the scheduling for the example above could happen as follows
    105 //
    106 //    |---------------------------+------------------------|
    107 //    | candidates for scheduling | instructions scheduled |
    108 //    | --------------------------+------------------------|
    109 //
    110 // The only node without successors is `n3`, so it is the only initial
    111 // candidate.
    112 //
    113 //    | n3                        | (none)                 |
    114 //
    115 // We schedule `n3` as the last (and only) instruction. All its predecessors
// that do not have any unscheduled successors become candidates. That is, `n1`
    117 // and `n2` become candidates.
    118 //
    119 //    | n1, n2                    | n3                     |
    120 //
    121 // One of the candidates is selected. In practice this is where scheduling
    122 // heuristics kick in, to decide which of the candidates should be selected.
    123 // In this example, let it be `n1`. It is scheduled before previously scheduled
    124 // nodes (in program order). There are no other nodes to add to the list of
    125 // candidates.
    126 //
    127 //    | n2                        | n1                     |
    128 //    |                           | n3                     |
    129 //
    130 // The only candidate available for scheduling is `n2`. Schedule it before
    131 // (in program order) the previously scheduled nodes.
    132 //
    133 //    | (none)                    | n2                     |
    134 //    |                           | n1                     |
    135 //    |                           | n3                     |
    136 //    |---------------------------+------------------------|
    137 //
    138 // So finally the instructions will be executed in the order `i2`, `i1`, and `i3`.
    139 // In this trivial example, it does not matter which of `i1` and `i2` is
    140 // scheduled first since they are constants. However the same process would
    141 // apply if `i1` and `i2` were actual operations (for example `HMul` and `HDiv`).
    142 
    143 // Set to true to have instruction scheduling dump scheduling graphs to the file
    144 // `scheduling_graphs.dot`. See `SchedulingGraph::DumpAsDotGraph()`.
    145 static constexpr bool kDumpDotSchedulingGraphs = false;
    146 
    147 // Typically used as a default instruction latency.
    148 static constexpr uint32_t kGenericInstructionLatency = 1;
    149 
    150 class HScheduler;
    151 
/**
 * A node representing an `HInstruction` in the `SchedulingGraph`.
 */
class SchedulingNode : public ArenaObject<kArenaAllocScheduler> {
 public:
  SchedulingNode(HInstruction* instr, ArenaAllocator* arena, bool is_scheduling_barrier)
      : latency_(0),
        internal_latency_(0),
        critical_path_(0),
        instruction_(instr),
        is_scheduling_barrier_(is_scheduling_barrier),
        data_predecessors_(arena->Adapter(kArenaAllocScheduler)),
        other_predecessors_(arena->Adapter(kArenaAllocScheduler)),
        num_unscheduled_successors_(0) {
    // Most nodes have only a few data predecessors; reserving up front avoids
    // repeated early re-allocations of the arena vector.
    data_predecessors_.reserve(kPreallocatedPredecessors);
  }

  // Record a data (input/use) dependency: this node depends on `predecessor`,
  // which therefore gains one more unscheduled successor.
  void AddDataPredecessor(SchedulingNode* predecessor) {
    data_predecessors_.push_back(predecessor);
    predecessor->num_unscheduled_successors_++;
  }

  // Record a non-data dependency (side effect, environment, or scheduling
  // barrier); the successor bookkeeping mirrors `AddDataPredecessor()`.
  void AddOtherPredecessor(SchedulingNode* predecessor) {
    other_predecessors_.push_back(predecessor);
    predecessor->num_unscheduled_successors_++;
  }

  // Called when one of this node's successors has been scheduled. When the
  // count reaches zero this node becomes a valid candidate for scheduling.
  void DecrementNumberOfUnscheduledSuccessors() {
    num_unscheduled_successors_--;
  }

  // Keep the largest critical-path value seen so far.
  void MaybeUpdateCriticalPath(uint32_t other_critical_path) {
    critical_path_ = std::max(critical_path_, other_critical_path);
  }

  bool HasUnscheduledSuccessors() const {
    return num_unscheduled_successors_ != 0;
  }

  HInstruction* GetInstruction() const { return instruction_; }
  uint32_t GetLatency() const { return latency_; }
  void SetLatency(uint32_t latency) { latency_ = latency; }
  uint32_t GetInternalLatency() const { return internal_latency_; }
  void SetInternalLatency(uint32_t internal_latency) { internal_latency_ = internal_latency; }
  uint32_t GetCriticalPath() const { return critical_path_; }
  bool IsSchedulingBarrier() const { return is_scheduling_barrier_; }
  const ArenaVector<SchedulingNode*>& GetDataPredecessors() const { return data_predecessors_; }
  const ArenaVector<SchedulingNode*>& GetOtherPredecessors() const { return other_predecessors_; }

 private:
  // The latency of this node. It represents the latency between the moment the
  // last instruction for this node has executed to the moment the result
  // produced by this node is available to users.
  uint32_t latency_;
  // This represents the time spent *within* the generated code for this node.
  // It should be zero for nodes that only generate a single instruction.
  uint32_t internal_latency_;

  // The critical path from this instruction to the end of scheduling. It is
  // used by the scheduling heuristics to measure the priority of this instruction.
  // It is defined as
  //     critical_path_ = latency_ + max((use.internal_latency_ + use.critical_path_) for all uses)
  // (Note that here 'uses' is equivalent to 'data successors'. Also see comments in
  // `HScheduler::Schedule(SchedulingNode* scheduling_node)`).
  uint32_t critical_path_;

  // The instruction that this node represents.
  HInstruction* const instruction_;

  // If a node is a scheduling barrier, other nodes cannot be scheduled before it.
  const bool is_scheduling_barrier_;

  // The lists of predecessors. They cannot be scheduled before this node. Once
  // this node is scheduled, we check whether any of its predecessors has become a
  // valid candidate for scheduling.
  // Predecessors in `data_predecessors_` are data dependencies. Those in
  // `other_predecessors_` contain side-effect dependencies, environment
  // dependencies, and scheduling barrier dependencies.
  ArenaVector<SchedulingNode*> data_predecessors_;
  ArenaVector<SchedulingNode*> other_predecessors_;

  // The number of unscheduled successors for this node. This number is
  // decremented as successors are scheduled. When it reaches zero this node
  // becomes a valid candidate to schedule.
  uint32_t num_unscheduled_successors_;

  static constexpr size_t kPreallocatedPredecessors = 4;
};
    240 
/*
 * Directed acyclic graph for scheduling.
 */
class SchedulingGraph : public ValueObject {
 public:
  SchedulingGraph(const HScheduler* scheduler, ArenaAllocator* arena)
      : scheduler_(scheduler),
        arena_(arena),
        contains_scheduling_barrier_(false),
        nodes_map_(arena_->Adapter(kArenaAllocScheduler)),
        heap_location_collector_(nullptr) {}

  // Create a `SchedulingNode` for `instr`, register it in the
  // instruction-to-node map, and add its dependency edges via
  // `AddDependencies()`. Note: does not check whether `instr` was already added.
  SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
    SchedulingNode* node = new (arena_) SchedulingNode(instr, arena_, is_scheduling_barrier);
    nodes_map_.Insert(std::make_pair(instr, node));
    contains_scheduling_barrier_ |= is_scheduling_barrier;
    AddDependencies(instr, is_scheduling_barrier);
    return node;
  }

  // Reset the graph so it can be reused (e.g. for another basic block). The
  // nodes themselves are arena-allocated, so no explicit deallocation is needed.
  void Clear() {
    nodes_map_.Clear();
    contains_scheduling_barrier_ = false;
  }

  // Provide alias-analysis results, used to prune memory dependencies.
  void SetHeapLocationCollector(const HeapLocationCollector& heap_location_collector) {
    heap_location_collector_ = &heap_location_collector;
  }

  // Return the node for `instr`, or nullptr if `instr` is not in this graph.
  SchedulingNode* GetNode(const HInstruction* instr) const {
    auto it = nodes_map_.Find(instr);
    if (it == nodes_map_.end()) {
      return nullptr;
    } else {
      return it->second;
    }
  }

  // Whether `instruction` is a scheduling barrier; classification is delegated
  // to the owning scheduler (defined out-of-line, after `HScheduler`).
  bool IsSchedulingBarrier(const HInstruction* instruction) const;

  // Direct-edge dependency queries ("immediate" means a direct edge in the
  // graph, not a transitive path), on nodes or directly on instructions.
  bool HasImmediateDataDependency(const SchedulingNode* node, const SchedulingNode* other) const;
  bool HasImmediateDataDependency(const HInstruction* node, const HInstruction* other) const;
  bool HasImmediateOtherDependency(const SchedulingNode* node, const SchedulingNode* other) const;
  bool HasImmediateOtherDependency(const HInstruction* node, const HInstruction* other) const;

  // Number of nodes currently in the graph.
  size_t Size() const {
    return nodes_map_.Size();
  }

  // Dump the scheduling graph, in dot file format, appending it to the file
  // `scheduling_graphs.dot`.
  void DumpAsDotGraph(const std::string& description,
                      const ArenaVector<SchedulingNode*>& initial_candidates);

 protected:
  // Add an edge `dependency` -> `node` of the requested kind.
  void AddDependency(SchedulingNode* node, SchedulingNode* dependency, bool is_data_dependency);
  void AddDataDependency(SchedulingNode* node, SchedulingNode* dependency) {
    AddDependency(node, dependency, /*is_data_dependency*/true);
  }
  void AddOtherDependency(SchedulingNode* node, SchedulingNode* dependency) {
    AddDependency(node, dependency, /*is_data_dependency*/false);
  }
  // Dependency/aliasing queries used while building edges.
  bool HasMemoryDependency(const HInstruction* node, const HInstruction* other) const;
  bool HasExceptionDependency(const HInstruction* node, const HInstruction* other) const;
  bool HasSideEffectDependency(const HInstruction* node, const HInstruction* other) const;
  bool ArrayAccessMayAlias(const HInstruction* node, const HInstruction* other) const;
  bool FieldAccessMayAlias(const HInstruction* node, const HInstruction* other) const;
  size_t ArrayAccessHeapLocation(HInstruction* array, HInstruction* index) const;
  size_t FieldAccessHeapLocation(HInstruction* obj, const FieldInfo* field) const;

  // Add dependencies nodes for the given `HInstruction`: inputs, environments, and side-effects.
  void AddDependencies(HInstruction* instruction, bool is_scheduling_barrier = false);

  const HScheduler* const scheduler_;

  ArenaAllocator* const arena_;

  // True once any node added so far was a scheduling barrier.
  bool contains_scheduling_barrier_;

  // Maps each instruction to its scheduling node.
  ArenaHashMap<const HInstruction*, SchedulingNode*> nodes_map_;

  // Alias-analysis results; nullptr until `SetHeapLocationCollector()` is called.
  const HeapLocationCollector* heap_location_collector_;
};
    324 
    325 /*
    326  * The visitors derived from this base class are used by schedulers to evaluate
    327  * the latencies of `HInstruction`s.
    328  */
    329 class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
    330  public:
    331   // This class and its sub-classes will never be used to drive a visit of an
    332   // `HGraph` but only to visit `HInstructions` one at a time, so we do not need
    333   // to pass a valid graph to `HGraphDelegateVisitor()`.
    334   SchedulingLatencyVisitor()
    335       : HGraphDelegateVisitor(nullptr),
    336         last_visited_latency_(0),
    337         last_visited_internal_latency_(0) {}
    338 
    339   void VisitInstruction(HInstruction* instruction) OVERRIDE {
    340     LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". "
    341         "Architecture-specific scheduling latency visitors must handle all instructions"
    342         " (potentially by overriding the generic `VisitInstruction()`.";
    343     UNREACHABLE();
    344   }
    345 
    346   void Visit(HInstruction* instruction) {
    347     instruction->Accept(this);
    348   }
    349 
    350   void CalculateLatency(SchedulingNode* node) {
    351     // By default nodes have no internal latency.
    352     last_visited_internal_latency_ = 0;
    353     Visit(node->GetInstruction());
    354   }
    355 
    356   uint32_t GetLastVisitedLatency() const { return last_visited_latency_; }
    357   uint32_t GetLastVisitedInternalLatency() const { return last_visited_internal_latency_; }
    358 
    359  protected:
    360   // The latency of the most recent visited SchedulingNode.
    361   // This is for reporting the latency value to the user of this visitor.
    362   uint32_t last_visited_latency_;
    363   // This represents the time spent *within* the generated code for the most recent visited
    364   // SchedulingNode. This is for reporting the internal latency value to the user of this visitor.
    365   uint32_t last_visited_internal_latency_;
    366 };
    367 
    368 class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
    369  public:
    370   virtual SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
    371                                                  const SchedulingGraph& graph) = 0;
    372   virtual ~SchedulingNodeSelector() {}
    373  protected:
    374   static void DeleteNodeAtIndex(ArenaVector<SchedulingNode*>* nodes, size_t index) {
    375     (*nodes)[index] = nodes->back();
    376     nodes->pop_back();
    377   }
    378 };
    379 
    380 /*
    381  * Select a `SchedulingNode` at random within the candidates.
    382  */
    383 class RandomSchedulingNodeSelector : public SchedulingNodeSelector {
    384  public:
    385   explicit RandomSchedulingNodeSelector() : seed_(0) {
    386     seed_  = static_cast<uint32_t>(NanoTime());
    387     srand(seed_);
    388   }
    389 
    390   SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
    391                                          const SchedulingGraph& graph) OVERRIDE {
    392     UNUSED(graph);
    393     DCHECK(!nodes->empty());
    394     size_t select = rand_r(&seed_) % nodes->size();
    395     SchedulingNode* select_node = (*nodes)[select];
    396     DeleteNodeAtIndex(nodes, select);
    397     return select_node;
    398   }
    399 
    400   uint32_t seed_;
    401 };
    402 
/*
 * Select a `SchedulingNode` according to critical path information,
 * with heuristics to favor certain instruction patterns like materialized condition.
 */
class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
 public:
  CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}

  SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
                                         const SchedulingGraph& graph) OVERRIDE;

 protected:
  // Return whichever of the two nodes the critical-path heuristic prefers.
  SchedulingNode* GetHigherPrioritySchedulingNode(SchedulingNode* candidate,
                                                  SchedulingNode* check) const;

  // Heuristic for materialized-condition patterns; presumably tries to keep a
  // condition close to its user -- see the definition in the .cc file.
  SchedulingNode* SelectMaterializedCondition(ArenaVector<SchedulingNode*>* nodes,
                                               const SchedulingGraph& graph) const;

 private:
  // The node returned by the previous `PopHighestPriorityNode()` call, used as
  // context by the selection heuristics.
  const SchedulingNode* prev_select_;
};
    424 
// Base class driving the scheduling of a graph: builds the scheduling graph per
// block, computes latencies via `latency_visitor_`, and orders instructions
// using `selector_`. Architecture-specific schedulers derive from this.
class HScheduler {
 public:
  HScheduler(ArenaAllocator* arena,
             SchedulingLatencyVisitor* latency_visitor,
             SchedulingNodeSelector* selector)
      : arena_(arena),
        latency_visitor_(latency_visitor),
        selector_(selector),
        only_optimize_loop_blocks_(true),
        scheduling_graph_(this, arena),
        cursor_(nullptr),
        candidates_(arena_->Adapter(kArenaAllocScheduler)) {}
  virtual ~HScheduler() {}

  // Schedule the blocks of `graph` that this scheduler can and wants to handle.
  void Schedule(HGraph* graph);

  void SetOnlyOptimizeLoopBlocks(bool loop_only) { only_optimize_loop_blocks_ = loop_only; }

  // Instructions can not be rescheduled across a scheduling barrier.
  virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;

 protected:
  // Per-block, per-node, and per-instruction scheduling steps.
  void Schedule(HBasicBlock* block);
  void Schedule(SchedulingNode* scheduling_node);
  void Schedule(HInstruction* instruction);

  // Any instruction returning `false` via this method will prevent its
  // containing basic block from being scheduled.
  // This method is used to restrict scheduling to instructions that we know are
  // safe to handle.
  virtual bool IsSchedulable(const HInstruction* instruction) const;
  bool IsSchedulable(const HBasicBlock* block) const;

  // Query the latency visitor for `node` and store both latency results on it.
  void CalculateLatency(SchedulingNode* node) {
    latency_visitor_->CalculateLatency(node);
    node->SetLatency(latency_visitor_->GetLastVisitedLatency());
    node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
  }

  ArenaAllocator* const arena_;
  SchedulingLatencyVisitor* const latency_visitor_;
  SchedulingNodeSelector* const selector_;
  bool only_optimize_loop_blocks_;

  // We instantiate the members below as part of this class to avoid
  // instantiating them locally for every chunk scheduled.
  SchedulingGraph scheduling_graph_;
  // A pointer indicating where the next instruction to be scheduled will be inserted.
  HInstruction* cursor_;
  // The list of candidates for scheduling. A node becomes a candidate when all
  // its predecessors have been scheduled.
  ArenaVector<SchedulingNode*> candidates_;

 private:
  DISALLOW_COPY_AND_ASSIGN(HScheduler);
};
    481 
// Defined here, after `HScheduler`, because it needs the complete type: the
// graph simply defers barrier classification to its owning scheduler.
inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction) const {
  return scheduler_->IsSchedulingBarrier(instruction);
}
    485 
    486 class HInstructionScheduling : public HOptimization {
    487  public:
    488   HInstructionScheduling(HGraph* graph, InstructionSet instruction_set, CodeGenerator* cg = nullptr)
    489       : HOptimization(graph, kInstructionScheduling),
    490         codegen_(cg),
    491         instruction_set_(instruction_set) {}
    492 
    493   void Run() {
    494     Run(/*only_optimize_loop_blocks*/ true, /*schedule_randomly*/ false);
    495   }
    496   void Run(bool only_optimize_loop_blocks, bool schedule_randomly);
    497 
    498   static constexpr const char* kInstructionScheduling = "scheduler";
    499 
    500  private:
    501   CodeGenerator* const codegen_;
    502   const InstructionSet instruction_set_;
    503   DISALLOW_COPY_AND_ASSIGN(HInstructionScheduling);
    504 };
    505 
    506 }  // namespace art
    507 
    508 #endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_H_
    509