Home | History | Annotate | Download | only in optimizing
      1 /*
      2  * Copyright (C) 2016 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_H_
     18 #define ART_COMPILER_OPTIMIZING_SCHEDULER_H_
     19 
     20 #include <fstream>
     21 
     22 #include "base/time_utils.h"
     23 #include "driver/compiler_driver.h"
     24 #include "nodes.h"
     25 #include "optimization.h"
     26 
     27 namespace art {
     28 
     29 // General description of instruction scheduling.
     30 //
     31 // This pass tries to improve the quality of the generated code by reordering
     32 // instructions in the graph to avoid execution delays caused by execution
     33 // dependencies.
     34 // Currently, scheduling is performed at the block level, so no `HInstruction`
     35 // ever leaves its block in this pass.
     36 //
     37 // The scheduling process iterates through blocks in the graph. For blocks that
     38 // we can and want to schedule:
     39 // 1) Build a dependency graph for instructions.
     40 //    It includes data dependencies (inputs/uses), but also environment
     41 //    dependencies and side-effect dependencies.
     42 // 2) Schedule the dependency graph.
     43 //    This is a topological sort of the dependency graph, using heuristics to
     44 //    decide what node to scheduler first when there are multiple candidates.
     45 //
     46 // A few factors impacting the quality of the scheduling are:
     47 // - The heuristics used to decide what node to schedule in the topological sort
     48 //   when there are multiple valid candidates. There is a wide range of
     49 //   complexity possible here, going from a simple model only considering
     50 //   latencies, to a super detailed CPU pipeline model.
     51 // - Fewer dependencies in the dependency graph give more freedom for the
     52 //   scheduling heuristics. For example de-aliasing can allow possibilities for
     53 //   reordering of memory accesses.
     54 // - The level of abstraction of the IR. It is easier to evaluate scheduling for
     55 //   IRs that translate to a single assembly instruction than for IRs
     56 //   that generate multiple assembly instructions or generate different code
     57 //   depending on properties of the IR.
     58 // - Scheduling is performed before register allocation, it is not aware of the
     59 //   impact of moving instructions on register allocation.
     60 //
     61 //
     62 // The scheduling code uses the terms predecessors, successors, and dependencies.
     63 // This can be confusing at times, so here are clarifications.
     64 // These terms are used from the point of view of the program dependency graph. So
     65 // the inputs of an instruction are part of its dependencies, and hence part its
     66 // predecessors. So the uses of an instruction are (part of) its successors.
     67 // (Side-effect dependencies can yield predecessors or successors that are not
     68 // inputs or uses.)
     69 //
     70 // Here is a trivial example. For the Java code:
     71 //
     72 //    int a = 1 + 2;
     73 //
     74 // we would have the instructions
     75 //
     76 //    i1 HIntConstant 1
     77 //    i2 HIntConstant 2
     78 //    i3 HAdd [i1,i2]
     79 //
     80 // `i1` and `i2` are predecessors of `i3`.
     81 // `i3` is a successor of `i1` and a successor of `i2`.
     82 // In a scheduling graph for this code we would have three nodes `n1`, `n2`,
     83 // and `n3` (respectively for instructions `i1`, `i1`, and `i3`).
     84 // Conceptually the program dependency graph for this would contain two edges
     85 //
     86 //    n1 -> n3
     87 //    n2 -> n3
     88 //
     89 // Since we schedule backwards (starting from the last instruction in each basic
     90 // block), the implementation of nodes keeps a list of pointers their
     91 // predecessors. So `n3` would keep pointers to its predecessors `n1` and `n2`.
     92 //
     93 // Node dependencies are also referred to from the program dependency graph
     94 // point of view: we say that node `B` immediately depends on `A` if there is an
     95 // edge from `A` to `B` in the program dependency graph. `A` is a predecessor of
     96 // `B`, `B` is a successor of `A`. In the example above `n3` depends on `n1` and
     97 // `n2`.
     98 // Since nodes in the scheduling graph keep a list of their predecessors, node
     99 // `B` will have a pointer to its predecessor `A`.
    100 // As we schedule backwards, `B` will be selected for scheduling before `A` is.
    101 //
    102 // So the scheduling for the example above could happen as follow
    103 //
    104 //    |---------------------------+------------------------|
    105 //    | candidates for scheduling | instructions scheduled |
    106 //    | --------------------------+------------------------|
    107 //
    108 // The only node without successors is `n3`, so it is the only initial
    109 // candidate.
    110 //
    111 //    | n3                        | (none)                 |
    112 //
    113 // We schedule `n3` as the last (and only) instruction. All its predecessors
    114 // that do not have any unscheduled successors become candidate. That is, `n1`
    115 // and `n2` become candidates.
    116 //
    117 //    | n1, n2                    | n3                     |
    118 //
    119 // One of the candidates is selected. In practice this is where scheduling
    120 // heuristics kick in, to decide which of the candidates should be selected.
    121 // In this example, let it be `n1`. It is scheduled before previously scheduled
    122 // nodes (in program order). There are no other nodes to add to the list of
    123 // candidates.
    124 //
    125 //    | n2                        | n1                     |
    126 //    |                           | n3                     |
    127 //
    128 // The only candidate available for scheduling is `n2`. Schedule it before
    129 // (in program order) the previously scheduled nodes.
    130 //
    131 //    | (none)                    | n2                     |
    132 //    |                           | n1                     |
    133 //    |                           | n3                     |
    134 //    |---------------------------+------------------------|
    135 //
    136 // So finally the instructions will be executed in the order `i2`, `i1`, and `i3`.
    137 // In this trivial example, it does not matter which of `i1` and `i2` is
    138 // scheduled first since they are constants. However the same process would
    139 // apply if `i1` and `i2` were actual operations (for example `HMul` and `HDiv`).
    140 
    141 // Set to true to have instruction scheduling dump scheduling graphs to the file
    142 // `scheduling_graphs.dot`. See `SchedulingGraph::DumpAsDotGraph()`.
    143 static constexpr bool kDumpDotSchedulingGraphs = false;
    144 
    145 // Typically used as a default instruction latency.
    146 static constexpr uint32_t kGenericInstructionLatency = 1;
    147 
    148 class HScheduler;
    149 
    150 /**
    151  * A node representing an `HInstruction` in the `SchedulingGraph`.
    152  */
    153 class SchedulingNode : public ArenaObject<kArenaAllocScheduler> {
    154  public:
    155   SchedulingNode(HInstruction* instr, ArenaAllocator* arena, bool is_scheduling_barrier)
    156       : latency_(0),
    157         internal_latency_(0),
    158         critical_path_(0),
    159         instruction_(instr),
    160         is_scheduling_barrier_(is_scheduling_barrier),
    161         data_predecessors_(arena->Adapter(kArenaAllocScheduler)),
    162         other_predecessors_(arena->Adapter(kArenaAllocScheduler)),
    163         num_unscheduled_successors_(0) {
    164     data_predecessors_.reserve(kPreallocatedPredecessors);
    165   }
    166 
    167   void AddDataPredecessor(SchedulingNode* predecessor) {
    168     data_predecessors_.push_back(predecessor);
    169     predecessor->num_unscheduled_successors_++;
    170   }
    171 
    172   void AddOtherPredecessor(SchedulingNode* predecessor) {
    173     other_predecessors_.push_back(predecessor);
    174     predecessor->num_unscheduled_successors_++;
    175   }
    176 
    177   void DecrementNumberOfUnscheduledSuccessors() {
    178     num_unscheduled_successors_--;
    179   }
    180 
    181   void MaybeUpdateCriticalPath(uint32_t other_critical_path) {
    182     critical_path_ = std::max(critical_path_, other_critical_path);
    183   }
    184 
    185   bool HasUnscheduledSuccessors() const {
    186     return num_unscheduled_successors_ != 0;
    187   }
    188 
    189   HInstruction* GetInstruction() const { return instruction_; }
    190   uint32_t GetLatency() const { return latency_; }
    191   void SetLatency(uint32_t latency) { latency_ = latency; }
    192   uint32_t GetInternalLatency() const { return internal_latency_; }
    193   void SetInternalLatency(uint32_t internal_latency) { internal_latency_ = internal_latency; }
    194   uint32_t GetCriticalPath() const { return critical_path_; }
    195   bool IsSchedulingBarrier() const { return is_scheduling_barrier_; }
    196   const ArenaVector<SchedulingNode*>& GetDataPredecessors() const { return data_predecessors_; }
    197   const ArenaVector<SchedulingNode*>& GetOtherPredecessors() const { return other_predecessors_; }
    198 
    199  private:
    200   // The latency of this node. It represents the latency between the moment the
    201   // last instruction for this node has executed to the moment the result
    202   // produced by this node is available to users.
    203   uint32_t latency_;
    204   // This represents the time spent *within* the generated code for this node.
    205   // It should be zero for nodes that only generate a single instruction.
    206   uint32_t internal_latency_;
    207 
    208   // The critical path from this instruction to the end of scheduling. It is
    209   // used by the scheduling heuristics to measure the priority of this instruction.
    210   // It is defined as
    211   //     critical_path_ = latency_ + max((use.internal_latency_ + use.critical_path_) for all uses)
    212   // (Note that here 'uses' is equivalent to 'data successors'. Also see comments in
    213   // `HScheduler::Schedule(SchedulingNode* scheduling_node)`).
    214   uint32_t critical_path_;
    215 
    216   // The instruction that this node represents.
    217   HInstruction* const instruction_;
    218 
    219   // If a node is scheduling barrier, other nodes cannot be scheduled before it.
    220   const bool is_scheduling_barrier_;
    221 
    222   // The lists of predecessors. They cannot be scheduled before this node. Once
    223   // this node is scheduled, we check whether any of its predecessors has become a
    224   // valid candidate for scheduling.
    225   // Predecessors in `data_predecessors_` are data dependencies. Those in
    226   // `other_predecessors_` contain side-effect dependencies, environment
    227   // dependencies, and scheduling barrier dependencies.
    228   ArenaVector<SchedulingNode*> data_predecessors_;
    229   ArenaVector<SchedulingNode*> other_predecessors_;
    230 
    231   // The number of unscheduled successors for this node. This number is
    232   // decremented as successors are scheduled. When it reaches zero this node
    233   // becomes a valid candidate to schedule.
    234   uint32_t num_unscheduled_successors_;
    235 
    236   static constexpr size_t kPreallocatedPredecessors = 4;
    237 };
    238 
    239 /*
    240  * Directed acyclic graph for scheduling.
    241  */
    242 class SchedulingGraph : public ValueObject {
    243  public:
    244   SchedulingGraph(const HScheduler* scheduler, ArenaAllocator* arena)
    245       : scheduler_(scheduler),
    246         arena_(arena),
    247         contains_scheduling_barrier_(false),
    248         nodes_map_(arena_->Adapter(kArenaAllocScheduler)) {}
    249 
    250   SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
    251     SchedulingNode* node = new (arena_) SchedulingNode(instr, arena_, is_scheduling_barrier);
    252     nodes_map_.Insert(std::make_pair(instr, node));
    253     contains_scheduling_barrier_ |= is_scheduling_barrier;
    254     AddDependencies(instr, is_scheduling_barrier);
    255     return node;
    256   }
    257 
    258   void Clear() {
    259     nodes_map_.Clear();
    260     contains_scheduling_barrier_ = false;
    261   }
    262 
    263   SchedulingNode* GetNode(const HInstruction* instr) const {
    264     auto it = nodes_map_.Find(instr);
    265     if (it == nodes_map_.end()) {
    266       return nullptr;
    267     } else {
    268       return it->second;
    269     }
    270   }
    271 
    272   bool IsSchedulingBarrier(const HInstruction* instruction) const;
    273 
    274   bool HasImmediateDataDependency(const SchedulingNode* node, const SchedulingNode* other) const;
    275   bool HasImmediateDataDependency(const HInstruction* node, const HInstruction* other) const;
    276   bool HasImmediateOtherDependency(const SchedulingNode* node, const SchedulingNode* other) const;
    277   bool HasImmediateOtherDependency(const HInstruction* node, const HInstruction* other) const;
    278 
    279   size_t Size() const {
    280     return nodes_map_.Size();
    281   }
    282 
    283   // Dump the scheduling graph, in dot file format, appending it to the file
    284   // `scheduling_graphs.dot`.
    285   void DumpAsDotGraph(const std::string& description,
    286                       const ArenaVector<SchedulingNode*>& initial_candidates);
    287 
    288  protected:
    289   void AddDependency(SchedulingNode* node, SchedulingNode* dependency, bool is_data_dependency);
    290   void AddDataDependency(SchedulingNode* node, SchedulingNode* dependency) {
    291     AddDependency(node, dependency, /*is_data_dependency*/true);
    292   }
    293   void AddOtherDependency(SchedulingNode* node, SchedulingNode* dependency) {
    294     AddDependency(node, dependency, /*is_data_dependency*/false);
    295   }
    296 
    297   // Add dependencies nodes for the given `HInstruction`: inputs, environments, and side-effects.
    298   void AddDependencies(HInstruction* instruction, bool is_scheduling_barrier = false);
    299 
    300   const HScheduler* const scheduler_;
    301 
    302   ArenaAllocator* const arena_;
    303 
    304   bool contains_scheduling_barrier_;
    305 
    306   ArenaHashMap<const HInstruction*, SchedulingNode*> nodes_map_;
    307 };
    308 
    309 /*
    310  * The visitors derived from this base class are used by schedulers to evaluate
    311  * the latencies of `HInstruction`s.
    312  */
    313 class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
    314  public:
    315   // This class and its sub-classes will never be used to drive a visit of an
    316   // `HGraph` but only to visit `HInstructions` one at a time, so we do not need
    317   // to pass a valid graph to `HGraphDelegateVisitor()`.
    318   SchedulingLatencyVisitor()
    319       : HGraphDelegateVisitor(nullptr),
    320         last_visited_latency_(0),
    321         last_visited_internal_latency_(0) {}
    322 
    323   void VisitInstruction(HInstruction* instruction) OVERRIDE {
    324     LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". "
    325         "Architecture-specific scheduling latency visitors must handle all instructions"
    326         " (potentially by overriding the generic `VisitInstruction()`.";
    327     UNREACHABLE();
    328   }
    329 
    330   void Visit(HInstruction* instruction) {
    331     instruction->Accept(this);
    332   }
    333 
    334   void CalculateLatency(SchedulingNode* node) {
    335     // By default nodes have no internal latency.
    336     last_visited_internal_latency_ = 0;
    337     Visit(node->GetInstruction());
    338   }
    339 
    340   uint32_t GetLastVisitedLatency() const { return last_visited_latency_; }
    341   uint32_t GetLastVisitedInternalLatency() const { return last_visited_internal_latency_; }
    342 
    343  protected:
    344   // The latency of the most recent visited SchedulingNode.
    345   // This is for reporting the latency value to the user of this visitor.
    346   uint32_t last_visited_latency_;
    347   // This represents the time spent *within* the generated code for the most recent visited
    348   // SchedulingNode. This is for reporting the internal latency value to the user of this visitor.
    349   uint32_t last_visited_internal_latency_;
    350 };
    351 
    352 class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
    353  public:
    354   virtual SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
    355                                                  const SchedulingGraph& graph) = 0;
    356   virtual ~SchedulingNodeSelector() {}
    357  protected:
    358   static void DeleteNodeAtIndex(ArenaVector<SchedulingNode*>* nodes, size_t index) {
    359     (*nodes)[index] = nodes->back();
    360     nodes->pop_back();
    361   }
    362 };
    363 
    364 /*
    365  * Select a `SchedulingNode` at random within the candidates.
    366  */
    367 class RandomSchedulingNodeSelector : public SchedulingNodeSelector {
    368  public:
    369   explicit RandomSchedulingNodeSelector() : seed_(0) {
    370     seed_  = static_cast<uint32_t>(NanoTime());
    371     srand(seed_);
    372   }
    373 
    374   SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
    375                                          const SchedulingGraph& graph) OVERRIDE {
    376     UNUSED(graph);
    377     DCHECK(!nodes->empty());
    378     size_t select = rand_r(&seed_) % nodes->size();
    379     SchedulingNode* select_node = (*nodes)[select];
    380     DeleteNodeAtIndex(nodes, select);
    381     return select_node;
    382   }
    383 
    384   uint32_t seed_;
    385 };
    386 
    387 /*
    388  * Select a `SchedulingNode` according to critical path information,
    389  * with heuristics to favor certain instruction patterns like materialized condition.
    390  */
    391 class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
    392  public:
    393   CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
    394 
    395   SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
    396                                          const SchedulingGraph& graph) OVERRIDE;
    397 
    398  protected:
    399   SchedulingNode* GetHigherPrioritySchedulingNode(SchedulingNode* candidate,
    400                                                   SchedulingNode* check) const;
    401 
    402   SchedulingNode* SelectMaterializedCondition(ArenaVector<SchedulingNode*>* nodes,
    403                                                const SchedulingGraph& graph) const;
    404 
    405  private:
    406   const SchedulingNode* prev_select_;
    407 };
    408 
    409 class HScheduler {
    410  public:
    411   HScheduler(ArenaAllocator* arena,
    412              SchedulingLatencyVisitor* latency_visitor,
    413              SchedulingNodeSelector* selector)
    414       : arena_(arena),
    415         latency_visitor_(latency_visitor),
    416         selector_(selector),
    417         only_optimize_loop_blocks_(true),
    418         scheduling_graph_(this, arena),
    419         cursor_(nullptr),
    420         candidates_(arena_->Adapter(kArenaAllocScheduler)) {}
    421   virtual ~HScheduler() {}
    422 
    423   void Schedule(HGraph* graph);
    424 
    425   void SetOnlyOptimizeLoopBlocks(bool loop_only) { only_optimize_loop_blocks_ = loop_only; }
    426 
    427   // Instructions can not be rescheduled across a scheduling barrier.
    428   virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
    429 
    430  protected:
    431   void Schedule(HBasicBlock* block);
    432   void Schedule(SchedulingNode* scheduling_node);
    433   void Schedule(HInstruction* instruction);
    434 
    435   // Any instruction returning `false` via this method will prevent its
    436   // containing basic block from being scheduled.
    437   // This method is used to restrict scheduling to instructions that we know are
    438   // safe to handle.
    439   virtual bool IsSchedulable(const HInstruction* instruction) const;
    440   bool IsSchedulable(const HBasicBlock* block) const;
    441 
    442   void CalculateLatency(SchedulingNode* node) {
    443     latency_visitor_->CalculateLatency(node);
    444     node->SetLatency(latency_visitor_->GetLastVisitedLatency());
    445     node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
    446   }
    447 
    448   ArenaAllocator* const arena_;
    449   SchedulingLatencyVisitor* const latency_visitor_;
    450   SchedulingNodeSelector* const selector_;
    451   bool only_optimize_loop_blocks_;
    452 
    453   // We instantiate the members below as part of this class to avoid
    454   // instantiating them locally for every chunk scheduled.
    455   SchedulingGraph scheduling_graph_;
    456   // A pointer indicating where the next instruction to be scheduled will be inserted.
    457   HInstruction* cursor_;
    458   // The list of candidates for scheduling. A node becomes a candidate when all
    459   // its predecessors have been scheduled.
    460   ArenaVector<SchedulingNode*> candidates_;
    461 
    462  private:
    463   DISALLOW_COPY_AND_ASSIGN(HScheduler);
    464 };
    465 
    466 inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction) const {
    467   return scheduler_->IsSchedulingBarrier(instruction);
    468 }
    469 
    470 class HInstructionScheduling : public HOptimization {
    471  public:
    472   HInstructionScheduling(HGraph* graph, InstructionSet instruction_set)
    473       : HOptimization(graph, kInstructionScheduling),
    474         instruction_set_(instruction_set) {}
    475 
    476   void Run() {
    477     Run(/*only_optimize_loop_blocks*/ true, /*schedule_randomly*/ false);
    478   }
    479   void Run(bool only_optimize_loop_blocks, bool schedule_randomly);
    480 
    481   static constexpr const char* kInstructionScheduling = "scheduler";
    482 
    483   const InstructionSet instruction_set_;
    484 
    485  private:
    486   DISALLOW_COPY_AND_ASSIGN(HInstructionScheduling);
    487 };
    488 
    489 }  // namespace art
    490 
    491 #endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_H_
    492