1 /* 2 * Copyright (C) 2016 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_H_ 18 #define ART_COMPILER_OPTIMIZING_SCHEDULER_H_ 19 20 #include <fstream> 21 22 #include "base/scoped_arena_allocator.h" 23 #include "base/scoped_arena_containers.h" 24 #include "base/time_utils.h" 25 #include "code_generator.h" 26 #include "driver/compiler_driver.h" 27 #include "load_store_analysis.h" 28 #include "nodes.h" 29 #include "optimization.h" 30 31 namespace art { 32 33 // General description of instruction scheduling. 34 // 35 // This pass tries to improve the quality of the generated code by reordering 36 // instructions in the graph to avoid execution delays caused by execution 37 // dependencies. 38 // Currently, scheduling is performed at the block level, so no `HInstruction` 39 // ever leaves its block in this pass. 40 // 41 // The scheduling process iterates through blocks in the graph. For blocks that 42 // we can and want to schedule: 43 // 1) Build a dependency graph for instructions. 44 // It includes data dependencies (inputs/uses), but also environment 45 // dependencies and side-effect dependencies. 46 // 2) Schedule the dependency graph. 47 // This is a topological sort of the dependency graph, using heuristics to 48 // decide what node to scheduler first when there are multiple candidates. 49 // 50 // A few factors impacting the quality of the scheduling are: 51 // - The heuristics used to decide what node to schedule in the topological sort 52 // when there are multiple valid candidates. There is a wide range of 53 // complexity possible here, going from a simple model only considering 54 // latencies, to a super detailed CPU pipeline model. 55 // - Fewer dependencies in the dependency graph give more freedom for the 56 // scheduling heuristics. For example de-aliasing can allow possibilities for 57 // reordering of memory accesses. 58 // - The level of abstraction of the IR. It is easier to evaluate scheduling for 59 // IRs that translate to a single assembly instruction than for IRs 60 // that generate multiple assembly instructions or generate different code 61 // depending on properties of the IR. 62 // - Scheduling is performed before register allocation, it is not aware of the 63 // impact of moving instructions on register allocation. 64 // 65 // 66 // The scheduling code uses the terms predecessors, successors, and dependencies. 67 // This can be confusing at times, so here are clarifications. 68 // These terms are used from the point of view of the program dependency graph. So 69 // the inputs of an instruction are part of its dependencies, and hence part its 70 // predecessors. So the uses of an instruction are (part of) its successors. 71 // (Side-effect dependencies can yield predecessors or successors that are not 72 // inputs or uses.) 73 // 74 // Here is a trivial example. For the Java code: 75 // 76 // int a = 1 + 2; 77 // 78 // we would have the instructions 79 // 80 // i1 HIntConstant 1 81 // i2 HIntConstant 2 82 // i3 HAdd [i1,i2] 83 // 84 // `i1` and `i2` are predecessors of `i3`. 85 // `i3` is a successor of `i1` and a successor of `i2`. 86 // In a scheduling graph for this code we would have three nodes `n1`, `n2`, 87 // and `n3` (respectively for instructions `i1`, `i1`, and `i3`). 88 // Conceptually the program dependency graph for this would contain two edges 89 // 90 // n1 -> n3 91 // n2 -> n3 92 // 93 // Since we schedule backwards (starting from the last instruction in each basic 94 // block), the implementation of nodes keeps a list of pointers their 95 // predecessors. So `n3` would keep pointers to its predecessors `n1` and `n2`. 96 // 97 // Node dependencies are also referred to from the program dependency graph 98 // point of view: we say that node `B` immediately depends on `A` if there is an 99 // edge from `A` to `B` in the program dependency graph. `A` is a predecessor of 100 // `B`, `B` is a successor of `A`. In the example above `n3` depends on `n1` and 101 // `n2`. 102 // Since nodes in the scheduling graph keep a list of their predecessors, node 103 // `B` will have a pointer to its predecessor `A`. 104 // As we schedule backwards, `B` will be selected for scheduling before `A` is. 105 // 106 // So the scheduling for the example above could happen as follow 107 // 108 // |---------------------------+------------------------| 109 // | candidates for scheduling | instructions scheduled | 110 // | --------------------------+------------------------| 111 // 112 // The only node without successors is `n3`, so it is the only initial 113 // candidate. 114 // 115 // | n3 | (none) | 116 // 117 // We schedule `n3` as the last (and only) instruction. All its predecessors 118 // that do not have any unscheduled successors become candidate. That is, `n1` 119 // and `n2` become candidates. 120 // 121 // | n1, n2 | n3 | 122 // 123 // One of the candidates is selected. In practice this is where scheduling 124 // heuristics kick in, to decide which of the candidates should be selected. 125 // In this example, let it be `n1`. It is scheduled before previously scheduled 126 // nodes (in program order). There are no other nodes to add to the list of 127 // candidates. 128 // 129 // | n2 | n1 | 130 // | | n3 | 131 // 132 // The only candidate available for scheduling is `n2`. Schedule it before 133 // (in program order) the previously scheduled nodes. 134 // 135 // | (none) | n2 | 136 // | | n1 | 137 // | | n3 | 138 // |---------------------------+------------------------| 139 // 140 // So finally the instructions will be executed in the order `i2`, `i1`, and `i3`. 141 // In this trivial example, it does not matter which of `i1` and `i2` is 142 // scheduled first since they are constants. However the same process would 143 // apply if `i1` and `i2` were actual operations (for example `HMul` and `HDiv`). 144 145 // Set to true to have instruction scheduling dump scheduling graphs to the file 146 // `scheduling_graphs.dot`. See `SchedulingGraph::DumpAsDotGraph()`. 147 static constexpr bool kDumpDotSchedulingGraphs = false; 148 149 // Typically used as a default instruction latency. 150 static constexpr uint32_t kGenericInstructionLatency = 1; 151 152 class HScheduler; 153 154 /** 155 * A node representing an `HInstruction` in the `SchedulingGraph`. 156 */ 157 class SchedulingNode : public DeletableArenaObject<kArenaAllocScheduler> { 158 public: 159 SchedulingNode(HInstruction* instr, ScopedArenaAllocator* allocator, bool is_scheduling_barrier) 160 : latency_(0), 161 internal_latency_(0), 162 critical_path_(0), 163 instruction_(instr), 164 is_scheduling_barrier_(is_scheduling_barrier), 165 data_predecessors_(allocator->Adapter(kArenaAllocScheduler)), 166 other_predecessors_(allocator->Adapter(kArenaAllocScheduler)), 167 num_unscheduled_successors_(0) { 168 data_predecessors_.reserve(kPreallocatedPredecessors); 169 } 170 171 void AddDataPredecessor(SchedulingNode* predecessor) { 172 data_predecessors_.push_back(predecessor); 173 predecessor->num_unscheduled_successors_++; 174 } 175 176 const ScopedArenaVector<SchedulingNode*>& GetDataPredecessors() const { 177 return data_predecessors_; 178 } 179 180 void AddOtherPredecessor(SchedulingNode* predecessor) { 181 other_predecessors_.push_back(predecessor); 182 predecessor->num_unscheduled_successors_++; 183 } 184 185 const ScopedArenaVector<SchedulingNode*>& GetOtherPredecessors() const { 186 return other_predecessors_; 187 } 188 189 void DecrementNumberOfUnscheduledSuccessors() { 190 num_unscheduled_successors_--; 191 } 192 193 void MaybeUpdateCriticalPath(uint32_t other_critical_path) { 194 critical_path_ = std::max(critical_path_, other_critical_path); 195 } 196 197 bool HasUnscheduledSuccessors() const { 198 return num_unscheduled_successors_ != 0; 199 } 200 201 HInstruction* GetInstruction() const { return instruction_; } 202 uint32_t GetLatency() const { return latency_; } 203 void SetLatency(uint32_t latency) { latency_ = latency; } 204 uint32_t GetInternalLatency() const { return internal_latency_; } 205 void SetInternalLatency(uint32_t internal_latency) { internal_latency_ = internal_latency; } 206 uint32_t GetCriticalPath() const { return critical_path_; } 207 bool IsSchedulingBarrier() const { return is_scheduling_barrier_; } 208 209 private: 210 // The latency of this node. It represents the latency between the moment the 211 // last instruction for this node has executed to the moment the result 212 // produced by this node is available to users. 213 uint32_t latency_; 214 // This represents the time spent *within* the generated code for this node. 215 // It should be zero for nodes that only generate a single instruction. 216 uint32_t internal_latency_; 217 218 // The critical path from this instruction to the end of scheduling. It is 219 // used by the scheduling heuristics to measure the priority of this instruction. 220 // It is defined as 221 // critical_path_ = latency_ + max((use.internal_latency_ + use.critical_path_) for all uses) 222 // (Note that here 'uses' is equivalent to 'data successors'. Also see comments in 223 // `HScheduler::Schedule(SchedulingNode* scheduling_node)`). 224 uint32_t critical_path_; 225 226 // The instruction that this node represents. 227 HInstruction* const instruction_; 228 229 // If a node is scheduling barrier, other nodes cannot be scheduled before it. 230 const bool is_scheduling_barrier_; 231 232 // The lists of predecessors. They cannot be scheduled before this node. Once 233 // this node is scheduled, we check whether any of its predecessors has become a 234 // valid candidate for scheduling. 235 // Predecessors in `data_predecessors_` are data dependencies. Those in 236 // `other_predecessors_` contain side-effect dependencies, environment 237 // dependencies, and scheduling barrier dependencies. 238 ScopedArenaVector<SchedulingNode*> data_predecessors_; 239 ScopedArenaVector<SchedulingNode*> other_predecessors_; 240 241 // The number of unscheduled successors for this node. This number is 242 // decremented as successors are scheduled. When it reaches zero this node 243 // becomes a valid candidate to schedule. 244 uint32_t num_unscheduled_successors_; 245 246 static constexpr size_t kPreallocatedPredecessors = 4; 247 }; 248 249 /* 250 * Directed acyclic graph for scheduling. 251 */ 252 class SchedulingGraph : public ValueObject { 253 public: 254 SchedulingGraph(const HScheduler* scheduler, ScopedArenaAllocator* allocator) 255 : scheduler_(scheduler), 256 allocator_(allocator), 257 contains_scheduling_barrier_(false), 258 nodes_map_(allocator_->Adapter(kArenaAllocScheduler)), 259 heap_location_collector_(nullptr) {} 260 261 SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) { 262 std::unique_ptr<SchedulingNode> node( 263 new (allocator_) SchedulingNode(instr, allocator_, is_scheduling_barrier)); 264 SchedulingNode* result = node.get(); 265 nodes_map_.Insert(std::make_pair(instr, std::move(node))); 266 contains_scheduling_barrier_ |= is_scheduling_barrier; 267 AddDependencies(instr, is_scheduling_barrier); 268 return result; 269 } 270 271 void Clear() { 272 nodes_map_.Clear(); 273 contains_scheduling_barrier_ = false; 274 } 275 276 void SetHeapLocationCollector(const HeapLocationCollector& heap_location_collector) { 277 heap_location_collector_ = &heap_location_collector; 278 } 279 280 SchedulingNode* GetNode(const HInstruction* instr) const { 281 auto it = nodes_map_.Find(instr); 282 if (it == nodes_map_.end()) { 283 return nullptr; 284 } else { 285 return it->second.get(); 286 } 287 } 288 289 bool IsSchedulingBarrier(const HInstruction* instruction) const; 290 291 bool HasImmediateDataDependency(const SchedulingNode* node, const SchedulingNode* other) const; 292 bool HasImmediateDataDependency(const HInstruction* node, const HInstruction* other) const; 293 bool HasImmediateOtherDependency(const SchedulingNode* node, const SchedulingNode* other) const; 294 bool HasImmediateOtherDependency(const HInstruction* node, const HInstruction* other) const; 295 296 size_t Size() const { 297 return nodes_map_.Size(); 298 } 299 300 // Dump the scheduling graph, in dot file format, appending it to the file 301 // `scheduling_graphs.dot`. 302 void DumpAsDotGraph(const std::string& description, 303 const ScopedArenaVector<SchedulingNode*>& initial_candidates); 304 305 protected: 306 void AddDependency(SchedulingNode* node, SchedulingNode* dependency, bool is_data_dependency); 307 void AddDataDependency(SchedulingNode* node, SchedulingNode* dependency) { 308 AddDependency(node, dependency, /*is_data_dependency*/true); 309 } 310 void AddOtherDependency(SchedulingNode* node, SchedulingNode* dependency) { 311 AddDependency(node, dependency, /*is_data_dependency*/false); 312 } 313 bool HasMemoryDependency(const HInstruction* node, const HInstruction* other) const; 314 bool HasExceptionDependency(const HInstruction* node, const HInstruction* other) const; 315 bool HasSideEffectDependency(const HInstruction* node, const HInstruction* other) const; 316 bool ArrayAccessMayAlias(const HInstruction* node, const HInstruction* other) const; 317 bool FieldAccessMayAlias(const HInstruction* node, const HInstruction* other) const; 318 size_t ArrayAccessHeapLocation(HInstruction* array, HInstruction* index) const; 319 size_t FieldAccessHeapLocation(HInstruction* obj, const FieldInfo* field) const; 320 321 // Add dependencies nodes for the given `HInstruction`: inputs, environments, and side-effects. 322 void AddDependencies(HInstruction* instruction, bool is_scheduling_barrier = false); 323 324 const HScheduler* const scheduler_; 325 326 ScopedArenaAllocator* const allocator_; 327 328 bool contains_scheduling_barrier_; 329 330 ScopedArenaHashMap<const HInstruction*, std::unique_ptr<SchedulingNode>> nodes_map_; 331 332 const HeapLocationCollector* heap_location_collector_; 333 }; 334 335 /* 336 * The visitors derived from this base class are used by schedulers to evaluate 337 * the latencies of `HInstruction`s. 338 */ 339 class SchedulingLatencyVisitor : public HGraphDelegateVisitor { 340 public: 341 // This class and its sub-classes will never be used to drive a visit of an 342 // `HGraph` but only to visit `HInstructions` one at a time, so we do not need 343 // to pass a valid graph to `HGraphDelegateVisitor()`. 344 SchedulingLatencyVisitor() 345 : HGraphDelegateVisitor(nullptr), 346 last_visited_latency_(0), 347 last_visited_internal_latency_(0) {} 348 349 void VisitInstruction(HInstruction* instruction) OVERRIDE { 350 LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". " 351 "Architecture-specific scheduling latency visitors must handle all instructions" 352 " (potentially by overriding the generic `VisitInstruction()`."; 353 UNREACHABLE(); 354 } 355 356 void Visit(HInstruction* instruction) { 357 instruction->Accept(this); 358 } 359 360 void CalculateLatency(SchedulingNode* node) { 361 // By default nodes have no internal latency. 362 last_visited_internal_latency_ = 0; 363 Visit(node->GetInstruction()); 364 } 365 366 uint32_t GetLastVisitedLatency() const { return last_visited_latency_; } 367 uint32_t GetLastVisitedInternalLatency() const { return last_visited_internal_latency_; } 368 369 protected: 370 // The latency of the most recent visited SchedulingNode. 371 // This is for reporting the latency value to the user of this visitor. 372 uint32_t last_visited_latency_; 373 // This represents the time spent *within* the generated code for the most recent visited 374 // SchedulingNode. This is for reporting the internal latency value to the user of this visitor. 375 uint32_t last_visited_internal_latency_; 376 }; 377 378 class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> { 379 public: 380 virtual SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes, 381 const SchedulingGraph& graph) = 0; 382 virtual ~SchedulingNodeSelector() {} 383 protected: 384 static void DeleteNodeAtIndex(ScopedArenaVector<SchedulingNode*>* nodes, size_t index) { 385 (*nodes)[index] = nodes->back(); 386 nodes->pop_back(); 387 } 388 }; 389 390 /* 391 * Select a `SchedulingNode` at random within the candidates. 392 */ 393 class RandomSchedulingNodeSelector : public SchedulingNodeSelector { 394 public: 395 RandomSchedulingNodeSelector() : seed_(0) { 396 seed_ = static_cast<uint32_t>(NanoTime()); 397 srand(seed_); 398 } 399 400 SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes, 401 const SchedulingGraph& graph) OVERRIDE { 402 UNUSED(graph); 403 DCHECK(!nodes->empty()); 404 size_t select = rand_r(&seed_) % nodes->size(); 405 SchedulingNode* select_node = (*nodes)[select]; 406 DeleteNodeAtIndex(nodes, select); 407 return select_node; 408 } 409 410 uint32_t seed_; 411 }; 412 413 /* 414 * Select a `SchedulingNode` according to critical path information, 415 * with heuristics to favor certain instruction patterns like materialized condition. 416 */ 417 class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector { 418 public: 419 CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {} 420 421 SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes, 422 const SchedulingGraph& graph) OVERRIDE; 423 424 protected: 425 SchedulingNode* GetHigherPrioritySchedulingNode(SchedulingNode* candidate, 426 SchedulingNode* check) const; 427 428 SchedulingNode* SelectMaterializedCondition(ScopedArenaVector<SchedulingNode*>* nodes, 429 const SchedulingGraph& graph) const; 430 431 private: 432 const SchedulingNode* prev_select_; 433 }; 434 435 class HScheduler { 436 public: 437 HScheduler(ScopedArenaAllocator* allocator, 438 SchedulingLatencyVisitor* latency_visitor, 439 SchedulingNodeSelector* selector) 440 : allocator_(allocator), 441 latency_visitor_(latency_visitor), 442 selector_(selector), 443 only_optimize_loop_blocks_(true), 444 scheduling_graph_(this, allocator), 445 cursor_(nullptr), 446 candidates_(allocator_->Adapter(kArenaAllocScheduler)) {} 447 virtual ~HScheduler() {} 448 449 void Schedule(HGraph* graph); 450 451 void SetOnlyOptimizeLoopBlocks(bool loop_only) { only_optimize_loop_blocks_ = loop_only; } 452 453 // Instructions can not be rescheduled across a scheduling barrier. 454 virtual bool IsSchedulingBarrier(const HInstruction* instruction) const; 455 456 protected: 457 void Schedule(HBasicBlock* block); 458 void Schedule(SchedulingNode* scheduling_node); 459 void Schedule(HInstruction* instruction); 460 461 // Any instruction returning `false` via this method will prevent its 462 // containing basic block from being scheduled. 463 // This method is used to restrict scheduling to instructions that we know are 464 // safe to handle. 465 // 466 // For newly introduced instructions by default HScheduler::IsSchedulable returns false. 467 // HScheduler${ARCH}::IsSchedulable can be overridden to return true for an instruction (see 468 // scheduler_arm64.h for example) if it is safe to schedule it; in this case one *must* also 469 // look at/update HScheduler${ARCH}::IsSchedulingBarrier for this instruction. 470 virtual bool IsSchedulable(const HInstruction* instruction) const; 471 bool IsSchedulable(const HBasicBlock* block) const; 472 473 void CalculateLatency(SchedulingNode* node) { 474 latency_visitor_->CalculateLatency(node); 475 node->SetLatency(latency_visitor_->GetLastVisitedLatency()); 476 node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency()); 477 } 478 479 ScopedArenaAllocator* const allocator_; 480 SchedulingLatencyVisitor* const latency_visitor_; 481 SchedulingNodeSelector* const selector_; 482 bool only_optimize_loop_blocks_; 483 484 // We instantiate the members below as part of this class to avoid 485 // instantiating them locally for every chunk scheduled. 486 SchedulingGraph scheduling_graph_; 487 // A pointer indicating where the next instruction to be scheduled will be inserted. 488 HInstruction* cursor_; 489 // The list of candidates for scheduling. A node becomes a candidate when all 490 // its predecessors have been scheduled. 491 ScopedArenaVector<SchedulingNode*> candidates_; 492 493 private: 494 DISALLOW_COPY_AND_ASSIGN(HScheduler); 495 }; 496 497 inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction) const { 498 return scheduler_->IsSchedulingBarrier(instruction); 499 } 500 501 class HInstructionScheduling : public HOptimization { 502 public: 503 HInstructionScheduling(HGraph* graph, 504 InstructionSet instruction_set, 505 CodeGenerator* cg = nullptr, 506 const char* name = kInstructionSchedulingPassName) 507 : HOptimization(graph, name), 508 codegen_(cg), 509 instruction_set_(instruction_set) {} 510 511 void Run() { 512 Run(/*only_optimize_loop_blocks*/ true, /*schedule_randomly*/ false); 513 } 514 void Run(bool only_optimize_loop_blocks, bool schedule_randomly); 515 516 static constexpr const char* kInstructionSchedulingPassName = "scheduler"; 517 518 private: 519 CodeGenerator* const codegen_; 520 const InstructionSet instruction_set_; 521 DISALLOW_COPY_AND_ASSIGN(HInstructionScheduling); 522 }; 523 524 } // namespace art 525 526 #endif // ART_COMPILER_OPTIMIZING_SCHEDULER_H_ 527