//===- subzero/src/IceCfg.cpp - Control flow graph implementation --------===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the Cfg class.
///
//===----------------------------------------------------------------------===//

#include "IceCfg.h"

#include "IceAssembler.h"
#include "IceBitVector.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInst.h"
#include "IceInstVarIter.h"
#include "IceInstrumentation.h"
#include "IceLiveness.h"
#include "IceLoopAnalyzer.h"
#include "IceOperand.h"
#include "IceTargetLowering.h"

#include <memory>
#include <utility>

namespace Ice {

Cfg::Cfg(GlobalContext *Ctx, uint32_t SequenceNumber)
    : Allocator(createAllocator()), Ctx(Ctx), SequenceNumber(SequenceNumber),
      VMask(getFlags().getVerbose()), FunctionName(),
      NextInstNumber(Inst::NumberInitial), Live(nullptr) {
  NodeStrings.reset(new StringPool);
  VarStrings.reset(new StringPool);
  CfgLocalAllocatorScope _(this);
  Target = TargetLowering::createLowering(getFlags().getTargetArch(), this);
  VMetadata.reset(new VariablesMetadata(this));
  TargetAssembler = Target->createAssembler();

  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Randomize) {
    // If -randomize-pool-immediates=randomize, create a random number
    // generator to generate a cookie for constant blinding.
    RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_ConstantBlinding,
                              this->SequenceNumber);
    ConstantBlindingCookie =
        (uint32_t)RNG.next((uint64_t)std::numeric_limits<uint32_t>::max() + 1);
  }
}

Cfg::~Cfg() {
  assert(CfgAllocatorTraits::current() == nullptr);
  if (getFlags().getDumpStrings()) {
    OstreamLocker _(Ctx);
    Ostream &Str = Ctx->getStrDump();
    getNodeStrings()->dump(Str);
    getVarStrings()->dump(Str);
  }
}

// Called in the initializer list of Cfg's constructor to create the Allocator
// and set it as TLS before any other member fields are constructed, since they
// may depend on it.
ArenaAllocator *Cfg::createAllocator() {
  ArenaAllocator *Allocator = new ArenaAllocator();
  CfgAllocatorTraits::set_current(Allocator);
  return Allocator;
}

/// Create a string like "foo(i=123:b=9)" indicating the function name, number
/// of high-level instructions, and number of basic blocks. This string is only
/// used for dumping and other diagnostics, and the idea is that given a set of
/// functions to debug a problem on, it's easy to find the smallest or simplest
/// function to attack. Note that the counts may change somewhat depending on
/// what point it is called during the translation passes.
std::string Cfg::getFunctionNameAndSize() const {
  if (!BuildDefs::dump())
    return getFunctionName().toString();
  SizeT NodeCount = 0;
  SizeT InstCount = 0;
  for (CfgNode *Node : getNodes()) {
    ++NodeCount;
    // Note: deleted instructions are *not* ignored.
    InstCount += Node->getPhis().size();
    for (Inst &I : Node->getInsts()) {
      if (!llvm::isa<InstTarget>(&I))
        ++InstCount;
    }
  }
  return getFunctionName().toString() + "(i=" + std::to_string(InstCount) +
         ":b=" + std::to_string(NodeCount) + ")";
}

void Cfg::setError(const std::string &Message) {
  HasError = true;
  ErrorMessage = Message;
}

CfgNode *Cfg::makeNode() {
  SizeT LabelIndex = Nodes.size();
  auto *Node = CfgNode::create(this, LabelIndex);
  Nodes.push_back(Node);
  return Node;
}

void Cfg::swapNodes(NodeList &NewNodes) {
  assert(Nodes.size() == NewNodes.size());
  Nodes.swap(NewNodes);
  for (SizeT I = 0, NumNodes = getNumNodes(); I < NumNodes; ++I)
    Nodes[I]->resetIndex(I);
}

template <> Variable *Cfg::makeVariable<Variable>(Type Ty) {
  SizeT Index = Variables.size();
  Variable *Var;
  if (Target->shouldSplitToVariableVecOn32(Ty)) {
    Var = VariableVecOn32::create(this, Ty, Index);
  } else if (Target->shouldSplitToVariable64On32(Ty)) {
    Var = Variable64On32::create(this, Ty, Index);
  } else {
    Var = Variable::create(this, Ty, Index);
  }
  Variables.push_back(Var);
  return Var;
}

void Cfg::addArg(Variable *Arg) {
  Arg->setIsArg();
  Args.push_back(Arg);
}

void Cfg::addImplicitArg(Variable *Arg) {
  Arg->setIsImplicitArg();
  ImplicitArgs.push_back(Arg);
}

// Returns whether the stack frame layout has been computed yet. This is used
// for dumping the stack frame location of Variables.
bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); }

namespace {
constexpr char BlockNameGlobalPrefix[] = ".L$profiler$block_name$";
constexpr char BlockStatsGlobalPrefix[] = ".L$profiler$block_info$";
} // end of anonymous namespace

void Cfg::createNodeNameDeclaration(const std::string &NodeAsmName) {
  auto *Var = VariableDeclaration::create(GlobalInits.get());
  Var->setName(Ctx, BlockNameGlobalPrefix + NodeAsmName);
  Var->setIsConstant(true);
  Var->addInitializer(VariableDeclaration::DataInitializer::create(
      GlobalInits.get(), NodeAsmName.data(), NodeAsmName.size() + 1));
  const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
  Var->setAlignment(Int64ByteSize); // Wasteful, 32-bit could use 4 bytes.
  GlobalInits->push_back(Var);
}

void Cfg::createBlockProfilingInfoDeclaration(
    const std::string &NodeAsmName, VariableDeclaration *NodeNameDeclaration) {
  auto *Var = VariableDeclaration::create(GlobalInits.get());
  Var->setName(Ctx, BlockStatsGlobalPrefix + NodeAsmName);
  const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
  Var->addInitializer(VariableDeclaration::ZeroInitializer::create(
      GlobalInits.get(), Int64ByteSize));

  const RelocOffsetT NodeNameDeclarationOffset = 0;
  Var->addInitializer(VariableDeclaration::RelocInitializer::create(
      GlobalInits.get(), NodeNameDeclaration,
      {RelocOffset::create(Ctx, NodeNameDeclarationOffset)}));
  Var->setAlignment(Int64ByteSize);
  GlobalInits->push_back(Var);
}

void Cfg::profileBlocks() {
  if (GlobalInits == nullptr)
    GlobalInits.reset(new VariableDeclarationList());

  for (CfgNode *Node : Nodes) {
    const std::string NodeAsmName = Node->getAsmName();
    createNodeNameDeclaration(NodeAsmName);
    createBlockProfilingInfoDeclaration(NodeAsmName, GlobalInits->back());
    Node->profileExecutionCount(GlobalInits->back());
  }
}

bool Cfg::isProfileGlobal(const VariableDeclaration &Var) {
  if (!Var.getName().hasStdString())
    return false;
  return Var.getName().toString().find(BlockStatsGlobalPrefix) == 0;
}

void Cfg::addCallToProfileSummary() {
  // The call(s) to __Sz_profile_summary are added by the profiler in functions
  // that cause the program to exit. This function is defined in
  // runtime/szrt_profiler.c.
  Constant *ProfileSummarySym =
      Ctx->getConstantExternSym(Ctx->getGlobalString("__Sz_profile_summary"));
  constexpr SizeT NumArgs = 0;
  constexpr Variable *Void = nullptr;
  constexpr bool HasTailCall = false;
  auto *Call =
      InstCall::create(this, NumArgs, Void, ProfileSummarySym, HasTailCall);
  getEntryNode()->getInsts().push_front(Call);
}

void Cfg::translate() {
  if (hasError())
    return;
  // Cache the possibly-overridden optimization level once translation begins.
  // It would be nicer to do this in the constructor, but we need to wait until
  // after setFunctionName() has a chance to be called.
  OptimizationLevel =
      getFlags().matchForceO2(getFunctionName(), getSequenceNumber())
          ? Opt_2
          : getFlags().getOptLevel();
  if (BuildDefs::timers()) {
    if (getFlags().matchTimingFocus(getFunctionName(), getSequenceNumber())) {
      setFocusedTiming();
      getContext()->resetTimer(GlobalContext::TSK_Default);
    }
  }
  if (BuildDefs::dump()) {
    if (isVerbose(IceV_Status) &&
        getFlags().matchTestStatus(getFunctionName(), getSequenceNumber())) {
      getContext()->getStrDump() << ">>>Translating "
                                 << getFunctionNameAndSize()
                                 << " seq=" << getSequenceNumber() << "\n";
    }
  }
  TimerMarker T_func(getContext(), getFunctionName().toStringOrEmpty());
  TimerMarker T(TimerStack::TT_translate, this);

  dump("Initial CFG");

  if (getFlags().getEnableBlockProfile()) {
    profileBlocks();
    // TODO(jpp): this is fragile, at best. Figure out a better way of
    // detecting exit functions.
    if (getFunctionName().toStringOrEmpty() == "exit") {
      addCallToProfileSummary();
    }
    dump("Profiled CFG");
  }

  // Create the Hi and Lo variables where a split was needed.
  for (Variable *Var : Variables) {
    if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Var)) {
      Var64On32->initHiLo(this);
    } else if (auto *VarVecOn32 = llvm::dyn_cast<VariableVecOn32>(Var)) {
      VarVecOn32->initVecElement(this);
    }
  }

  // Instrument the Cfg, e.g. with AddressSanitizer.
  if (!BuildDefs::minimal() && getFlags().getSanitizeAddresses()) {
    getContext()->instrumentFunc(this);
    dump("Instrumented CFG");
  }

  // The set of translation passes and their order are determined by the
  // target.
  getTarget()->translate();

  dump("Final output");
  if (getFocusedTiming()) {
    getContext()->dumpLocalTimers(getFunctionName().toString());
  }
}

void Cfg::fixPhiNodes() {
  for (auto *Node : Nodes) {
    // Fix all the phi edges, since WASM can't tell how to make them correctly
    // at the beginning.
    assert(Node);
    const auto &InEdges = Node->getInEdges();
    for (auto &Instr : Node->getPhis()) {
      auto *Phi = llvm::cast<InstPhi>(&Instr);
      assert(Phi);
      for (SizeT i = 0; i < InEdges.size(); ++i) {
        Phi->setLabel(i, InEdges[i]);
      }
    }
  }
}

void Cfg::computeInOutEdges() {
  // Compute the out-edges.
  for (CfgNode *Node : Nodes) {
    Node->computeSuccessors();
  }

  // Prune any unreachable nodes before computing in-edges.
  SizeT NumNodes = getNumNodes();
  BitVector Reachable(NumNodes);
  BitVector Pending(NumNodes);
  Pending.set(getEntryNode()->getIndex());
  while (true) {
    int Index = Pending.find_first();
    if (Index == -1)
      break;
    Pending.reset(Index);
    Reachable.set(Index);
    CfgNode *Node = Nodes[Index];
    assert(Node->getIndex() == (SizeT)Index);
    for (CfgNode *Succ : Node->getOutEdges()) {
      SizeT SuccIndex = Succ->getIndex();
      if (!Reachable.test(SuccIndex))
        Pending.set(SuccIndex);
    }
  }
  SizeT Dest = 0;
  for (SizeT Source = 0; Source < NumNodes; ++Source) {
    if (Reachable.test(Source)) {
      Nodes[Dest] = Nodes[Source];
      Nodes[Dest]->resetIndex(Dest);
      // Compute the in-edges.
      Nodes[Dest]->computePredecessors();
      ++Dest;
    }
  }
  Nodes.resize(Dest);

  TimerMarker T(TimerStack::TT_phiValidation, this);
  for (CfgNode *Node : Nodes)
    Node->enforcePhiConsistency();
}

void Cfg::renumberInstructions() {
  TimerMarker T(TimerStack::TT_renumberInstructions, this);
  NextInstNumber = Inst::NumberInitial;
  for (CfgNode *Node : Nodes)
    Node->renumberInstructions();
  // Make sure the entry node is the first node and therefore got the lowest
  // instruction numbers, to facilitate live range computation of function
  // arguments. We want to model function arguments as being live on entry to
  // the function, otherwise an argument whose only use is in the first
  // instruction will be assigned a trivial live range and the register
  // allocator will not recognize its live range as overlapping another
  // variable's live range.
  assert(Nodes.empty() || (*Nodes.begin() == getEntryNode()));
}

// placePhiLoads() must be called before placePhiStores().
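//
// A sketch of the two-step phi lowering (temporary names hypothetical): a phi
// such as
//   a = phi [b, %pred1], [c, %pred2]
// is split into an assignment from a compiler temporary at the top of the
// phi's own block (the "load"):
//   a = a_phi
// and assignments to that temporary at the bottom of each predecessor (the
// "stores"):
//   a_phi = b   ; in %pred1
//   a_phi = c   ; in %pred2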
void Cfg::placePhiLoads() {
  TimerMarker T(TimerStack::TT_placePhiLoads, this);
  for (CfgNode *Node : Nodes)
    Node->placePhiLoads();
}

// placePhiStores() must be called after placePhiLoads().
void Cfg::placePhiStores() {
  TimerMarker T(TimerStack::TT_placePhiStores, this);
  for (CfgNode *Node : Nodes)
    Node->placePhiStores();
}

void Cfg::deletePhis() {
  TimerMarker T(TimerStack::TT_deletePhis, this);
  for (CfgNode *Node : Nodes)
    Node->deletePhis();
}

void Cfg::advancedPhiLowering() {
  TimerMarker T(TimerStack::TT_advancedPhiLowering, this);
  // Clear all previously computed live ranges (but not live-in/live-out bit
  // vectors or last-use markers), because the follow-on register allocation is
  // only concerned with live ranges across the newly created blocks.
  for (Variable *Var : Variables) {
    Var->getLiveRange().reset();
  }
  // This splits edges and appends new nodes to the end of the node list. This
  // can invalidate iterators, so don't use an iterator.
  SizeT NumNodes = getNumNodes();
  SizeT NumVars = getNumVariables();
  for (SizeT I = 0; I < NumNodes; ++I)
    Nodes[I]->advancedPhiLowering();

  TimerMarker TT(TimerStack::TT_lowerPhiAssignments, this);
  if (true) {
    // The following code does an in-place update of liveness and live ranges
    // as a result of adding the new phi edge split nodes.
    getLiveness()->initPhiEdgeSplits(Nodes.begin() + NumNodes,
                                     Variables.begin() + NumVars);
    TimerMarker TTT(TimerStack::TT_liveness, this);
    // Iterate over the newly added nodes to add their liveness info.
    for (auto I = Nodes.begin() + NumNodes, E = Nodes.end(); I != E; ++I) {
      InstNumberT FirstInstNum = getNextInstNumber();
      (*I)->renumberInstructions();
      InstNumberT LastInstNum = getNextInstNumber() - 1;
      (*I)->liveness(getLiveness());
      (*I)->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
    }
  } else {
    // The following code does a brute-force recalculation of live ranges as a
    // result of adding the new phi edge split nodes. The liveness calculation
    // is particularly expensive because the new nodes are not yet in a proper
    // topological order and so convergence is slower.
    //
    // This code is kept here for reference and can be temporarily enabled in
    // case the incremental code is under suspicion.
    renumberInstructions();
    liveness(Liveness_Intervals);
    getVMetadata()->init(VMK_All);
  }
  Target->regAlloc(RAK_Phi);
}

// Find a reasonable placement for nodes that have not yet been placed, while
// maintaining the same relative ordering among already placed nodes.
void Cfg::reorderNodes() {
  // TODO(ascull): it would be nice if the switch tests were always followed by
  // the default case to allow for fall through.
  using PlacedList = CfgList<CfgNode *>;
  PlacedList Placed;      // Nodes with relative placement locked down
  PlacedList Unreachable; // Unreachable nodes
  PlacedList::iterator NoPlace = Placed.end();
  // Keep track of where each node has been tentatively placed so that we can
  // manage insertions into the middle.
  CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
  for (CfgNode *Node : Nodes) {
    // The "do ... while(0);" construct is to factor out the --PlaceIndex and
    // assert() statements before moving to the next node.
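    // (A "continue" inside the do-block thus acts like a labeled continue: it
    // skips the rest of the block but still falls through to the shared
    // --PlaceIndex and assert() below.)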
    do {
      if (Node != getEntryNode() && Node->getInEdges().empty()) {
        // The node has essentially been deleted since it is not a successor
        // of any other node.
        Unreachable.push_back(Node);
        PlaceIndex[Node->getIndex()] = Unreachable.end();
        Node->setNeedsPlacement(false);
        continue;
      }
      if (!Node->needsPlacement()) {
        // Add to the end of the Placed list.
        Placed.push_back(Node);
        PlaceIndex[Node->getIndex()] = Placed.end();
        continue;
      }
      Node->setNeedsPlacement(false);
      // Assume for now that the unplaced node is from edge-splitting and
      // therefore has 1 in-edge and 1 out-edge (actually, possibly more than
      // 1 in-edge if the predecessor node was contracted). If this changes in
      // the future, rethink the strategy.
      assert(Node->getInEdges().size() >= 1);
      assert(Node->hasSingleOutEdge());

      // If it's a (non-critical) edge where the successor has a single
      // in-edge, then place it before the successor.
      CfgNode *Succ = Node->getOutEdges().front();
      if (Succ->getInEdges().size() == 1 &&
          PlaceIndex[Succ->getIndex()] != NoPlace) {
        Placed.insert(PlaceIndex[Succ->getIndex()], Node);
        PlaceIndex[Node->getIndex()] = PlaceIndex[Succ->getIndex()];
        continue;
      }

      // Otherwise, place it after the (first) predecessor.
      CfgNode *Pred = Node->getInEdges().front();
      auto PredPosition = PlaceIndex[Pred->getIndex()];
      // It shouldn't be the case that PredPosition==NoPlace, but if that
      // somehow turns out to be true, we just insert Node before
      // PredPosition = NoPlace = Placed.end().
      if (PredPosition != NoPlace)
        ++PredPosition;
      Placed.insert(PredPosition, Node);
      PlaceIndex[Node->getIndex()] = PredPosition;
    } while (0);

    --PlaceIndex[Node->getIndex()];
    assert(*PlaceIndex[Node->getIndex()] == Node);
  }

  // Reorder Nodes according to the built-up lists.
  NodeList Reordered;
  Reordered.reserve(Placed.size() + Unreachable.size());
  for (CfgNode *Node : Placed)
    Reordered.push_back(Node);
  for (CfgNode *Node : Unreachable)
    Reordered.push_back(Node);
  assert(getNumNodes() == Reordered.size());
  swapNodes(Reordered);
}

namespace {
void getRandomPostOrder(CfgNode *Node, BitVector &ToVisit,
                        Ice::NodeList &PostOrder,
                        Ice::RandomNumberGenerator *RNG) {
  assert(ToVisit[Node->getIndex()]);
  ToVisit[Node->getIndex()] = false;
  NodeList Outs = Node->getOutEdges();
  Ice::RandomShuffle(Outs.begin(), Outs.end(),
                     [RNG](int N) { return RNG->next(N); });
  for (CfgNode *Next : Outs) {
    if (ToVisit[Next->getIndex()])
      getRandomPostOrder(Next, ToVisit, PostOrder, RNG);
  }
  PostOrder.push_back(Node);
}
} // end of anonymous namespace

void Cfg::shuffleNodes() {
  if (!getFlags().getReorderBasicBlocks())
    return;

  NodeList ReversedReachable;
  NodeList Unreachable;
  BitVector ToVisit(Nodes.size(), true);
  // Create a random number generator for basic block reordering.
  RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                            RPE_BasicBlockReordering, SequenceNumber);
  // Traverse from the entry node.
  getRandomPostOrder(getEntryNode(), ToVisit, ReversedReachable, &RNG);
  // Collect the unreachable nodes.
  for (CfgNode *Node : Nodes)
    if (ToVisit[Node->getIndex()])
      Unreachable.push_back(Node);
  // Copy the layout list to the Nodes.
  NodeList Shuffled;
  Shuffled.reserve(ReversedReachable.size() + Unreachable.size());
  for (CfgNode *Node : reverse_range(ReversedReachable))
    Shuffled.push_back(Node);
  for (CfgNode *Node : Unreachable)
    Shuffled.push_back(Node);
  assert(Nodes.size() == Shuffled.size());
  swapNodes(Shuffled);

  dump("After basic block shuffling");
}

void Cfg::localCSE(bool AssumeSSA) {
  // Performs basic-block local common-subexpression elimination. If we have
  //   t1 = op b c
  //   t2 = op b c
  // this pass will replace future references to t2 in a basic block by t1.
  // Points to note:
  // 1. Assumes SSA by default. To change this, use -lcse=no-ssa. This is
  //    needed if this pass is moved to a point later in the pipeline. If
  //    variables have a single definition (in the node), CSE can work just on
  //    the basis of an equality compare on instructions (sans Dest). When
  //    variables can be updated (hence, non-SSA), the result of a previous
  //    instruction which used that variable as an operand cannot be reused.
  // 2. Leaves removal of instructions to DCE.
  // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected
  //    to take care of cases not arising from GEP simplification.
  // 4. By default, a single pass is made over each basic block. Control this
  //    with -lcse-max-iters=N.

  TimerMarker T(TimerStack::TT_localCse, this);
  struct VariableHash {
    size_t operator()(const Variable *Var) const { return Var->hashValue(); }
  };

  struct InstHash {
    size_t operator()(const Inst *Instr) const {
      auto Kind = Instr->getKind();
      auto Result =
          std::hash<typename std::underlying_type<Inst::InstKind>::type>()(
              Kind);
      for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
        Result ^= Instr->getSrc(i)->hashValue();
      }
      return Result;
    }
  };
  struct InstEq {
    bool srcEq(const Operand *A, const Operand *B) const {
      if (llvm::isa<Variable>(A) || llvm::isa<Constant>(A))
        return (A == B);
      return false;
    }
    bool operator()(const Inst *InstrA, const Inst *InstrB) const {
      if ((InstrA->getKind() != InstrB->getKind()) ||
          (InstrA->getSrcSize() != InstrB->getSrcSize()))
        return false;

      if (auto *A = llvm::dyn_cast<InstArithmetic>(InstrA)) {
        auto *B = llvm::cast<InstArithmetic>(InstrB);
        // A and B are guaranteed to be of the same 'kind' at this point, so
        // dyn_cast is not needed.
        if (A->getOp() != B->getOp())
          return false;
      }
      // The loop is not entered if the instructions have a different kind or
      // number of operands.
      for (SizeT i = 0; i < InstrA->getSrcSize(); ++i) {
        if (!srcEq(InstrA->getSrc(i), InstrB->getSrc(i)))
          return false;
      }
      return true;
    }
  };

  for (CfgNode *Node : getNodes()) {
    CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
    // Stores currently available instructions.

    CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
    // Combining the above two into a single data structure might consume less
    // memory but would be slower, i.e., a map of Instruction -> Set of
    // Variables.

    CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
    // Maps a variable to the instructions that depend on it, e.g.
    //   a = op1 b c
    //   x = op2 c d
    // will result in the map: b -> {a}, c -> {a, x}, d -> {x}.
    // Not necessary for SSA, as dependencies will never be invalidated, and
    // the container will use minimal memory when left unused.
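    //
    // Illustrative non-SSA invalidation sequence (hypothetical stream):
    //   t1 = add b c   ; "add b c" is recorded in Seen; Dependency: b,c -> {t1}
    //   b  = add d e   ; b is redefined, so "t1 = add b c" is erased from Seen
    //   t2 = add b c   ; no longer a repetition of t1 -- it must be recomputed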

    auto IterCount = getFlags().getLocalCseMaxIterations();

    for (uint32_t i = 0; i < IterCount; ++i) {
      // TODO(manasijm): Stats on IterCount -> performance
      for (Inst &Instr : Node->getInsts()) {
        if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
          continue;
        if (!AssumeSSA) {
          // Invalidate replacements.
          auto Iter = Replacements.find(Instr.getDest());
          if (Iter != Replacements.end()) {
            Replacements.erase(Iter);
          }

          // Invalidate 'seen' instructions whose operands were just updated.
          auto DepIter = Dependency.find(Instr.getDest());
          if (DepIter != Dependency.end()) {
            for (auto *DepInst : DepIter->second) {
              Seen.erase(DepInst);
            }
          }
        }

        // Replace - doing this before checking for repetitions might enable
        // more optimizations.
        for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
          auto *Opnd = Instr.getSrc(i);
          if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
            if (Replacements.find(Var) != Replacements.end()) {
              Instr.replaceSource(i, Replacements[Var]);
            }
          }
        }

        // Check for repetitions.
        auto SeenIter = Seen.find(&Instr);
        if (SeenIter != Seen.end()) { // seen before
          const Inst *Found = *SeenIter;
          Replacements[Instr.getDest()] = Found->getDest();
        } else { // new
          Seen.insert(&Instr);

          if (!AssumeSSA) {
            // Update dependencies.
            for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
              auto *Opnd = Instr.getSrc(i);
              if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
                Dependency[Var].push_back(&Instr);
              }
            }
          }
        }
      }
    }
  }
}

void Cfg::loopInvariantCodeMotion() {
  TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this);
  // Does not introduce new nodes as of now.
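  //
  // A sketch of the transformation (hypothetical IR), with preheader P and
  // loop body B:
  //   B: t = add a b   ; a and b are defined outside the loop
  // becomes
  //   P: t = add a b   ; hoisted so it executes once, before the loop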
  for (auto &Loop : LoopInfo) {
    CfgNode *Header = Loop.Header;
    assert(Header);
    if (Header->getLoopNestDepth() < 1)
      return;
    CfgNode *PreHeader = Loop.PreHeader;
    if (PreHeader == nullptr || PreHeader->getInsts().size() == 0) {
      continue; // try next loop
    }

    auto &Insts = PreHeader->getInsts();
    auto &LastInst = Insts.back();
    Insts.pop_back();

    for (auto *Inst : findLoopInvariantInstructions(Loop.Body)) {
      PreHeader->appendInst(Inst);
    }
    PreHeader->appendInst(&LastInst);
  }
}

CfgVector<Inst *>
Cfg::findLoopInvariantInstructions(const CfgUnorderedSet<SizeT> &Body) {
  CfgUnorderedSet<Inst *> InvariantInsts;
  CfgUnorderedSet<Variable *> InvariantVars;
  for (auto *Var : getArgs()) {
    InvariantVars.insert(Var);
  }
  bool Changed = false;
  do {
    Changed = false;
    for (auto NodeIndex : Body) {
      auto *Node = Nodes[NodeIndex];
      CfgVector<std::reference_wrapper<Inst>> Insts(Node->getInsts().begin(),
                                                    Node->getInsts().end());

      for (auto &InstRef : Insts) {
        auto &Inst = InstRef.get();
        if (Inst.isDeleted() ||
            InvariantInsts.find(&Inst) != InvariantInsts.end())
          continue;
        switch (Inst.getKind()) {
        case Inst::InstKind::Alloca:
        case Inst::InstKind::Br:
        case Inst::InstKind::Ret:
        case Inst::InstKind::Phi:
        case Inst::InstKind::Call:
        case Inst::InstKind::IntrinsicCall:
        case Inst::InstKind::Load:
        case Inst::InstKind::Store:
        case Inst::InstKind::Switch:
          continue;
        default:
          break;
        }

        bool IsInvariant = true;
        for (SizeT i = 0; i < Inst.getSrcSize(); ++i) {
          if (auto *Var = llvm::dyn_cast<Variable>(Inst.getSrc(i))) {
            if (InvariantVars.find(Var) == InvariantVars.end()) {
              IsInvariant = false;
            }
          }
        }
        if (IsInvariant) {
          Changed = true;
          InvariantInsts.insert(&Inst);
          Node->getInsts().remove(Inst);
          if (Inst.getDest() != nullptr) {
            InvariantVars.insert(Inst.getDest());
          }
        }
      }
    }
  } while (Changed);

  CfgVector<Inst *> InstVector(InvariantInsts.begin(), InvariantInsts.end());
  std::sort(InstVector.begin(), InstVector.end(),
            [](Inst *A, Inst *B) { return A->getNumber() < B->getNumber(); });
  return InstVector;
}

void Cfg::shortCircuitJumps() {
  // Split nodes whenever an early jump is possible.
  // __N :
  //   a = <something>
  //   Instruction 1 without side effect
  //   ... b = <something> ...
  //   Instruction N without side effect
  //   t1 = or a b
  //   br t1 __X __Y
  //
  // is transformed into:
  // __N :
  //   a = <something>
  //   br a __X __N_ext
  //
  // __N_ext :
  //   Instruction 1 without side effect
  //   ... b = <something> ...
  //   Instruction N without side effect
  //   br b __X __Y
  // (Similar logic for AND, jump to false instead of true target.)
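  //
  // Sketched AND variant of the same split:
  // __N :
  //   a = <something>
  //   ...
  //   t1 = and a b
  //   br t1 __X __Y
  // becomes:
  // __N :
  //   a = <something>
  //   br a __N_ext __Y
  //
  // __N_ext :
  //   ...
  //   br b __X __Y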

  TimerMarker T(TimerStack::TT_shortCircuit, this);
  getVMetadata()->init(VMK_Uses);
  auto NodeStack = this->getNodes();
  CfgUnorderedMap<SizeT, CfgVector<CfgNode *>> Splits;
  while (!NodeStack.empty()) {
    auto *Node = NodeStack.back();
    NodeStack.pop_back();
    auto NewNode = Node->shortCircuit();
    if (NewNode) {
      NodeStack.push_back(NewNode);
      NodeStack.push_back(Node);
      Splits[Node->getIndex()].push_back(NewNode);
    }
  }

  // Insert the nodes in the right place.
  NodeList NewList;
  NewList.reserve(Nodes.size());
  CfgUnorderedSet<SizeT> Inserted;
  for (auto *Node : Nodes) {
    if (Inserted.find(Node->getIndex()) != Inserted.end())
      continue; // already inserted
    NodeList Stack{Node};
    while (!Stack.empty()) {
      auto *Current = Stack.back();
      Stack.pop_back();
      Inserted.insert(Current->getIndex());
      NewList.push_back(Current);
      for (auto *Next : Splits[Current->getIndex()]) {
        Stack.push_back(Next);
      }
    }
  }

  SizeT NodeIndex = 0;
  for (auto *Node : NewList) {
    Node->resetIndex(NodeIndex++);
  }
  Nodes = NewList;
}

void Cfg::floatConstantCSE() {
  // Load multiple uses of a floating point constant (between two call
  // instructions or block start/end) into a variable before its first use.
  //   t1 = b + 1.0
  //   t2 = c + 1.0
  // gets transformed to:
  //   t0 = 1.0
  //   t0_1 = t0
  //   t1 = b + t0_1
  //   t2 = c + t0_1
  // Call instructions reset the procedure, but use the same variable, just in
  // case it got a register. We are assuming floating point registers are not
  // callee-saved in general. For example, continuing from before:
  //   result = call <some function>
  //   t3 = d + 1.0
  // gets transformed to:
  //   result = call <some function>
  //   t0_2 = t0
  //   t3 = d + t0_2
  // TODO(manasijm, stichnot): Figure out how to 'link' t0 to the stack slot
  // of 1.0. When t0 does not get a register, introducing an extra assignment
  // statement does not make sense. The relevant portion is marked below.

  TimerMarker _(TimerStack::TT_floatConstantCse, this);
  for (CfgNode *Node : getNodes()) {

    CfgUnorderedMap<Constant *, Variable *> ConstCache;
    auto Current = Node->getInsts().begin();
    auto End = Node->getInsts().end();
    while (Current != End) {
      CfgUnorderedMap<Constant *, CfgVector<InstList::iterator>> FloatUses;
      if (llvm::isa<InstCall>(iteratorToInst(Current))) {
        ++Current;
        assert(Current != End);
        // A block should not end with a call.
      }
      while (Current != End && !llvm::isa<InstCall>(iteratorToInst(Current))) {
        if (!Current->isDeleted()) {
          for (SizeT i = 0; i < Current->getSrcSize(); ++i) {
            if (auto *Const = llvm::dyn_cast<Constant>(Current->getSrc(i))) {
              if (Const->getType() == IceType_f32 ||
                  Const->getType() == IceType_f64) {
                FloatUses[Const].push_back(Current);
              }
            }
          }
        }
        ++Current;
      }
      for (auto &Pair : FloatUses) {
        // Only consider constants with at least MinUseThreshold uses.
        static constexpr SizeT MinUseThreshold = 3;
        if (Pair.second.size() < MinUseThreshold)
          continue;
        auto &Insts = Node->getInsts();

        if (ConstCache.find(Pair.first) == ConstCache.end()) {
          // Saw a constant (which is used at least twice) for the first time.
          auto *NewVar = makeVariable(Pair.first->getType());
          // NewVar->setLinkedTo(Pair.first);
          // TODO(manasijm): Plumbing for linking to an Operand.
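          // Materialize the constant once into a fresh variable -- the
          // "t0 = 1.0" step in the sketch above. Later runs between calls
          // copy from this cached variable instead of reloading the constant.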
          auto *Assign = InstAssign::create(Node->getCfg(), NewVar, Pair.first);
          Insts.insert(Pair.second[0], Assign);
          ConstCache[Pair.first] = NewVar;
        }

        auto *NewVar = makeVariable(Pair.first->getType());
        NewVar->setLinkedTo(ConstCache[Pair.first]);
        auto *Assign =
            InstAssign::create(Node->getCfg(), NewVar, ConstCache[Pair.first]);

        Insts.insert(Pair.second[0], Assign);
        for (auto InstUse : Pair.second) {
          for (SizeT i = 0; i < InstUse->getSrcSize(); ++i) {
            if (auto *Const = llvm::dyn_cast<Constant>(InstUse->getSrc(i))) {
              if (Const == Pair.first) {
                InstUse->replaceSource(i, NewVar);
              }
            }
          }
        }
      }
    }
  }
}

void Cfg::doArgLowering() {
  TimerMarker T(TimerStack::TT_doArgLowering, this);
  getTarget()->lowerArguments();
}

void Cfg::sortAndCombineAllocas(CfgVector<InstAlloca *> &Allocas,
                                uint32_t CombinedAlignment, InstList &Insts,
                                AllocaBaseVariableType BaseVariableType) {
  if (Allocas.empty())
    return;
  // Sort by decreasing alignment.
  std::sort(Allocas.begin(), Allocas.end(),
            [](InstAlloca *A1, InstAlloca *A2) {
              uint32_t Align1 = A1->getAlignInBytes();
              uint32_t Align2 = A2->getAlignInBytes();
              if (Align1 == Align2)
                return A1->getNumber() < A2->getNumber();
              else
                return Align1 > Align2;
            });
  // Process the allocas in order of decreasing stack alignment. This allows
  // us to pack less-aligned pieces after more-aligned ones, resulting in less
  // stack growth. It also allows there to be at most one stack alignment "and"
  // instruction for a whole list of allocas.
  uint32_t CurrentOffset = 0;
  CfgVector<int32_t> Offsets;
  for (Inst *Instr : Allocas) {
    auto *Alloca = llvm::cast<InstAlloca>(Instr);
    // Adjust the size of the allocation up to the next multiple of the
    // object's alignment.
    uint32_t Alignment = std::max(Alloca->getAlignInBytes(), 1u);
    auto *ConstSize =
        llvm::dyn_cast<ConstantInteger32>(Alloca->getSizeInBytes());
    uint32_t Size = Utils::applyAlignment(ConstSize->getValue(), Alignment);
    if (BaseVariableType == BVT_FramePointer) {
      // Addressing is relative to the frame pointer. Subtract the offset after
      // adding the size of the alloca, because it grows downwards from the
      // frame pointer.
      Offsets.push_back(Target->getFramePointerOffset(CurrentOffset, Size));
    } else {
      // Addressing is relative to the stack pointer or to a user pointer. Add
      // the offset before adding the size of the object, because it grows
      // upwards from the stack pointer. In addition, if the addressing is
      // relative to the stack pointer, we need to add the pre-computed max out
      // args size bytes.
      const uint32_t OutArgsOffsetOrZero =
          (BaseVariableType == BVT_StackPointer)
              ? getTarget()->maxOutArgsSizeBytes()
              : 0;
      Offsets.push_back(CurrentOffset + OutArgsOffsetOrZero);
    }
    // Update the running offset of the fused alloca region.
    CurrentOffset += Size;
  }
  // Round the offset up to the alignment granularity to use as the size.
  uint32_t TotalSize = Utils::applyAlignment(CurrentOffset, CombinedAlignment);
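  // For instance (hypothetical sizes and alignments): allocas of
  // (size=4, align=16), (size=12, align=8), and (size=5, align=4), once
  // sorted, get rounded sizes 16, 16, and 8, and relative offsets 0, 16, and
  // 32, so CurrentOffset ends at 40, and TotalSize is 40 rounded up to
  // CombinedAlignment.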
  // Ensure every alloca was assigned an offset.
  assert(Allocas.size() == Offsets.size());

  switch (BaseVariableType) {
  case BVT_UserPointer: {
    Variable *BaseVariable = makeVariable(IceType_i32);
    for (SizeT i = 0; i < Allocas.size(); ++i) {
      auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
      // Emit a new addition operation to replace the alloca.
      Operand *AllocaOffset = Ctx->getConstantInt32(Offsets[i]);
      InstArithmetic *Add =
          InstArithmetic::create(this, InstArithmetic::Add, Alloca->getDest(),
                                 BaseVariable, AllocaOffset);
      Insts.push_front(Add);
      Alloca->setDeleted();
    }
    Operand *AllocaSize = Ctx->getConstantInt32(TotalSize);
    InstAlloca *CombinedAlloca =
        InstAlloca::create(this, BaseVariable, AllocaSize, CombinedAlignment);
    CombinedAlloca->setKnownFrameOffset();
    Insts.push_front(CombinedAlloca);
  } break;
  case BVT_StackPointer:
  case BVT_FramePointer: {
    for (SizeT i = 0; i < Allocas.size(); ++i) {
      auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
      // Emit a fake definition of the rematerializable variable.
      Variable *Dest = Alloca->getDest();
      auto *Def = InstFakeDef::create(this, Dest);
      if (BaseVariableType == BVT_StackPointer)
        Dest->setRematerializable(getTarget()->getStackReg(), Offsets[i]);
      else
        Dest->setRematerializable(getTarget()->getFrameReg(), Offsets[i]);
      Insts.push_front(Def);
      Alloca->setDeleted();
    }
    // Allocate the fixed area in the function prolog.
    getTarget()->reserveFixedAllocaArea(TotalSize, CombinedAlignment);
  } break;
  }
}

void Cfg::processAllocas(bool SortAndCombine) {
  TimerMarker _(TimerStack::TT_alloca, this);
  const uint32_t StackAlignment = getTarget()->getStackAlignment();
  CfgNode *EntryNode = getEntryNode();
  assert(EntryNode);
  // LLVM enforces power of 2 alignment.
  assert(llvm::isPowerOf2_32(StackAlignment));
  // If the ABI's stack alignment is smaller than the vector size (16 bytes),
  // conservatively use a frame pointer to allow for explicit alignment of the
  // stack pointer. This needs to happen before register allocation so the
  // frame pointer can be reserved.
  if (getTarget()->needsStackPointerAlignment()) {
    getTarget()->setHasFramePointer();
  }
  // Determine if there are large alignment allocations in the entry block or
  // dynamic allocations (variable size in the entry block).
  bool HasLargeAlignment = false;
  bool HasDynamicAllocation = false;
  for (Inst &Instr : EntryNode->getInsts()) {
    if (Instr.isDeleted())
      continue;
    if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
      uint32_t AlignmentParam = Alloca->getAlignInBytes();
      if (AlignmentParam > StackAlignment)
        HasLargeAlignment = true;
      if (llvm::isa<Constant>(Alloca->getSizeInBytes()))
        Alloca->setKnownFrameOffset();
      else {
        HasDynamicAllocation = true;
        // If allocas are not sorted, the first dynamic allocation causes
        // later allocas to be at unknown offsets relative to the stack/frame.
        if (!SortAndCombine)
          break;
      }
    }
  }
  // Don't do the heavyweight sorting and layout for low optimization levels.
  if (!SortAndCombine)
    return;
  // Any alloca outside the entry block is a dynamic allocation.
  for (CfgNode *Node : Nodes) {
    if (Node == EntryNode)
      continue;
    for (Inst &Instr : Node->getInsts()) {
      if (Instr.isDeleted())
        continue;
      if (llvm::isa<InstAlloca>(&Instr)) {
        // Allocations outside the entry block require a frame pointer.
        HasDynamicAllocation = true;
        break;
      }
    }
    if (HasDynamicAllocation)
      break;
  }
  // Mark the target as requiring a frame pointer.
  if (HasLargeAlignment || HasDynamicAllocation)
    getTarget()->setHasFramePointer();
  // Collect the allocas into the two vectors.
  // FixedAllocas: allocas in the entry block that have constant size and
  // alignment less than or equal to the function's stack alignment.
  CfgVector<InstAlloca *> FixedAllocas;
  // AlignedAllocas: allocas in the entry block that have constant size and
  // alignment greater than the function's stack alignment.
  CfgVector<InstAlloca *> AlignedAllocas;
  // Maximum alignment used by any alloca.
  uint32_t MaxAlignment = StackAlignment;
  for (Inst &Instr : EntryNode->getInsts()) {
    if (Instr.isDeleted())
      continue;
    if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
      if (!llvm::isa<Constant>(Alloca->getSizeInBytes()))
        continue;
      uint32_t AlignmentParam = Alloca->getAlignInBytes();
      // For default align=0, set it to the real value 1, to avoid any
      // bit-manipulation problems below.
      AlignmentParam = std::max(AlignmentParam, 1u);
      assert(llvm::isPowerOf2_32(AlignmentParam));
      if (HasDynamicAllocation && AlignmentParam > StackAlignment) {
        // If we have both dynamic allocations and large stack alignments,
        // high-alignment allocations are pulled out with their own base.
        AlignedAllocas.push_back(Alloca);
      } else {
        FixedAllocas.push_back(Alloca);
      }
      MaxAlignment = std::max(AlignmentParam, MaxAlignment);
    }
  }
  // Add instructions to the head of the entry block in reverse order.
  InstList &Insts = getEntryNode()->getInsts();
  if (HasDynamicAllocation && HasLargeAlignment) {
    // We are using a frame pointer, but fixed large-alignment alloca
    // addresses do not have a known offset from either the stack or frame
    // pointer. They are instead addressed upward from a user pointer produced
    // by a combined alloca.
    sortAndCombineAllocas(AlignedAllocas, MaxAlignment, Insts, BVT_UserPointer);
    // Fixed size allocas are addressed relative to the frame pointer.
    sortAndCombineAllocas(FixedAllocas, StackAlignment, Insts,
                          BVT_FramePointer);
  } else {
    // Otherwise, fixed size allocas are addressed relative to the stack
    // unless there are dynamic allocas.
    const AllocaBaseVariableType BasePointerType =
        (HasDynamicAllocation ? BVT_FramePointer : BVT_StackPointer);
    sortAndCombineAllocas(FixedAllocas, MaxAlignment, Insts, BasePointerType);
  }
  if (!FixedAllocas.empty() || !AlignedAllocas.empty())
    // No use calling findRematerializable() unless there is some
    // rematerializable alloca instruction to seed it.
    findRematerializable();
}

namespace {

// Helpers for findRematerializable(). For each of them, if a suitable
// rematerialization is found, the instruction's Dest variable is set to be
// rematerializable and it returns true; otherwise it returns false.

bool rematerializeArithmetic(const Inst *Instr) {
  // Check that it's an Arithmetic instruction with an Add operation.
  auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr);
  if (Arith == nullptr || Arith->getOp() != InstArithmetic::Add)
    return false;
  // Check that Src(0) is rematerializable.
  auto *Src0Var = llvm::dyn_cast<Variable>(Arith->getSrc(0));
  if (Src0Var == nullptr || !Src0Var->isRematerializable())
    return false;
  // Check that Src(1) is an immediate.
  auto *Src1Imm = llvm::dyn_cast<ConstantInteger32>(Arith->getSrc(1));
  if (Src1Imm == nullptr)
    return false;
  Arith->getDest()->setRematerializable(
      Src0Var->getRegNum(), Src0Var->getStackOffset() + Src1Imm->getValue());
  return true;
}

bool rematerializeAssign(const Inst *Instr) {
  // An InstAssign only originates from an inttoptr or ptrtoint instruction,
  // which never occurs in a MINIMAL build.
  if (BuildDefs::minimal())
    return false;
  // Check that it's an Assign instruction.
  if (!llvm::isa<InstAssign>(Instr))
    return false;
  // Check that Src(0) is rematerializable.
  auto *Src0Var = llvm::dyn_cast<Variable>(Instr->getSrc(0));
  if (Src0Var == nullptr || !Src0Var->isRematerializable())
    return false;
  Instr->getDest()->setRematerializable(Src0Var->getRegNum(),
                                        Src0Var->getStackOffset());
  return true;
}

bool rematerializeCast(const Inst *Instr) {
  // A pointer-type bitcast never occurs in a MINIMAL build.
  if (BuildDefs::minimal())
    return false;
  // Check that it's a Cast instruction with a Bitcast operation.
  auto *Cast = llvm::dyn_cast<InstCast>(Instr);
  if (Cast == nullptr || Cast->getCastKind() != InstCast::Bitcast)
    return false;
  // Check that Src(0) is rematerializable.
  auto *Src0Var = llvm::dyn_cast<Variable>(Cast->getSrc(0));
  if (Src0Var == nullptr || !Src0Var->isRematerializable())
    return false;
  // Check that Dest and Src(0) have the same type.
  Variable *Dest = Cast->getDest();
  if (Dest->getType() != Src0Var->getType())
    return false;
  Dest->setRematerializable(Src0Var->getRegNum(), Src0Var->getStackOffset());
  return true;
}

} // end of anonymous namespace

/// Scan the function to find additional rematerializable variables. This is
/// possible when the source operand of an InstAssignment is a rematerializable
/// variable, or the same for a pointer-type InstCast::Bitcast, or when an
/// InstArithmetic is an add of a rematerializable variable and an immediate.
/// Note that InstAssignment instructions and pointer-type InstCast::Bitcast
/// instructions generally only come about from the IceConverter's treatment of
/// inttoptr, ptrtoint, and bitcast instructions. TODO(stichnot): Consider
/// other possibilities, however unlikely, such as InstArithmetic::Sub, or
/// commutativity.
void Cfg::findRematerializable() {
  // Scan the instructions in order, and repeat until no new opportunities are
  // found. It may take more than one iteration because a variable's defining
  // block may happen to come after a block where it is used, depending on the
  // CfgNode linearization order.
  bool FoundNewAssignment;
  do {
    FoundNewAssignment = false;
    for (CfgNode *Node : getNodes()) {
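      // An illustrative chain this scan can mark (hypothetical, where a is
      // already rematerializable as sp+16 from a combined alloca):
      //   b = a + 4   ; marked by rematerializeArithmetic, as sp+20
      //   c = b       ; marked by rematerializeAssign, also sp+20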
      // No need to process Phi instructions.
      for (Inst &Instr : Node->getInsts()) {
        if (Instr.isDeleted())
          continue;
        Variable *Dest = Instr.getDest();
        if (Dest == nullptr || Dest->isRematerializable())
          continue;
        if (rematerializeArithmetic(&Instr) || rematerializeAssign(&Instr) ||
            rematerializeCast(&Instr)) {
          FoundNewAssignment = true;
        }
      }
    }
  } while (FoundNewAssignment);
}

void Cfg::doAddressOpt() {
  TimerMarker T(TimerStack::TT_doAddressOpt, this);
  for (CfgNode *Node : Nodes)
    Node->doAddressOpt();
}

namespace {
// ShuffleVectorUtils implements helper functions for rematerializing
// shufflevector instructions from a sequence of extractelement/insertelement
// instructions. It looks for the following pattern:
//
//   %t0 = extractelement A, %n0
//   %t1 = extractelement B, %n1
//   %t2 = extractelement C, %n2
//   ...
//   %tN = extractelement N, %nN
//   %d0 = insertelement undef, %t0, 0
//   %d1 = insertelement %d0, %t1, 1
//   %d2 = insertelement %d1, %t2, 2
//   ...
//   %dest = insertelement %d_N-1, %tN, N
//
// where N is num_element(typeof(%dest)), and A, B, C, ... N are at most two
// distinct variables.
namespace ShuffleVectorUtils {
// findAllInserts is used when searching for all the insertelements that are
// used in a shufflevector operation. This function works recursively: when
// invoked with I = i, the function assumes Insts[i] is the last found
// insertelement in the chain. The next insertelement instruction is saved in
// Insts[i+1].
bool findAllInserts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                    CfgVector<const Inst *> *Insts, SizeT I = 0) {
  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

  if (I > Insts->size()) {
    if (Verbose) {
      Ctx->getStrDump() << "\tToo many inserts.\n";
    }
    return false;
  }

  const auto *LastInsert = Insts->at(I);
  assert(llvm::isa<InstInsertElement>(LastInsert));

  if (I == Insts->size() - 1) {
    // Matching against undef is not really needed because the value in Src(0)
    // will be totally overwritten. We still enforce it anyway because the
    // PNaCl toolchain generates the bitcode with it.
    if (!llvm::isa<ConstantUndef>(LastInsert->getSrc(0))) {
      if (Verbose) {
        Ctx->getStrDump() << "\tSrc0 is not undef: " << I << " "
                          << Insts->size();
        LastInsert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    // The following loop ensures that the insertelements are sorted. In
    // theory, we could relax this restriction and allow any order. As long as
    // each index appears exactly once, this chain is still a candidate for
    // becoming a shufflevector. The Insts vector is traversed backwards
    // because the instructions are "enqueued" in reverse order.
    int32_t ExpectedElement = 0;
    for (const auto *I : reverse_range(*Insts)) {
      if (llvm::cast<ConstantInteger32>(I->getSrc(2))->getValue() !=
          ExpectedElement) {
        return false;
      }
      ++ExpectedElement;
    }
    return true;
  }

  const auto *Src0V = llvm::cast<Variable>(LastInsert->getSrc(0));
  const auto *Def = VM->getSingleDefinition(Src0V);

  // Only optimize if the first operand in
  //
  //   Dest = insertelement A, B, 10
  //
  // is singly-def'ed.
  if (Def == nullptr) {
    if (Verbose) {
      Ctx->getStrDump() << "\tmulti-def: ";
      (*Insts)[I]->dump(Func);
      Ctx->getStrDump() << "\n";
    }
    return false;
  }

  // We also require the (single) definition to come from an insertelement
  // instruction.
  if (!llvm::isa<InstInsertElement>(Def)) {
    if (Verbose) {
      Ctx->getStrDump() << "\tnot insert element: ";
      Def->dump(Func);
      Ctx->getStrDump() << "\n";
    }
    return false;
  }

  // Everything seems fine, so we save Def in Insts, and delegate the decision
  // to findAllInserts.
  (*Insts)[I + 1] = Def;

  return findAllInserts(Func, Ctx, VM, Insts, I + 1);
}

// insertsLastElement returns true if Insert is inserting an element in the
// last position of a vector.
bool insertsLastElement(const Inst &Insert) {
  const Type DestTy = Insert.getDest()->getType();
  assert(isVectorType(DestTy));
  const SizeT Elem =
      llvm::cast<ConstantInteger32>(Insert.getSrc(2))->getValue();
  return Elem == typeNumElements(DestTy) - 1;
}

// findAllExtracts goes over all the insertelement instructions that are
// candidates to be replaced by a shufflevector, and searches for all the
// definitions of the elements being inserted. If all of the elements are the
// result of an extractelement instruction, and all of the extractelements
// operate on at most two different sources, then the instructions can be
// replaced by a shufflevector.
bool findAllExtracts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                     const CfgVector<const Inst *> &Insts, Variable **Src0,
                     Variable **Src1, CfgVector<const Inst *> *Extracts) {
  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

  *Src0 = nullptr;
  *Src1 = nullptr;
  assert(Insts.size() > 0);
  for (SizeT I = 0; I < Insts.size(); ++I) {
    const auto *Insert = Insts.at(I);
    const auto *Src1V = llvm::dyn_cast<Variable>(Insert->getSrc(1));
    if (Src1V == nullptr) {
      if (Verbose) {
        Ctx->getStrDump() << "src(1) is not a variable: ";
        Insert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    const auto *Def = VM->getSingleDefinition(Src1V);
    if (Def == nullptr) {
      if (Verbose) {
        Ctx->getStrDump() << "multi-def src(1): ";
        Insert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    if (!llvm::isa<InstExtractElement>(Def)) {
      if (Verbose) {
        Ctx->getStrDump() << "not extractelement: ";
        Def->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    auto *Src = llvm::cast<Variable>(Def->getSrc(0));
    if (*Src0 == nullptr) {
      // No sources yet. Save Src to Src0.
      *Src0 = Src;
    } else if (*Src1 == nullptr) {
      // We already have a source, so we might save Src in Src1 -- but only if
      // Src0 is not Src.
      if (*Src0 != Src) {
        *Src1 = Src;
      }
    } else if (Src != *Src0 && Src != *Src1) {
      // More than two sources, so we can't rematerialize the shufflevector
      // instruction.
      if (Verbose) {
        Ctx->getStrDump() << "Can't shuffle more than two sources.\n";
      }
      return false;
    }

    (*Extracts)[I] = Def;
  }

  // We should have seen at least one source operand.
  assert(*Src0 != nullptr);

  // If a second source was not seen, then we just make Src1 = Src0 to
  // simplify things downstream. This should not matter, as all of the indexes
  // in the shufflevector instruction will point to Src0.
  if (*Src1 == nullptr) {
    *Src1 = *Src0;
  }

  return true;
}

} // end of namespace ShuffleVectorUtils
} // end of anonymous namespace

void Cfg::materializeVectorShuffles() {
  const bool Verbose = BuildDefs::dump() && isVerbose(IceV_ShufMat);

  std::unique_ptr<OstreamLocker> L;
  if (Verbose) {
    L.reset(new OstreamLocker(getContext()));
    getContext()->getStrDump() << "\nShuffle materialization:\n";
  }

  // MaxVectorElements is the maximum number of elements in the vector types
  // handled by Subzero. We use it to create the Inserts and Extracts vectors
  // with the appropriate size, thus avoiding resize() calls.
  const SizeT MaxVectorElements = typeNumElements(IceType_v16i8);
  CfgVector<const Inst *> Inserts(MaxVectorElements);
  CfgVector<const Inst *> Extracts(MaxVectorElements);

  TimerMarker T(TimerStack::TT_materializeVectorShuffles, this);
  for (CfgNode *Node : Nodes) {
    for (auto &Instr : Node->getInsts()) {
      if (!llvm::isa<InstInsertElement>(Instr)) {
        continue;
      }
      if (!ShuffleVectorUtils::insertsLastElement(Instr)) {
        // To avoid wasting time, we only start the pattern match at the last
        // insertelement instruction -- and go backwards from there.
        continue;
      }
      if (Verbose) {
        getContext()->getStrDump() << "\tCandidate: ";
        Instr.dump(this);
        getContext()->getStrDump() << "\n";
      }
      Inserts.resize(typeNumElements(Instr.getDest()->getType()));
      Inserts[0] = &Instr;
      if (!ShuffleVectorUtils::findAllInserts(this, getContext(),
                                              VMetadata.get(), &Inserts)) {
        // If we fail to find a sequence of insertelements, we stop the
        // optimization.
        if (Verbose) {
          getContext()->getStrDump() << "\tFalse alarm.\n";
        }
        continue;
      }
      if (Verbose) {
        getContext()->getStrDump()
            << "\tFound the following insertelement: \n";
        for (auto *I : reverse_range(Inserts)) {
          getContext()->getStrDump() << "\t\t";
          I->dump(this);
          getContext()->getStrDump() << "\n";
        }
      }
      Extracts.resize(Inserts.size());
      Variable *Src0;
      Variable *Src1;
      if (!ShuffleVectorUtils::findAllExtracts(this, getContext(),
                                               VMetadata.get(), Inserts, &Src0,
                                               &Src1, &Extracts)) {
        // If we fail to match the definitions of the insertelements' sources
        // with extractelement instructions -- or if those instructions
        // operate on more than two different variables -- we stop the
        // optimization.
        if (Verbose) {
          getContext()->getStrDump() << "\tFailed to match extractelements.\n";
        }
        continue;
      }
      if (Verbose) {
        getContext()->getStrDump()
            << "\tFound the following insert/extract element pairs: \n";
        for (SizeT I = 0; I < Inserts.size(); ++I) {
          const SizeT Pos = Inserts.size() - I - 1;
          getContext()->getStrDump() << "\t\tInsert : ";
          Inserts[Pos]->dump(this);
          getContext()->getStrDump() << "\n\t\tExtract: ";
          Extracts[Pos]->dump(this);
          getContext()->getStrDump() << "\n";
        }
      }

      assert(Src0 != nullptr);
      assert(Src1 != nullptr);

      auto *ShuffleVector =
          InstShuffleVector::create(this, Instr.getDest(), Src0, Src1);
      assert(ShuffleVector->getSrc(0) == Src0);
      assert(ShuffleVector->getSrc(1) == Src1);
      for (SizeT I = 0; I < Extracts.size(); ++I) {
        const SizeT Pos = Extracts.size() - I - 1;
        auto *Index = llvm::cast<ConstantInteger32>(Extracts[Pos]->getSrc(1));
        if (Src0 == Extracts[Pos]->getSrc(0)) {
          ShuffleVector->addIndex(Index);
        } else {
          ShuffleVector->addIndex(llvm::cast<ConstantInteger32>(
              Ctx->getConstantInt32(Index->getValue() + Extracts.size())));
        }
      }

      if (Verbose) {
        getContext()->getStrDump() << "Created: ";
        ShuffleVector->dump(this);
        getContext()->getStrDump() << "\n";
      }

      Instr.setDeleted();
      auto &LoweringContext = getTarget()->getContext();
      LoweringContext.setInsertPoint(instToIterator(&Instr));
      LoweringContext.insert(ShuffleVector);
    }
  }
}

void Cfg::doNopInsertion() {
  if (!getFlags().getShouldDoNopInsertion())
    return;
  TimerMarker T(TimerStack::TT_doNopInsertion, this);
  RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_NopInsertion,
                            SequenceNumber);
  for (CfgNode *Node : Nodes)
    Node->doNopInsertion(RNG);
}

void Cfg::genCode() {
  TimerMarker T(TimerStack::TT_genCode, this);
  for (CfgNode *Node : Nodes)
    Node->genCode();
}

// Compute the stack frame layout.
void Cfg::genFrame() {
  TimerMarker T(TimerStack::TT_genFrame, this);
  getTarget()->addProlog(Entry);
  for (CfgNode *Node : Nodes)
    if (Node->getHasReturn())
      getTarget()->addEpilog(Node);
}

void Cfg::generateLoopInfo() {
  TimerMarker T(TimerStack::TT_computeLoopNestDepth, this);
  LoopInfo = ComputeLoopInfo(this);
}

// This is a lightweight version of live-range-end calculation. Marks the last
// use of only those variables whose definition and uses are completely within
// a single block. It is a quick single pass and doesn't need to iterate until
// convergence.
void Cfg::livenessLightweight() {
  TimerMarker T(TimerStack::TT_livenessLightweight, this);
  getVMetadata()->init(VMK_Uses);
  for (CfgNode *Node : Nodes)
    Node->livenessLightweight();
}

void Cfg::liveness(LivenessMode Mode) {
  TimerMarker T(TimerStack::TT_liveness, this);
  // Destroying the previous (if any) Liveness information clears the Liveness
  // allocator TLS pointer.
  Live = nullptr;
  Live = Liveness::create(this, Mode);

  getVMetadata()->init(VMK_Uses);
  Live->init();

  // Initialize with all nodes needing to be processed.
  BitVector NeedToProcess(Nodes.size(), true);
  while (NeedToProcess.any()) {
    // Iterate in reverse topological order to speed up convergence.
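    // (Liveness is a backward dataflow problem, so visiting a node after its
    // successors gives each pass more up-to-date live-out information.)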
    for (CfgNode *Node : reverse_range(Nodes)) {
      if (NeedToProcess[Node->getIndex()]) {
        NeedToProcess[Node->getIndex()] = false;
        bool Changed = Node->liveness(getLiveness());
        if (Changed) {
          // If the beginning-of-block liveness changed since the last
          // iteration, mark all in-edges as needing to be processed.
          for (CfgNode *Pred : Node->getInEdges())
            NeedToProcess[Pred->getIndex()] = true;
        }
      }
    }
  }
  if (Mode == Liveness_Intervals) {
    // Reset each variable's live range.
    for (Variable *Var : Variables)
      Var->resetLiveRange();
  }
  // Make a final pass over each node to delete dead instructions, collect the
  // first and last instruction numbers, and add live range segments for that
  // node.
  for (CfgNode *Node : Nodes) {
    InstNumberT FirstInstNum = Inst::NumberSentinel;
    InstNumberT LastInstNum = Inst::NumberSentinel;
    for (Inst &I : Node->getPhis()) {
      I.deleteIfDead();
      if (Mode == Liveness_Intervals && !I.isDeleted()) {
        if (FirstInstNum == Inst::NumberSentinel)
          FirstInstNum = I.getNumber();
        assert(I.getNumber() > LastInstNum);
        LastInstNum = I.getNumber();
      }
    }
    for (Inst &I : Node->getInsts()) {
      I.deleteIfDead();
      if (Mode == Liveness_Intervals && !I.isDeleted()) {
        if (FirstInstNum == Inst::NumberSentinel)
          FirstInstNum = I.getNumber();
        assert(I.getNumber() > LastInstNum);
        LastInstNum = I.getNumber();
      }
    }
    if (Mode == Liveness_Intervals) {
      // Special treatment for live in-args. Their liveness needs to extend
      // beyond the beginning of the function, otherwise an arg whose only use
      // is in the first instruction will end up having the trivial live range
      // [2,2) and will *not* interfere with other arguments. So if the first
      // instruction of the method is "r=arg1+arg2", both args may be assigned
      // the same register. This is accomplished by extending the entry
      // block's instruction range from [2,n) to [1,n), which will transform
      // the problematic [2,2) live ranges into [1,2). This extension works
      // because the entry node is guaranteed to have the lowest instruction
      // numbers.
      if (Node == getEntryNode()) {
        FirstInstNum = Inst::NumberExtended;
        // Just in case the entry node somehow contains no instructions...
        if (LastInstNum == Inst::NumberSentinel)
          LastInstNum = FirstInstNum;
      }
      // If this node somehow contains no instructions, don't bother trying to
      // add liveness intervals for it, because variables that are live-in and
      // live-out will have a bogus interval added.
      if (FirstInstNum != Inst::NumberSentinel)
        Node->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
    }
  }
}

// Traverse every Variable of every Inst and verify that it appears within the
// Variable's computed live range.
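// Returns false if any use or def falls outside the computed range; each
// violation is also dumped, so a failing run leaves a diagnostic trail in the
// dump stream.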
bool Cfg::validateLiveness() const {
  TimerMarker T(TimerStack::TT_validateLiveness, this);
  bool Valid = true;
  OstreamLocker L(Ctx);
  Ostream &Str = Ctx->getStrDump();
  for (CfgNode *Node : Nodes) {
    Inst *FirstInst = nullptr;
    for (Inst &Instr : Node->getInsts()) {
      if (Instr.isDeleted())
        continue;
      if (FirstInst == nullptr)
        FirstInst = &Instr;
      InstNumberT InstNumber = Instr.getNumber();
      if (Variable *Dest = Instr.getDest()) {
        if (!Dest->getIgnoreLiveness()) {
          bool Invalid = false;
          constexpr bool IsDest = true;
          if (!Dest->getLiveRange().containsValue(InstNumber, IsDest))
            Invalid = true;
          // Check that this instruction actually *begins* Dest's live range,
          // by checking that Dest is not live in the previous instruction. As
          // a special exception, we don't check this for the first
          // instruction of the block, because a Phi temporary may be live at
          // the end of the previous block, and if it is also assigned in the
          // first instruction of this block, the adjacent live ranges get
          // merged.
          if (&Instr != FirstInst && !Instr.isDestRedefined() &&
              Dest->getLiveRange().containsValue(InstNumber - 1, IsDest))
            Invalid = true;
          if (Invalid) {
            Valid = false;
            Str << "Liveness error: inst " << Instr.getNumber() << " dest ";
            Dest->dump(this);
            Str << " live range " << Dest->getLiveRange() << "\n";
          }
        }
      }
      FOREACH_VAR_IN_INST(Var, Instr) {
        static constexpr bool IsDest = false;
        if (!Var->getIgnoreLiveness() &&
            !Var->getLiveRange().containsValue(InstNumber, IsDest)) {
          Valid = false;
          Str << "Liveness error: inst " << Instr.getNumber() << " var ";
          Var->dump(this);
          Str << " live range " << Var->getLiveRange() << "\n";
        }
      }
    }
  }
  return Valid;
}

void Cfg::contractEmptyNodes() {
  // If we're decorating the asm output with register liveness info, this
  // information may become corrupted or incorrect after contracting nodes
  // that contain only redundant assignments. As such, we disable this pass
  // when DecorateAsm is specified. This may make the resulting code look more
  // branchy, but it should have no effect on the register assignments.
  if (getFlags().getDecorateAsm())
    return;
  for (CfgNode *Node : Nodes) {
    Node->contractIfEmpty();
  }
}

void Cfg::doBranchOpt() {
  TimerMarker T(TimerStack::TT_doBranchOpt, this);
  for (auto I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
    auto NextNode = I + 1;
    (*I)->doBranchOpt(NextNode == E ? nullptr : *NextNode);
  }
}

void Cfg::markNodesForSandboxing() {
  for (const InstJumpTable *JT : JumpTables)
    for (SizeT I = 0; I < JT->getNumTargets(); ++I)
      JT->getTarget(I)->setNeedsAlignment();
}

// ======================== Dump routines ======================== //

// emitTextHeader() is not target-specific (apart from what is abstracted by
// the Assembler), so it is defined here rather than in the target lowering
// class.
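// For a function named "foo" with function sections enabled, the emitted
// header looks roughly like the following. The alignment directive and the
// bundle padding bytes are supplied by the target's Assembler, so the last
// two lines are only an illustrative sketch:
//   .text
//   .section .text.foo,"ax",%progbits
//   .globl foo
//   .type foo,%function
//   <align directive> <bundle align log2 bytes>,0x<nonexec bundle padding>
//   foo: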
<< Name << ",\"ax\",%progbits\n"; 1737 if (!Asm->getInternal() || getFlags().getDisableInternal()) { 1738 Str << "\t.globl\t" << Name << "\n"; 1739 Str << "\t.type\t" << Name << ",%function\n"; 1740 } 1741 Str << "\t" << Asm->getAlignDirective() << " " 1742 << Asm->getBundleAlignLog2Bytes() << ",0x"; 1743 for (uint8_t I : Asm->getNonExecBundlePadding()) 1744 Str.write_hex(I); 1745 Str << "\n"; 1746 Str << Name << ":\n"; 1747 } 1748 1749 void Cfg::emitJumpTables() { 1750 switch (getFlags().getOutFileType()) { 1751 case FT_Elf: 1752 case FT_Iasm: { 1753 // The emission needs to be delayed until the after the text section so 1754 // save the offsets in the global context. 1755 for (const InstJumpTable *JumpTable : JumpTables) { 1756 Ctx->addJumpTableData(JumpTable->toJumpTableData(getAssembler())); 1757 } 1758 } break; 1759 case FT_Asm: { 1760 // Emit the assembly directly so we don't need to hang on to all the names 1761 for (const InstJumpTable *JumpTable : JumpTables) 1762 getTarget()->emitJumpTable(this, JumpTable); 1763 } break; 1764 } 1765 } 1766 1767 void Cfg::emit() { 1768 if (!BuildDefs::dump()) 1769 return; 1770 TimerMarker T(TimerStack::TT_emitAsm, this); 1771 if (getFlags().getDecorateAsm()) { 1772 renumberInstructions(); 1773 getVMetadata()->init(VMK_Uses); 1774 liveness(Liveness_Basic); 1775 dump("After recomputing liveness for -decorate-asm"); 1776 } 1777 OstreamLocker L(Ctx); 1778 Ostream &Str = Ctx->getStrEmit(); 1779 const Assembler *Asm = getAssembler<>(); 1780 const bool NeedSandboxing = getFlags().getUseSandboxing(); 1781 1782 emitTextHeader(FunctionName, Ctx, Asm); 1783 if (getFlags().getDecorateAsm()) { 1784 for (Variable *Var : getVariables()) { 1785 if (Var->hasKnownStackOffset() && !Var->isRematerializable()) { 1786 Str << "\t" << Var->getSymbolicStackOffset() << " = " 1787 << Var->getStackOffset() << "\n"; 1788 } 1789 } 1790 } 1791 for (CfgNode *Node : Nodes) { 1792 if (NeedSandboxing && Node->needsAlignment()) { 1793 Str << "\t" << Asm->getAlignDirective() << " " 1794 << Asm->getBundleAlignLog2Bytes() << "\n"; 1795 } 1796 Node->emit(this); 1797 } 1798 emitJumpTables(); 1799 Str << "\n"; 1800 } 1801 1802 void Cfg::emitIAS() { 1803 TimerMarker T(TimerStack::TT_emitAsm, this); 1804 // The emitIAS() routines emit into the internal assembler buffer, so there's 1805 // no need to lock the streams. 1806 const bool NeedSandboxing = getFlags().getUseSandboxing(); 1807 for (CfgNode *Node : Nodes) { 1808 if (NeedSandboxing && Node->needsAlignment()) 1809 getAssembler()->alignCfgNode(); 1810 Node->emitIAS(this); 1811 } 1812 emitJumpTables(); 1813 } 1814 1815 size_t Cfg::getTotalMemoryMB() const { 1816 constexpr size_t _1MB = 1024 * 1024; 1817 assert(Allocator != nullptr); 1818 assert(CfgAllocatorTraits::current() == Allocator.get()); 1819 return Allocator->getTotalMemory() / _1MB; 1820 } 1821 1822 size_t Cfg::getLivenessMemoryMB() const { 1823 constexpr size_t _1MB = 1024 * 1024; 1824 if (Live == nullptr) { 1825 return 0; 1826 } 1827 return Live->getAllocator()->getTotalMemory() / _1MB; 1828 } 1829 1830 // Dumps the IR with an optional introductory message. 
void Cfg::dump(const char *Message) {
  if (!BuildDefs::dump())
    return;
  if (!isVerbose())
    return;
  OstreamLocker L(Ctx);
  Ostream &Str = Ctx->getStrDump();
  if (Message[0])
    Str << "================ " << Message << " ================\n";
  if (isVerbose(IceV_Mem)) {
    Str << "Memory size = " << getTotalMemoryMB() << " MB\n";
  }
  setCurrentNode(getEntryNode());
  // Print function name+args
  if (isVerbose(IceV_Instructions)) {
    Str << "define ";
    if (getInternal() && !getFlags().getDisableInternal())
      Str << "internal ";
    Str << ReturnType << " @" << getFunctionName() << "(";
    for (SizeT i = 0; i < Args.size(); ++i) {
      if (i > 0)
        Str << ", ";
      Str << Args[i]->getType() << " ";
      Args[i]->dump(this);
    }
    // Append an extra copy of the function name here, in order to print its
    // size stats but not mess up lit tests.
    Str << ") { # " << getFunctionNameAndSize() << "\n";
  }
  resetCurrentNode();
  if (isVerbose(IceV_Liveness)) {
    // Print summary info about variables
    for (Variable *Var : Variables) {
      Str << "// multiblock=";
      if (getVMetadata()->isTracked(Var))
        Str << getVMetadata()->isMultiBlock(Var);
      else
        Str << "?";
      Str << " defs=";
      bool FirstPrint = true;
      if (VMetadata->getKind() != VMK_Uses) {
        if (const Inst *FirstDef = VMetadata->getFirstDefinition(Var)) {
          Str << FirstDef->getNumber();
          FirstPrint = false;
        }
      }
      if (VMetadata->getKind() == VMK_All) {
        for (const Inst *Instr : VMetadata->getLatterDefinitions(Var)) {
          if (!FirstPrint)
            Str << ",";
          Str << Instr->getNumber();
          FirstPrint = false;
        }
      }
      Str << " weight=" << Var->getWeight(this) << " ";
      Var->dump(this);
      Str << " LIVE=" << Var->getLiveRange() << "\n";
    }
  }
  // Print each basic block
  for (CfgNode *Node : Nodes)
    Node->dump(this);
  if (isVerbose(IceV_Instructions))
    Str << "}\n";
}

} // end of namespace Ice