1 //===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// This pass implements instructions packetization for R600. It unsets isLast 12 /// bit of instructions inside a bundle and substitutes src register with 13 /// PreviousVector when applicable. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Support/Debug.h" 18 #include "AMDGPU.h" 19 #include "AMDGPUSubtarget.h" 20 #include "R600InstrInfo.h" 21 #include "llvm/CodeGen/DFAPacketizer.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/MachineFunctionPass.h" 24 #include "llvm/CodeGen/MachineLoopInfo.h" 25 #include "llvm/CodeGen/Passes.h" 26 #include "llvm/CodeGen/ScheduleDAG.h" 27 #include "llvm/Support/raw_ostream.h" 28 29 using namespace llvm; 30 31 #define DEBUG_TYPE "packets" 32 33 namespace { 34 35 class R600Packetizer : public MachineFunctionPass { 36 37 public: 38 static char ID; 39 R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} 40 41 void getAnalysisUsage(AnalysisUsage &AU) const override { 42 AU.setPreservesCFG(); 43 AU.addRequired<MachineDominatorTree>(); 44 AU.addPreserved<MachineDominatorTree>(); 45 AU.addRequired<MachineLoopInfo>(); 46 AU.addPreserved<MachineLoopInfo>(); 47 MachineFunctionPass::getAnalysisUsage(AU); 48 } 49 50 const char *getPassName() const override { 51 return "R600 Packetizer"; 52 } 53 54 bool runOnMachineFunction(MachineFunction &Fn) override; 55 }; 56 char R600Packetizer::ID = 0; 57 58 class R600PacketizerList : public VLIWPacketizerList { 59 private: 60 const R600InstrInfo *TII; 61 const R600RegisterInfo &TRI; 62 bool VLIW5; 63 bool ConsideredInstUsesAlreadyWrittenVectorElement; 64 65 unsigned getSlot(const MachineInstr &MI) const { 66 return TRI.getHWRegChan(MI.getOperand(0).getReg()); 67 } 68 69 /// \returns register to PV chan mapping for bundle/single instructions that 70 /// immediately precedes I. 71 DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I) 72 const { 73 DenseMap<unsigned, unsigned> Result; 74 I--; 75 if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) 76 return Result; 77 MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); 78 if (I->isBundle()) 79 BI++; 80 int LastDstChan = -1; 81 do { 82 bool isTrans = false; 83 int BISlot = getSlot(*BI); 84 if (LastDstChan >= BISlot) 85 isTrans = true; 86 LastDstChan = BISlot; 87 if (TII->isPredicated(*BI)) 88 continue; 89 int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); 90 if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) 91 continue; 92 int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); 93 if (DstIdx == -1) { 94 continue; 95 } 96 unsigned Dst = BI->getOperand(DstIdx).getReg(); 97 if (isTrans || TII->isTransOnly(*BI)) { 98 Result[Dst] = AMDGPU::PS; 99 continue; 100 } 101 if (BI->getOpcode() == AMDGPU::DOT4_r600 || 102 BI->getOpcode() == AMDGPU::DOT4_eg) { 103 Result[Dst] = AMDGPU::PV_X; 104 continue; 105 } 106 if (Dst == AMDGPU::OQAP) { 107 continue; 108 } 109 unsigned PVReg = 0; 110 switch (TRI.getHWRegChan(Dst)) { 111 case 0: 112 PVReg = AMDGPU::PV_X; 113 break; 114 case 1: 115 PVReg = AMDGPU::PV_Y; 116 break; 117 case 2: 118 PVReg = AMDGPU::PV_Z; 119 break; 120 case 3: 121 PVReg = AMDGPU::PV_W; 122 break; 123 default: 124 llvm_unreachable("Invalid Chan"); 125 } 126 Result[Dst] = PVReg; 127 } while ((++BI)->isBundledWithPred()); 128 return Result; 129 } 130 131 void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs) 132 const { 133 unsigned Ops[] = { 134 AMDGPU::OpName::src0, 135 AMDGPU::OpName::src1, 136 AMDGPU::OpName::src2 137 }; 138 for (unsigned i = 0; i < 3; i++) { 139 int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); 140 if (OperandIdx < 0) 141 continue; 142 unsigned Src = MI.getOperand(OperandIdx).getReg(); 143 const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src); 144 if (It != PVs.end()) 145 MI.getOperand(OperandIdx).setReg(It->second); 146 } 147 } 148 public: 149 // Ctor. 150 R600PacketizerList(MachineFunction &MF, const R600Subtarget &ST, 151 MachineLoopInfo &MLI) 152 : VLIWPacketizerList(MF, MLI, nullptr), 153 TII(ST.getInstrInfo()), 154 TRI(TII->getRegisterInfo()) { 155 VLIW5 = !ST.hasCaymanISA(); 156 } 157 158 // initPacketizerState - initialize some internal flags. 159 void initPacketizerState() override { 160 ConsideredInstUsesAlreadyWrittenVectorElement = false; 161 } 162 163 // ignorePseudoInstruction - Ignore bundling of pseudo instructions. 164 bool ignorePseudoInstruction(const MachineInstr &MI, 165 const MachineBasicBlock *MBB) override { 166 return false; 167 } 168 169 // isSoloInstruction - return true if instruction MI can not be packetized 170 // with any other instruction, which means that MI itself is a packet. 171 bool isSoloInstruction(const MachineInstr &MI) override { 172 if (TII->isVector(MI)) 173 return true; 174 if (!TII->isALUInstr(MI.getOpcode())) 175 return true; 176 if (MI.getOpcode() == AMDGPU::GROUP_BARRIER) 177 return true; 178 // XXX: This can be removed once the packetizer properly handles all the 179 // LDS instruction group restrictions. 180 return TII->isLDSInstr(MI.getOpcode()); 181 } 182 183 // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ 184 // together. 185 bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { 186 MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); 187 if (getSlot(*MII) == getSlot(*MIJ)) 188 ConsideredInstUsesAlreadyWrittenVectorElement = true; 189 // Does MII and MIJ share the same pred_sel ? 190 int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), 191 OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); 192 unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, 193 PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; 194 if (PredI != PredJ) 195 return false; 196 if (SUJ->isSucc(SUI)) { 197 for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { 198 const SDep &Dep = SUJ->Succs[i]; 199 if (Dep.getSUnit() != SUI) 200 continue; 201 if (Dep.getKind() == SDep::Anti) 202 continue; 203 if (Dep.getKind() == SDep::Output) 204 if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) 205 continue; 206 return false; 207 } 208 } 209 210 bool ARDef = 211 TII->definesAddressRegister(*MII) || TII->definesAddressRegister(*MIJ); 212 bool ARUse = 213 TII->usesAddressRegister(*MII) || TII->usesAddressRegister(*MIJ); 214 215 return !ARDef || !ARUse; 216 } 217 218 // isLegalToPruneDependencies - Is it legal to prune dependece between SUI 219 // and SUJ. 220 bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { 221 return false; 222 } 223 224 void setIsLastBit(MachineInstr *MI, unsigned Bit) const { 225 unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); 226 MI->getOperand(LastOp).setImm(Bit); 227 } 228 229 bool isBundlableWithCurrentPMI(MachineInstr &MI, 230 const DenseMap<unsigned, unsigned> &PV, 231 std::vector<R600InstrInfo::BankSwizzle> &BS, 232 bool &isTransSlot) { 233 isTransSlot = TII->isTransOnly(MI); 234 assert (!isTransSlot || VLIW5); 235 236 // Is the dst reg sequence legal ? 237 if (!isTransSlot && !CurrentPacketMIs.empty()) { 238 if (getSlot(MI) <= getSlot(*CurrentPacketMIs.back())) { 239 if (ConsideredInstUsesAlreadyWrittenVectorElement && 240 !TII->isVectorOnly(MI) && VLIW5) { 241 isTransSlot = true; 242 DEBUG({ 243 dbgs() << "Considering as Trans Inst :"; 244 MI.dump(); 245 }); 246 } 247 else 248 return false; 249 } 250 } 251 252 // Are the Constants limitations met ? 253 CurrentPacketMIs.push_back(&MI); 254 if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { 255 DEBUG({ 256 dbgs() << "Couldn't pack :\n"; 257 MI.dump(); 258 dbgs() << "with the following packets :\n"; 259 for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { 260 CurrentPacketMIs[i]->dump(); 261 dbgs() << "\n"; 262 } 263 dbgs() << "because of Consts read limitations\n"; 264 }); 265 CurrentPacketMIs.pop_back(); 266 return false; 267 } 268 269 // Is there a BankSwizzle set that meet Read Port limitations ? 270 if (!TII->fitsReadPortLimitations(CurrentPacketMIs, 271 PV, BS, isTransSlot)) { 272 DEBUG({ 273 dbgs() << "Couldn't pack :\n"; 274 MI.dump(); 275 dbgs() << "with the following packets :\n"; 276 for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { 277 CurrentPacketMIs[i]->dump(); 278 dbgs() << "\n"; 279 } 280 dbgs() << "because of Read port limitations\n"; 281 }); 282 CurrentPacketMIs.pop_back(); 283 return false; 284 } 285 286 // We cannot read LDS source registrs from the Trans slot. 287 if (isTransSlot && TII->readsLDSSrcReg(MI)) 288 return false; 289 290 CurrentPacketMIs.pop_back(); 291 return true; 292 } 293 294 MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override { 295 MachineBasicBlock::iterator FirstInBundle = 296 CurrentPacketMIs.empty() ? &MI : CurrentPacketMIs.front(); 297 const DenseMap<unsigned, unsigned> &PV = 298 getPreviousVector(FirstInBundle); 299 std::vector<R600InstrInfo::BankSwizzle> BS; 300 bool isTransSlot; 301 302 if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { 303 for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { 304 MachineInstr *MI = CurrentPacketMIs[i]; 305 unsigned Op = TII->getOperandIdx(MI->getOpcode(), 306 AMDGPU::OpName::bank_swizzle); 307 MI->getOperand(Op).setImm(BS[i]); 308 } 309 unsigned Op = 310 TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle); 311 MI.getOperand(Op).setImm(BS.back()); 312 if (!CurrentPacketMIs.empty()) 313 setIsLastBit(CurrentPacketMIs.back(), 0); 314 substitutePV(MI, PV); 315 MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); 316 if (isTransSlot) { 317 endPacket(std::next(It)->getParent(), std::next(It)); 318 } 319 return It; 320 } 321 endPacket(MI.getParent(), MI); 322 if (TII->isTransOnly(MI)) 323 return MI; 324 return VLIWPacketizerList::addToPacket(MI); 325 } 326 }; 327 328 bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { 329 const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>(); 330 const R600InstrInfo *TII = ST.getInstrInfo(); 331 332 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); 333 334 // Instantiate the packetizer. 335 R600PacketizerList Packetizer(Fn, ST, MLI); 336 337 // DFA state table should not be empty. 338 assert(Packetizer.getResourceTracker() && "Empty DFA table!"); 339 340 if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty()) 341 return false; 342 343 // 344 // Loop over all basic blocks and remove KILL pseudo-instructions 345 // These instructions confuse the dependence analysis. Consider: 346 // D0 = ... (Insn 0) 347 // R0 = KILL R0, D0 (Insn 1) 348 // R0 = ... (Insn 2) 349 // Here, Insn 1 will result in the dependence graph not emitting an output 350 // dependence between Insn 0 and Insn 2. This can lead to incorrect 351 // packetization 352 // 353 for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); 354 MBB != MBBe; ++MBB) { 355 MachineBasicBlock::iterator End = MBB->end(); 356 MachineBasicBlock::iterator MI = MBB->begin(); 357 while (MI != End) { 358 if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || 359 (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { 360 MachineBasicBlock::iterator DeleteMI = MI; 361 ++MI; 362 MBB->erase(DeleteMI); 363 End = MBB->end(); 364 continue; 365 } 366 ++MI; 367 } 368 } 369 370 // Loop over all of the basic blocks. 371 for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); 372 MBB != MBBe; ++MBB) { 373 // Find scheduling regions and schedule / packetize each region. 374 unsigned RemainingCount = MBB->size(); 375 for(MachineBasicBlock::iterator RegionEnd = MBB->end(); 376 RegionEnd != MBB->begin();) { 377 // The next region starts above the previous region. Look backward in the 378 // instruction stream until we find the nearest boundary. 379 MachineBasicBlock::iterator I = RegionEnd; 380 for(;I != MBB->begin(); --I, --RemainingCount) { 381 if (TII->isSchedulingBoundary(*std::prev(I), &*MBB, Fn)) 382 break; 383 } 384 I = MBB->begin(); 385 386 // Skip empty scheduling regions. 387 if (I == RegionEnd) { 388 RegionEnd = std::prev(RegionEnd); 389 --RemainingCount; 390 continue; 391 } 392 // Skip regions with one instruction. 393 if (I == std::prev(RegionEnd)) { 394 RegionEnd = std::prev(RegionEnd); 395 continue; 396 } 397 398 Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd); 399 RegionEnd = I; 400 } 401 } 402 403 return true; 404 405 } 406 407 } // end anonymous namespace 408 409 llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { 410 return new R600Packetizer(tm); 411 } 412