//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Add CF_ALU. R600 ALU instructions are grouped into clauses, each of which
/// can hold up to 128 ALU instructions; these instructions can access up to
/// four prefetched lines of 16 registers from constant buffers. Such ALU
/// clauses are initiated by CF_ALU instructions.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

namespace llvm {
void initializeR600EmitClauseMarkersPass(PassRegistry&);
}

namespace {

class R600EmitClauseMarkers : public MachineFunctionPass {
private:
  const R600InstrInfo *TII;
  int Address;

  unsigned OccupiedDwords(MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case AMDGPU::INTERP_PAIR_XY:
    case AMDGPU::INTERP_PAIR_ZW:
    case AMDGPU::INTERP_VEC_LOAD:
    case AMDGPU::DOT_4:
      return 4;
    case AMDGPU::KILL:
      return 0;
    default:
      break;
    }

    // These will be expanded to two ALU instructions in the
    // ExpandSpecialInstructions pass.
    if (TII->isLDSRetInstr(MI.getOpcode()))
      return 2;

    if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()) ||
        TII->isReductionOp(MI.getOpcode()))
      return 4;

    unsigned NumLiteral = 0;
    for (MachineInstr::mop_iterator It = MI.operands_begin(),
                                    E = MI.operands_end();
         It != E; ++It) {
      MachineOperand &MO = *It;
      if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
        ++NumLiteral;
    }
    return 1 + NumLiteral;
  }

  bool isALU(const MachineInstr &MI) const {
    if (TII->isALUInstr(MI.getOpcode()))
      return true;
    if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()))
      return true;
    switch (MI.getOpcode()) {
    case AMDGPU::PRED_X:
    case AMDGPU::INTERP_PAIR_XY:
    case AMDGPU::INTERP_PAIR_ZW:
    case AMDGPU::INTERP_VEC_LOAD:
    case AMDGPU::COPY:
    case AMDGPU::DOT_4:
      return true;
    default:
      return false;
    }
  }

  bool IsTrivialInst(MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
    case AMDGPU::IMPLICIT_DEF:
      return true;
    default:
      return false;
    }
  }

  std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
    // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
    // (See also R600ISelLowering.cpp)
    // ConstIndex value is in [0, 4095].
    return std::pair<unsigned, unsigned>(
        ((Sel >> 2) - 512) >> 12, // KC_BANK
        // Line number of ConstIndex.
        // A line contains 16 constant registers; however, a KCX bank can lock
        // two lines at the same time, so we want an even line number.
        // The line number could be obtained with (>> 4), but using
        // (>> 5) << 1 yields an even number directly.
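        // Illustrative example (hypothetical values, for this comment only):
        // with kc_bank = 1 and ConstIndex = 40, Sel is
        // (512 + (1 << 12) + 40) << 2 = 18592. The expression above yields
        // bank 1, and the expression below yields line 2, i.e. the even start
        // of the locked line pair (2, 3) holding constants 32..63.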
        ((((Sel >> 2) - 512) & 4095) >> 5) << 1);
  }

  bool
  SubstituteKCacheBank(MachineInstr &MI,
                       std::vector<std::pair<unsigned, unsigned>> &CachedConsts,
                       bool UpdateInstr = true) const {
    std::vector<std::pair<unsigned, unsigned>> UsedKCache;

    if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
      return true;

    const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts =
        TII->getSrcs(MI);
    assert(
        (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) &&
        "Can't assign Const");
    for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
        continue;
      unsigned Sel = Consts[i].second;
      unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
      unsigned KCacheIndex = Index * 4 + Chan;
      const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel);
      if (CachedConsts.empty()) {
        CachedConsts.push_back(BankLine);
        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
        continue;
      }
      if (CachedConsts[0] == BankLine) {
        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
        continue;
      }
      if (CachedConsts.size() == 1) {
        CachedConsts.push_back(BankLine);
        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
        continue;
      }
      if (CachedConsts[1] == BankLine) {
        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
        continue;
      }
      return false;
    }

    if (!UpdateInstr)
      return true;

    for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
        continue;
      switch (UsedKCache[j].first) {
      case 0:
        Consts[i].first->setReg(
            AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
        break;
      case 1:
        Consts[i].first->setReg(
            AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
        break;
      default:
        llvm_unreachable("Wrong Cache Line");
      }
      j++;
    }
    return true;
  }

  bool canClauseLocalKillFitInClause(
      unsigned AluInstCount,
      std::vector<std::pair<unsigned, unsigned>> KCacheBanks,
      MachineBasicBlock::iterator Def,
      MachineBasicBlock::iterator BBEnd) {
    const R600RegisterInfo &TRI = TII->getRegisterInfo();
    for (MachineInstr::const_mop_iterator MOI = Def->operands_begin(),
                                          MOE = Def->operands_end();
         MOI != MOE; ++MOI) {
      if (!MOI->isReg() || !MOI->isDef() ||
          TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
        continue;

      // Def defines a clause local register, so check that its use will fit
      // in the clause.
      unsigned LastUseCount = 0;
      for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
        AluInstCount += OccupiedDwords(*UseI);
        // Make sure we won't need to end the clause due to KCache limitations.
        if (!SubstituteKCacheBank(*UseI, KCacheBanks, false))
          return false;

        // We have reached the maximum instruction limit before finding the
        // use that kills this register, so we cannot use this def in the
        // current clause.
        if (AluInstCount >= TII->getMaxAlusPerClause())
          return false;

        // Register kill flags have been cleared by the time we get to this
        // pass, but it is safe to assume that all uses of this register
        // occur in the same basic block as its definition, because
        // it is illegal for the scheduler to schedule them in
        // different blocks.
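        // Record the clause position of the last instruction that reads this
        // def; a later redefinition of the same register ends the live range
        // we need to fit, so we can stop scanning there.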
        if (UseI->findRegisterUseOperandIdx(MOI->getReg()) != -1)
          LastUseCount = AluInstCount;

        if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
          break;
      }
      if (LastUseCount)
        return LastUseCount <= TII->getMaxAlusPerClause();
      llvm_unreachable("Clause local register live at end of clause.");
    }
    return true;
  }

  MachineBasicBlock::iterator
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<std::pair<unsigned, unsigned>> KCacheBanks;
    bool PushBeforeModifier = false;
    unsigned AluInstCount = 0;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(*I))
        continue;
      if (!isALU(*I))
        break;
      if (AluInstCount > TII->getMaxAlusPerClause())
        break;
      if (I->getOpcode() == AMDGPU::PRED_X) {
        // We put PRED_X in its own clause to ensure that ifcvt won't create
        // clauses with more than 128 insts.
        // IfCvt checks that the "then" and "else" branches of an if statement
        // each have fewer than ~60 insts, so converted clauses can't be
        // bigger than ~121 insts (the predicate setter needs to be in the
        // same clause as the predicated ALUs).
        if (AluInstCount > 0)
          break;
        if (TII->getFlagOp(*I).getImm() & MO_FLAG_PUSH)
          PushBeforeModifier = true;
        AluInstCount++;
        continue;
      }
      // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as:
      //
      // * KILL or INTERP instructions
      // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits
      // * Uses waterfalling (i.e. INDEX_MODE = AR.X)
      //
      // XXX: These checks have not been implemented yet.
      if (TII->mustBeLastInClause(I->getOpcode())) {
        I++;
        break;
      }

      // If this instruction defines a clause local register, make sure
      // its use can fit in this clause.
      if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
        break;

      if (!SubstituteKCacheBank(*I, KCacheBanks))
        break;
      AluInstCount += OccupiedDwords(*I);
    }
    unsigned Opcode = PushBeforeModifier ?
        AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
    BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
        // We don't use the ADDR field until the R600ControlFlowFinalizer pass,
        // where it is safe to assume it is 0. However, if we always put 0
        // here, the ifcvt pass may assume that identical ALU clause starters
        // at the beginning of the true and false branches can be factorized,
        // which is not the case.
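        // The KM0/KM1 values of 2 emitted below select the kcache mode that
        // locks two consecutive lines per bank, matching the even line
        // numbers computed in getAccessedBankLine().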
        .addImm(Address++)                                            // ADDR
        .addImm(KCacheBanks.empty() ? 0 : KCacheBanks[0].first)       // KB0
        .addImm((KCacheBanks.size() < 2) ? 0 : KCacheBanks[1].first)  // KB1
        .addImm(KCacheBanks.empty() ? 0 : 2)                          // KM0
        .addImm((KCacheBanks.size() < 2) ? 0 : 2)                     // KM1
        .addImm(KCacheBanks.empty() ? 0 : KCacheBanks[0].second)      // KLINE0
        .addImm((KCacheBanks.size() < 2) ? 0 : KCacheBanks[1].second) // KLINE1
        .addImm(AluInstCount)                                         // COUNT
        .addImm(1);                                                   // Enabled
    return I;
  }

public:
  static char ID;

  R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
    initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
    TII = ST.getInstrInfo();

    for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
         BB != BB_E; ++BB) {
      MachineBasicBlock &MBB = *BB;
      MachineBasicBlock::iterator I = MBB.begin();
      if (I->getOpcode() == AMDGPU::CF_ALU)
        continue; // BB was already parsed
      for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
        if (isALU(*I))
          I = MakeALUClause(MBB, I);
        else
          ++I;
      }
    }
    return false;
  }

  const char *getPassName() const override {
    return "R600 Emit Clause Markers Pass";
  }
};

char R600EmitClauseMarkers::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
                      "R600 Emit Clause Markers", false, false)
INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
                    "R600 Emit Clause Markers", false, false)

llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
  return new R600EmitClauseMarkers();
}