1 //===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold 12 /// 128 Alu instructions ; these instructions can access up to 4 prefetched 13 /// 4 lines of 16 registers from constant buffers. Such ALU clauses are 14 /// initiated by CF_ALU instructions. 15 //===----------------------------------------------------------------------===// 16 17 #include "AMDGPU.h" 18 #include "R600Defines.h" 19 #include "R600InstrInfo.h" 20 #include "R600MachineFunctionInfo.h" 21 #include "R600RegisterInfo.h" 22 #include "AMDGPUSubtarget.h" 23 #include "llvm/CodeGen/MachineFunctionPass.h" 24 #include "llvm/CodeGen/MachineInstrBuilder.h" 25 #include "llvm/CodeGen/MachineRegisterInfo.h" 26 27 using namespace llvm; 28 29 namespace llvm { 30 void initializeR600EmitClauseMarkersPass(PassRegistry&); 31 } 32 33 namespace { 34 35 class R600EmitClauseMarkers : public MachineFunctionPass { 36 37 private: 38 const R600InstrInfo *TII; 39 int Address; 40 41 unsigned OccupiedDwords(MachineInstr *MI) const { 42 switch (MI->getOpcode()) { 43 case AMDGPU::INTERP_PAIR_XY: 44 case AMDGPU::INTERP_PAIR_ZW: 45 case AMDGPU::INTERP_VEC_LOAD: 46 case AMDGPU::DOT_4: 47 return 4; 48 case AMDGPU::KILL: 49 return 0; 50 default: 51 break; 52 } 53 54 // These will be expanded to two ALU instructions in the 55 // ExpandSpecialInstructions pass. 56 if (TII->isLDSRetInstr(MI->getOpcode())) 57 return 2; 58 59 if(TII->isVector(*MI) || 60 TII->isCubeOp(MI->getOpcode()) || 61 TII->isReductionOp(MI->getOpcode())) 62 return 4; 63 64 unsigned NumLiteral = 0; 65 for (MachineInstr::mop_iterator It = MI->operands_begin(), 66 E = MI->operands_end(); It != E; ++It) { 67 MachineOperand &MO = *It; 68 if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) 69 ++NumLiteral; 70 } 71 return 1 + NumLiteral; 72 } 73 74 bool isALU(const MachineInstr *MI) const { 75 if (TII->isALUInstr(MI->getOpcode())) 76 return true; 77 if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) 78 return true; 79 switch (MI->getOpcode()) { 80 case AMDGPU::PRED_X: 81 case AMDGPU::INTERP_PAIR_XY: 82 case AMDGPU::INTERP_PAIR_ZW: 83 case AMDGPU::INTERP_VEC_LOAD: 84 case AMDGPU::COPY: 85 case AMDGPU::DOT_4: 86 return true; 87 default: 88 return false; 89 } 90 } 91 92 bool IsTrivialInst(MachineInstr *MI) const { 93 switch (MI->getOpcode()) { 94 case AMDGPU::KILL: 95 case AMDGPU::RETURN: 96 case AMDGPU::IMPLICIT_DEF: 97 return true; 98 default: 99 return false; 100 } 101 } 102 103 std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const { 104 // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 105 // (See also R600ISelLowering.cpp) 106 // ConstIndex value is in [0, 4095]; 107 return std::pair<unsigned, unsigned>( 108 ((Sel >> 2) - 512) >> 12, // KC_BANK 109 // Line Number of ConstIndex 110 // A line contains 16 constant registers however KCX bank can lock 111 // two line at the same time ; thus we want to get an even line number. 112 // Line number can be retrieved with (>>4), using (>>5) <<1 generates 113 // an even number. 114 ((((Sel >> 2) - 512) & 4095) >> 5) << 1); 115 } 116 117 bool SubstituteKCacheBank(MachineInstr *MI, 118 std::vector<std::pair<unsigned, unsigned> > &CachedConsts, 119 bool UpdateInstr = true) const { 120 std::vector<std::pair<unsigned, unsigned> > UsedKCache; 121 122 if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) 123 return true; 124 125 const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts = 126 TII->getSrcs(MI); 127 assert((TII->isALUInstr(MI->getOpcode()) || 128 MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); 129 for (unsigned i = 0, n = Consts.size(); i < n; ++i) { 130 if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) 131 continue; 132 unsigned Sel = Consts[i].second; 133 unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; 134 unsigned KCacheIndex = Index * 4 + Chan; 135 const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel); 136 if (CachedConsts.empty()) { 137 CachedConsts.push_back(BankLine); 138 UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); 139 continue; 140 } 141 if (CachedConsts[0] == BankLine) { 142 UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); 143 continue; 144 } 145 if (CachedConsts.size() == 1) { 146 CachedConsts.push_back(BankLine); 147 UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); 148 continue; 149 } 150 if (CachedConsts[1] == BankLine) { 151 UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); 152 continue; 153 } 154 return false; 155 } 156 157 if (!UpdateInstr) 158 return true; 159 160 for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { 161 if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) 162 continue; 163 switch(UsedKCache[j].first) { 164 case 0: 165 Consts[i].first->setReg( 166 AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); 167 break; 168 case 1: 169 Consts[i].first->setReg( 170 AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); 171 break; 172 default: 173 llvm_unreachable("Wrong Cache Line"); 174 } 175 j++; 176 } 177 return true; 178 } 179 180 bool canClauseLocalKillFitInClause( 181 unsigned AluInstCount, 182 std::vector<std::pair<unsigned, unsigned> > KCacheBanks, 183 MachineBasicBlock::iterator Def, 184 MachineBasicBlock::iterator BBEnd) { 185 const R600RegisterInfo &TRI = TII->getRegisterInfo(); 186 for (MachineInstr::const_mop_iterator 187 MOI = Def->operands_begin(), 188 MOE = Def->operands_end(); MOI != MOE; ++MOI) { 189 if (!MOI->isReg() || !MOI->isDef() || 190 TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) 191 continue; 192 193 // Def defines a clause local register, so check that its use will fit 194 // in the clause. 195 unsigned LastUseCount = 0; 196 for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { 197 AluInstCount += OccupiedDwords(UseI); 198 // Make sure we won't need to end the clause due to KCache limitations. 199 if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) 200 return false; 201 202 // We have reached the maximum instruction limit before finding the 203 // use that kills this register, so we cannot use this def in the 204 // current clause. 205 if (AluInstCount >= TII->getMaxAlusPerClause()) 206 return false; 207 208 // Register kill flags have been cleared by the time we get to this 209 // pass, but it is safe to assume that all uses of this register 210 // occur in the same basic block as its definition, because 211 // it is illegal for the scheduler to schedule them in 212 // different blocks. 213 if (UseI->findRegisterUseOperandIdx(MOI->getReg())) 214 LastUseCount = AluInstCount; 215 216 if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) 217 break; 218 } 219 if (LastUseCount) 220 return LastUseCount <= TII->getMaxAlusPerClause(); 221 llvm_unreachable("Clause local register live at end of clause."); 222 } 223 return true; 224 } 225 226 MachineBasicBlock::iterator 227 MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { 228 MachineBasicBlock::iterator ClauseHead = I; 229 std::vector<std::pair<unsigned, unsigned> > KCacheBanks; 230 bool PushBeforeModifier = false; 231 unsigned AluInstCount = 0; 232 for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { 233 if (IsTrivialInst(I)) 234 continue; 235 if (!isALU(I)) 236 break; 237 if (AluInstCount > TII->getMaxAlusPerClause()) 238 break; 239 if (I->getOpcode() == AMDGPU::PRED_X) { 240 // We put PRED_X in its own clause to ensure that ifcvt won't create 241 // clauses with more than 128 insts. 242 // IfCvt is indeed checking that "then" and "else" branches of an if 243 // statement have less than ~60 insts thus converted clauses can't be 244 // bigger than ~121 insts (predicate setter needs to be in the same 245 // clause as predicated alus). 246 if (AluInstCount > 0) 247 break; 248 if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) 249 PushBeforeModifier = true; 250 AluInstCount ++; 251 continue; 252 } 253 // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: 254 // 255 // * KILL or INTERP instructions 256 // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits 257 // * Uses waterfalling (i.e. INDEX_MODE = AR.X) 258 // 259 // XXX: These checks have not been implemented yet. 260 if (TII->mustBeLastInClause(I->getOpcode())) { 261 I++; 262 break; 263 } 264 265 // If this instruction defines a clause local register, make sure 266 // its use can fit in this clause. 267 if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) 268 break; 269 270 if (!SubstituteKCacheBank(I, KCacheBanks)) 271 break; 272 AluInstCount += OccupiedDwords(I); 273 } 274 unsigned Opcode = PushBeforeModifier ? 275 AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; 276 BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) 277 // We don't use the ADDR field until R600ControlFlowFinalizer pass, where 278 // it is safe to assume it is 0. However if we always put 0 here, the ifcvt 279 // pass may assume that identical ALU clause starter at the beginning of a 280 // true and false branch can be factorized which is not the case. 281 .addImm(Address++) // ADDR 282 .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 283 .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 284 .addImm(KCacheBanks.empty()?0:2) // KM0 285 .addImm((KCacheBanks.size() < 2)?0:2) // KM1 286 .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 287 .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 288 .addImm(AluInstCount) // COUNT 289 .addImm(1); // Enabled 290 return I; 291 } 292 293 public: 294 static char ID; 295 R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { 296 297 initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); 298 } 299 300 bool runOnMachineFunction(MachineFunction &MF) override { 301 TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); 302 303 for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); 304 BB != BB_E; ++BB) { 305 MachineBasicBlock &MBB = *BB; 306 MachineBasicBlock::iterator I = MBB.begin(); 307 if (I->getOpcode() == AMDGPU::CF_ALU) 308 continue; // BB was already parsed 309 for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { 310 if (isALU(I)) 311 I = MakeALUClause(MBB, I); 312 else 313 ++I; 314 } 315 } 316 return false; 317 } 318 319 const char *getPassName() const override { 320 return "R600 Emit Clause Markers Pass"; 321 } 322 }; 323 324 char R600EmitClauseMarkers::ID = 0; 325 326 } // end anonymous namespace 327 328 INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", 329 "R600 Emit Clause Markters", false, false) 330 INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", 331 "R600 Emit Clause Markters", false, false) 332 333 llvm::FunctionPass *llvm::createR600EmitClauseMarkers() { 334 return new R600EmitClauseMarkers(); 335 } 336 337