//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"
#include "llvm/ADT/SmallString.h"

using namespace llvm;

SITargetLowering::SITargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties();

  // Condition Codes
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);

  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::ADD, MVT::i32, Legal);
  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);

  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);

  // We need to custom lower loads/stores from private memory
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);

  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::SELECT, MVT::f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);

  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);

  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);

  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);

  setOperationAction(ISD::LOAD, MVT::i1, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  // These should use UDIVREM, so set them to expand
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  MVT VecTypes[] = {
    MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
  };

  for (MVT VT : VecTypes) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch(Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::CONCAT_VECTORS:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) {
    MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
  }

  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  }

  // FIXME: These should be removed and handled the same way as f32 fneg.
  // Source modifiers also work for the double instructions.
  setOperationAction(ISD::FNEG, MVT::f64, Expand);
  setOperationAction(ISD::FABS, MVT::f64, Expand);

  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::SETCC);

  setTargetDAGCombine(ISD::UINT_TO_FP);

  setSchedulingPreference(Sched::RegPressure);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                     unsigned AddrSpace,
                                                     bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // XXX: This depends on the address space and also we may want to revisit
  // the alignment values we specify in the DataLayout.

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  if (!VT.isSimple() || VT == MVT::Other)
    return false;

  // XXX - CI changes say "Support for unaligned memory accesses" but I don't
  // see what for specifically. The wording everywhere else seems to be the
  // same.

  // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have
  // no alignment restrictions.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    // Using any pair of GPRs should be the same as any other pair.
    if (IsFast)
      *IsFast = true;
    return VT.bitsGE(MVT::i64);
  }

  // XXX - The only mention I see of this in the ISA manual is for LDS direct
  // reads the "byte address and must be dword aligned". Is it also true for the
  // normal loads and stores?
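  // Be conservative for now and report unaligned local-memory accesses as
  // unsupported.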
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  if (IsFast)
    *IsFast = true;
  return VT.bitsGT(MVT::i32);
}

TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  return TII->isInlineConstant(Imm);
}

SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         SDLoc DL, SDValue Chain,
                                         unsigned Offset, bool Signed) const {
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                        AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
                          MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                            DAG.getConstant(Offset, MVT::i64));
  return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
                        false, false, MemVT.getSizeInBits() >> 3);

}

SDValue SITargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {

  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  assert(CallConv == CallingConv::C);

  SmallVector<ISD::InputArg, 16> Splits;
  uint32_t Skipped = 0;

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal()) {

      assert((PSInputNum <= 15) && "Too many PS inputs!");

      if (!Arg.Used) {
        // We can safely skip PS inputs
        Skipped |= 1 << i;
        ++PSInputNum;
        continue;
      }

      Info->PSInputAddr |= 1 << PSInputNum++;
    }

    // Second, split vertices into their elements
    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
      ISD::InputArg NewArg = Arg;
      NewArg.Flags.setSplit();
      NewArg.VT = Arg.VT.getVectorElementType();

      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
      // three or five element vertex only needs three or five registers,
      // NOT four or eight.
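      // (Arg.VT may already have been widened, so query the original IR
      // parameter type for the exact element count instead.)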
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.VT.getStoreSize();
      }

    } else if (Info->ShaderType != ShaderType::COMPUTE) {
      Splits.push_back(Arg);
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
    Info->PSInputAddr |= 1;
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
  }

  // The pointer to the list of arguments is stored in SGPR0, SGPR1
  if (Info->ShaderType == ShaderType::COMPUTE) {
    CCInfo.AllocateReg(AMDGPU::SGPR0);
    CCInfo.AllocateReg(AMDGPU::SGPR1);
    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
  }

  if (Info->ShaderType == ShaderType::COMPUTE) {
    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                            Splits);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped & (1 << i)) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    EVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = Splits[i].VT;
      // The first 36 bytes of the input buffer contain information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
                                   36 + VA.getLocMemOffset(),
                                   Ins[i].Flags.isSExt());
      InVals.push_back(Arg);
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);
        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      for (unsigned j = 0; j != NumElements; ++j)
        Regs.push_back(DAG.getUNDEF(VT));

      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
      continue;
    }

    InVals.push_back(Val);
  }
  return Chain;
}

MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {

  MachineBasicBlock::iterator I = *MI;
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH: return BB;
  case AMDGPU::SI_ADDR64_RSRC: {
    unsigned SuperReg = MI->getOperand(0).getReg();
    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
            .addOperand(MI->getOperand(1));
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
            .addImm(0);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
            .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
            .addReg(SubRegHiLo)
            .addImm(AMDGPU::sub0)
            .addReg(SubRegHiHi)
            .addImm(AMDGPU::sub1);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
            .addReg(SubRegLo)
            .addImm(AMDGPU::sub0_sub1)
            .addReg(SubRegHi)
            .addImm(AMDGPU::sub2_sub3);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::V_SUB_F64: {
    unsigned DestReg = MI->getOperand(0).getReg();
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
      .addImm(0)  // SRC0 modifiers
      .addReg(MI->getOperand(1).getReg())
      .addImm(1)  // SRC1 modifiers
      .addReg(MI->getOperand(2).getReg())
      .addImm(0)  // SRC2 modifiers
      .addImm(0)  // src2
      .addImm(0)  // CLAMP
      .addImm(0); // OMOD
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SI_RegisterStorePseudo: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstrBuilder MIB =
        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
                Reg);
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
      MIB.addOperand(MI->getOperand(i));

    MI->eraseFromParent();
    break;
  }
  case AMDGPU::FABS_SI: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
            Reg)
            .addImm(0x7fffffff);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32),
            MI->getOperand(0).getReg())
            .addReg(MI->getOperand(1).getReg())
            .addReg(Reg);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::FNEG_SI: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
            Reg)
            .addImm(0x80000000);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32),
            MI->getOperand(0).getReg())
            .addReg(MI->getOperand(1).getReg())
            .addReg(Reg);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::FCLAMP_SI: {
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64),
            MI->getOperand(0).getReg())
            .addImm(0) // SRC0 modifiers
            .addOperand(MI->getOperand(1))
            .addImm(0) // SRC1 modifiers
            .addImm(0) // SRC1
            .addImm(1) // CLAMP
            .addImm(0); // OMOD
    MI->eraseFromParent();
  }
  }
  return BB;
}

EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
  return MVT::i32;
}

bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
    return false; /* There is V_MAD_F32 for f32 */
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
    EVT VT = Op.getValueType();

    // These loads are legal.
    if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        VT.isVector() && VT.getVectorNumElements() == 2 &&
        VT.getVectorElementType() == MVT::i32)
      return SDValue();

    if (Op.getValueType().isVector() &&
        (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
         Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
         (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
          Op.getValueType().getVectorNumElements() > 4))) {
      return SplitVectorLoad(Op, DAG);
    } else {
      SDValue Result = LowerLOAD(Op, DAG);
      assert((!Result.getNode() ||
              Result.getNode()->getNumValues() == 2) &&
             "Load should return a value and a chain");
      return Result;
    }
  }

  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    // XXX: Hardcoded: we only use two user SGPRs, which hold the pointer to
    // the parameters.
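    // The tgid_* SGPRs handed out below start right after these user SGPRs.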
    unsigned NumUserSGPRs = 2;
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case Intrinsic::r600_read_ngroups_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false);
    case Intrinsic::r600_read_ngroups_y:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false);
    case Intrinsic::r600_read_ngroups_z:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false);
    case Intrinsic::r600_read_global_size_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false);
    case Intrinsic::r600_read_global_size_y:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false);
    case Intrinsic::r600_read_global_size_z:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false);
    case Intrinsic::r600_read_local_size_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false);
    case Intrinsic::r600_read_local_size_y:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false);
    case Intrinsic::r600_read_local_size_z:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR0, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR1, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR2, VT);
    case AMDGPUIntrinsic::SI_load_const: {
      SDValue Ops [] = {
        Op.getOperand(1),
        Op.getOperand(2)
      };

      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(),
          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
          VT.getSizeInBits() / 8, 4);
      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                     Op->getVTList(), Ops, VT, MMO);
    }
    case AMDGPUIntrinsic::SI_sample:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
    case AMDGPUIntrinsic::SI_sampleb:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
    case AMDGPUIntrinsic::SI_sampled:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
    case AMDGPUIntrinsic::SI_samplel:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
    case AMDGPUIntrinsic::SI_vs_load_input:
      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));
    }
  }

  case ISD::INTRINSIC_VOID:
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

    switch (IntrinsicID) {
      case AMDGPUIntrinsic::SI_tbuffer_store: {
        SDLoc DL(Op);
        SDValue Ops [] = {
          Chain,
          Op.getOperand(2),
          Op.getOperand(3),
          Op.getOperand(4),
          Op.getOperand(5),
          Op.getOperand(6),
          Op.getOperand(7),
          Op.getOperand(8),
          Op.getOperand(9),
          Op.getOperand(10),
          Op.getOperand(11),
          Op.getOperand(12),
          Op.getOperand(13),
          Op.getOperand(14)
        };
        EVT VT = Op.getOperand(3).getValueType();

        MachineMemOperand *MMO = MF.getMachineMemOperand(
            MachinePointerInfo(),
            MachineMemOperand::MOStore,
            VT.getSizeInBits() / 8, 4);
        return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
                                       Op->getVTList(), Ops, VT, MMO);
      }
      default:
        break;
    }
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}

/// This transforms the control flow intrinsics so that they get the branch
/// destination as their last parameter, and also switches the branch target
/// with BR if the need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {

  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SDNode *SetCC = Intr;
    assert(SetCC->getConstantOperandVal(1) == 1);
    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
           ISD::SETNE);
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);

  // Build the result and
  SmallVector<EVT, 4> Res;
  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
    Res.push_back(Intr->getValueType(i));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(BRCOND.getOperand(0));
  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
    Ops.push_back(Intr->getOperand(i));
  Ops.push_back(Target);

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ?
    ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res), Ops).getNode();

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops);
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}

SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
  if (Lowered.getNode())
    return Lowered;

  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  EVT MemVT = Load->getMemoryVT();

  assert(!MemVT.isVector() && "Private loads should be scalarized");
  assert(!MemVT.isFloatingPoint() && "FP loads should be promoted to int");

  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                            DAG.getConstant(2, MVT::i32));

  // FIXME: REGISTER_LOAD should probably have a chain result.
  SDValue Chain = Load->getChain();
  SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                               Chain, Ptr,
                               DAG.getTargetConstant(0, MVT::i32),
                               Op.getOperand(2));

  SDValue Ret = LoLoad.getValue(0);
  if (MemVT.getSizeInBits() == 64) {
    // TODO: This needs a test to make sure the right thing is happening with
    // the chain. That is hard without general function support.
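    // The high dword is read from the next register slot (Ptr + 1) and the
    // two halves are recombined with BUILD_PAIR below.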

    SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                                 DAG.getConstant(1, MVT::i32));

    SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                                 Chain, IncPtr,
                                 DAG.getTargetConstant(0, MVT::i32),
                                 Op.getOperand(2));

    Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad);
    // Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
    //                     LoLoad.getValue(1), HiLoad.getValue(1));
  }

  SDValue Ops[] = {
    Ret,
    Chain
  };

  return DAG.getMergeValues(Ops, DL);
}

SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
                                               const SDValue &Op,
                                               SelectionDAG &DAG) const {
  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
                     Op.getOperand(2),
                     Op.getOperand(3),
                     Op.getOperand(4));
}

SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i64)
    return SDValue();

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Zero = DAG.getConstant(0, MVT::i32);
  SDValue One = DAG.getConstant(1, MVT::i32);

  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);

  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);

  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);

  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
}

SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  // These stores are legal.
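  // As with the matching v2i32 LOAD case in LowerOperation, v2i32 stores to
  // local memory need no custom handling.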
  if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
      VT.isVector() && VT.getVectorNumElements() == 2 &&
      VT.getVectorElementType() == MVT::i32)
    return SDValue();

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode())
    return Ret;

  if (VT.isVector() && VT.getVectorNumElements() >= 8)
    return SplitVectorStore(Op, DAG);

  if (VT == MVT::i1)
    return DAG.getTruncStore(Store->getChain(), DL,
                        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
                        Store->getBasePtr(), MVT::i1, Store->getMemOperand());

  if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
    return SDValue();

  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Store->getBasePtr(),
                            DAG.getConstant(2, MVT::i32));
  SDValue Chain = Store->getChain();
  SmallVector<SDValue, 8> Values;

  if (Store->isTruncatingStore()) {
    unsigned Mask = 0;
    if (Store->getMemoryVT() == MVT::i8) {
      Mask = 0xff;
    } else if (Store->getMemoryVT() == MVT::i16) {
      Mask = 0xffff;
    }
    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                              Chain, Store->getBasePtr(),
                              DAG.getConstant(0, MVT::i32));
    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(),
                                  DAG.getConstant(0x3, MVT::i32));
    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                   DAG.getConstant(3, MVT::i32));
    SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(),
                                      DAG.getConstant(Mask, MVT::i32));
    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                       MaskedValue, ShiftAmt);
    SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32,
                                  DAG.getConstant(32, MVT::i32), ShiftAmt);
    SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32,
                                  DAG.getConstant(Mask, MVT::i32),
                                  RotrAmt);
    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
    Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);

    Values.push_back(Dst);
  } else if (VT == MVT::i64) {
    for (unsigned i = 0; i < 2; ++i) {
      Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
                       Store->getValue(), DAG.getConstant(i, MVT::i32)));
    }
  } else if (VT == MVT::i128) {
    for (unsigned i = 0; i < 2; ++i) {
      for (unsigned j = 0; j < 2; ++j) {
        Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
                           DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
                           Store->getValue(), DAG.getConstant(i, MVT::i32)),
                           DAG.getConstant(j, MVT::i32)));
      }
    }
  } else {
    Values.push_back(Store->getValue());
  }

  for (unsigned i = 0; i < Values.size(); ++i) {
    SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
                                  Ptr, DAG.getConstant(i, MVT::i32));
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                        Chain, Values[i], PartPtr,
                        DAG.getTargetConstant(0, MVT::i32));
  }
  return Chain;
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
      DCI.AddToWorklist(Cvt.getNode());
      return Cvt;
    }
  }

  // We are primarily trying to catch operations on illegal vector types
  // before they are expanded.
  // For scalars, we can use the more flexible method of checking masked bits
  // after legalization.
  if (!DCI.isBeforeLegalize() ||
      !SrcVT.isVector() ||
      SrcVT.getVectorElementType() != MVT::i8) {
    return SDValue();
  }

  assert(DCI.isBeforeLegalize() && "Unexpected legal type");

  // Weird sized vectors are a pain to handle, but we know 3 is really the same
  // size as 4.
  unsigned NElts = SrcVT.getVectorNumElements();
  if (!SrcVT.isSimple() && NElts != 3)
    return SDValue();

  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
  // prevent a mess from expanding to v4i32 and repacking.
  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);

    LoadSDNode *Load = cast<LoadSDNode>(Src);
    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
                                     Load->getChain(),
                                     Load->getBasePtr(),
                                     LoadVT,
                                     Load->getMemOperand());

    // Make sure successors of the original load stay after it by updating
    // them to use the new Chain.
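    // Only the chain result is rewired here; the data result is replaced by
    // the BUILD_VECTOR of converted floats returned below.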
    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));

    SmallVector<SDValue, 4> Elts;
    if (RegVT.isVector())
      DAG.ExtractVectorElements(NewLoad, Elts);
    else
      Elts.push_back(NewLoad);

    SmallVector<SDValue, 4> Ops;

    unsigned EltIdx = 0;
    for (SDValue Elt : Elts) {
      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
      for (unsigned I = 0; I < ComponentsInElt; ++I) {
        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
        DCI.AddToWorklist(Cvt.getNode());
        Ops.push_back(Cvt);
      }

      ++EltIdx;
    }

    assert(Ops.size() == NElts);

    return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
  }

  return SDValue();
}

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  switch (N->getOpcode()) {
  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::SELECT_CC: {
    ConstantSDNode *True, *False;
    // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
    if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
        && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
        && True->isAllOnesValue()
        && False->isNullValue()
        && VT == MVT::i1) {
      return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
                         N->getOperand(1), N->getOperand(4));

    }
    break;
  }
  case ISD::SETCC: {
    SDValue Arg0 = N->getOperand(0);
    SDValue Arg1 = N->getOperand(1);
    SDValue CC = N->getOperand(2);
    ConstantSDNode * C = nullptr;
    ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();

    // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
    if (VT == MVT::i1
        && Arg0.getOpcode() == ISD::SIGN_EXTEND
        && Arg0.getOperand(0).getValueType() == MVT::i1
        && (C = dyn_cast<ConstantSDNode>(Arg1))
        && C->isNullValue()
        && CCOp == ISD::SETNE) {
      return SimplifySetCC(VT, Arg0.getOperand(0),
                           DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
    }
    break;
  }

  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3: {
    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

    SDValue Src = N->getOperand(0);
    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }

    break;
  }

  case ISD::UINT_TO_FP: {
    return performUCharToFloatCombine(N, DCI);
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

/// \brief Test if RegClass is one of the VSrc classes
static bool isVSrc(unsigned RegClass) {
  return AMDGPU::VSrc_32RegClassID == RegClass ||
         AMDGPU::VSrc_64RegClassID == RegClass;
}

/// \brief Test if RegClass is one of the SSrc classes
static bool isSSrc(unsigned RegClass) {
  return AMDGPU::SSrc_32RegClassID == RegClass ||
         AMDGPU::SSrc_64RegClassID == RegClass;
}
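
// Note: the VSrc classes accept vector registers, scalar registers and
// immediates, while the SSrc classes accept only scalar registers and
// immediates; the operand folding below relies on that distinction when
// deciding what may occupy the single scalar slot.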

/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {

  union {
    int32_t I;
    float F;
  } Imm;

  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    if (Node->getZExtValue() >> 32) {
      return -1;
    }
    Imm.I = Node->getSExtValue();
  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
    if (N->getValueType(0) != MVT::f32)
      return -1;
    Imm.F = Node->getValueAPF().convertToFloat();
  } else
    return -1; // It isn't an immediate

  if ((Imm.I >= -16 && Imm.I <= 64) ||
      Imm.F == 0.5f || Imm.F == -0.5f ||
      Imm.F == 1.0f || Imm.F == -1.0f ||
      Imm.F == 2.0f || Imm.F == -2.0f ||
      Imm.F == 4.0f || Imm.F == -4.0f)
    return 0; // It's an inline immediate

  return Imm.I; // It's a literal immediate
}

/// \brief Try to fold an immediate directly into an instruction
bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
                               bool &ScalarSlotUsed) const {

  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
    return false;

  const SDValue &Op = Mov->getOperand(0);
  int32_t Value = analyzeImmediate(Op.getNode());
  if (Value == -1) {
    // Not an immediate at all
    return false;

  } else if (Value == 0) {
    // Inline immediates can always be folded
    Operand = Op;
    return true;

  } else if (Value == Immediate) {
    // Literal immediate that has already been folded
    Operand = Op;
    return true;

  } else if (!ScalarSlotUsed && !Immediate) {
    // Fold this literal immediate
    ScalarSlotUsed = true;
    Immediate = Value;
    Operand = Op;
    return true;

  }

  return false;
}

const TargetRegisterClass *SITargetLowering::getRegClassForNode(
                                   SelectionDAG &DAG, const SDValue &Op) const {
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  if (!Op->isMachineOpcode()) {
    switch(Op->getOpcode()) {
    case ISD::CopyFromReg: {
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        return MRI.getRegClass(Reg);
      }
      return TRI.getPhysRegClass(Reg);
    }
    default: return nullptr;
    }
  }
  const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
  int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
  if (OpClassID != -1) {
    return TRI.getRegClass(OpClassID);
  }
  switch(Op.getMachineOpcode()) {
  case AMDGPU::COPY_TO_REGCLASS:
    // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
    OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();

    // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
    // class, then the register class for the value could be either a
    // VReg or an SReg. In order to get a more accurate answer, look through
    // the copy and classify the value being copied.
    if (OpClassID == AMDGPU::VSrc_32RegClassID ||
        OpClassID == AMDGPU::VSrc_64RegClassID) {
      return getRegClassForNode(DAG, Op.getOperand(0));
    }
    return TRI.getRegClass(OpClassID);
  case AMDGPU::EXTRACT_SUBREG: {
    int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    const TargetRegisterClass *SuperClass =
      getRegClassForNode(DAG, Op.getOperand(0));
    return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
  }
  case AMDGPU::REG_SEQUENCE:
    // Operand 0 is the register class id for REG_SEQUENCE instructions.
    return TRI.getRegClass(
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
  default:
    return getRegClassFor(Op.getSimpleValueType());
  }
}

/// \brief Does "Op" fit into register class "RegClass" ?
bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                    unsigned RegClass) const {
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
  if (!RC) {
    return false;
  }
  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
}

/// \brief Make sure that we don't exceed the number of allowed scalars
void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
                                       unsigned RegClass,
                                       bool &ScalarSlotUsed) const {

  // First map the operand's register class to a destination class
  if (RegClass == AMDGPU::VSrc_32RegClassID)
    RegClass = AMDGPU::VReg_32RegClassID;
  else if (RegClass == AMDGPU::VSrc_64RegClassID)
    RegClass = AMDGPU::VReg_64RegClassID;
  else
    return;

  // Nothing to do if they fit naturally
  if (fitsRegClass(DAG, Operand, RegClass))
    return;

  // If the scalar slot isn't used yet, use it now
  if (!ScalarSlotUsed) {
    ScalarSlotUsed = true;
    return;
  }

  // This is a conservative approach. It is possible that we can't determine
  // the correct register class and copy too often, but better safe than sorry.
  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
                                    Operand.getValueType(), Operand, RC);
  Operand = SDValue(Node, 0);
}

/// \returns true if \p Node's operands are different from the SDValue list
/// \p Ops
static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
  for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
    if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
      return true;
    }
  }
  return false;
}

/// \brief Try to fold the Node's operands into the Node
SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
                                       SelectionDAG &DAG) const {

  // Original encoding (either e32 or e64)
  int Opcode = Node->getMachineOpcode();
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const MCInstrDesc *Desc = &TII->get(Opcode);

  unsigned NumDefs = Desc->getNumDefs();
  unsigned NumOps = Desc->getNumOperands();

  // Commuted opcode if available
  int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
  const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);

  assert(!DescRev || DescRev->getNumDefs() == NumDefs);
  assert(!DescRev || DescRev->getNumOperands() == NumOps);

  // e64 version if available, -1 otherwise
  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64);
  int InputModifiers[3] = {0};

  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);

  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
  bool HaveVSrc = false, HaveSSrc = false;

  // First figure out what we already have in this instruction.
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass))
      HaveVSrc = true;
    else if (isSSrc(RegClass))
      HaveSSrc = true;
    else
      continue;

    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
    if (Imm != -1 && Imm != 0) {
      // Literal immediate
      Immediate = Imm;
    }
  }

  // If we neither have VSrc nor SSrc, it makes no sense to continue.
  if (!HaveVSrc && !HaveSSrc)
    return Node;

  // No scalar allowed when we have both VSrc and SSrc
  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;

  // Second go over the operands and try to fold them
  std::vector<SDValue> Ops;
  bool Promote2e64 = false;
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    const SDValue &Operand = Node->getOperand(i);
    Ops.push_back(Operand);

    // Already folded immediate?
    if (isa<ConstantSDNode>(Operand.getNode()) ||
        isa<ConstantFPSDNode>(Operand.getNode()))
      continue;

    // Is this a VSrc or SSrc operand?
    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass) || isSSrc(RegClass)) {
      // Try to fold the immediates
      if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
        // Folding didn't work, make sure we don't hit the SReg limit.
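        // (ensureSRegLimit may insert a COPY_TO_REGCLASS to force the value
        //  into a VGPR class once the scalar slot is taken.)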
        ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
      }
      continue;
    }

    if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {

      unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
      assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));

      // Test if it makes sense to swap operands
      if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[1], RegClass) &&
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {

        // Swap commutable operands
        std::swap(Ops[0], Ops[1]);

        Desc = DescRev;
        DescRev = nullptr;
        continue;
      }
    }

    if (Immediate)
      continue;

    if (DescE64) {
      // Test if it makes sense to switch to e64 encoding
      unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
      if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
        continue;

      int32_t TmpImm = -1;
      if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[i], RegClass) &&
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {

        // Switch to e64 encoding
        Immediate = -1;
        Promote2e64 = true;
        Desc = DescE64;
        DescE64 = nullptr;
      }
    }

    if (!DescE64 && !Promote2e64)
      continue;
    if (!Operand.isMachineOpcode())
      continue;
    if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) {
      Ops.pop_back();
      Ops.push_back(Operand.getOperand(0));
      InputModifiers[i] = 1;
      Promote2e64 = true;
      if (!DescE64)
        continue;
      Desc = DescE64;
      DescE64 = nullptr;
    }
    else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
      Ops.pop_back();
      Ops.push_back(Operand.getOperand(0));
      InputModifiers[i] = 2;
      Promote2e64 = true;
      if (!DescE64)
        continue;
      Desc = DescE64;
      DescE64 = nullptr;
    }
  }

  if (Promote2e64) {
    std::vector<SDValue> OldOps(Ops);
    Ops.clear();
    for (unsigned i = 0; i < OldOps.size(); ++i) {
      // src_modifier
      Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32));
      Ops.push_back(OldOps[i]);
    }
    // Add the modifier flags while promoting
    for (unsigned i = 0; i < 2; ++i)
      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
  }

  // Add optional chain and glue
  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
    Ops.push_back(Node->getOperand(i));

  // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
  // this case a brand new node is always created, even if the operands
  // are the same as before. So, manually check if anything has been changed.
  if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
    return Node;
  }

  // Create a completely new instruction
  return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
}

/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

/// \brief Adjust the writemask of MIMG instructions
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Lane = 0;
  unsigned OldDmask = Node->getConstantOperandVal(0);
  unsigned NewDmask = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      assert(Dmask);
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
    Ops.push_back(Node->getOperand(i));
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);

  // If we only got one lane, replace it with a copy
  // (if NewDmask has only one bit set...)
  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  Node = AdjustRegClass(Node, DAG);

  if (TII->isMIMG(Node->getMachineOpcode()))
    adjustWritemask(Node, DAG);

  return foldOperands(Node, DAG);
}

/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  if (!TII->isMIMG(MI->getOpcode()))
    return;

  unsigned VReg = MI->getOperand(0).getReg();
  unsigned Writemask = MI->getOperand(1).getImm();
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)
    BitsSet += Writemask & (1 << i) ? 1 : 0;

  const TargetRegisterClass *RC;
  switch (BitsSet) {
  default: return;
  case 1:  RC = &AMDGPU::VReg_32RegClass; break;
  case 2:  RC = &AMDGPU::VReg_64RegClass; break;
  case 3:  RC = &AMDGPU::VReg_96RegClass; break;
  }

  unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
  MI->setDesc(TII->get(NewOpcode));
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  MRI.setRegClass(VReg, RC);
}

MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
                                                SelectionDAG &DAG) const {

  SDLoc DL(N);
  unsigned NewOpcode = N->getMachineOpcode();

  switch (N->getMachineOpcode()) {
  default: return N;
  case AMDGPU::S_LOAD_DWORD_IMM:
    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX2_SGPR:
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
    }
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
    }
    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
      return N;
    }
    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
    SDValue Ops[] = {
      SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
                                 DAG.getConstant(0, MVT::i64)), 0),
      N->getOperand(0),
      DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
    };
    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
  }
  }
}

SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}