//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#ifdef _MSC_VER
// Provide M_PI.
#define _USE_MATH_DEFINES
#include <cmath>
#endif

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"
#include "llvm/ADT/SmallString.h"

using namespace llvm;

SITargetLowering::SITargetLowering(TargetMachine &TM,
                                   const AMDGPUSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::ADD, MVT::i32, Legal);
  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);

  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);

  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);

  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);

  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
  }

  for (MVT VT : MVT::integer_vector_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
  }

  for (MVT VT : MVT::fp_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);

  setOperationAction(ISD::LOAD, MVT::i1, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  // These should use UDIVREM, so set them to expand
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
  setOperationAction(ISD::SELECT, MVT::i1, Promote);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
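  // For example (illustrative, not taken from a regression test): a plain add
  // on MVT::v8i32 is Expanded into scalar operations by the loop below, while
  // loads and stores of the same type keep their Custom lowering and
  // CONCAT_VECTORS is marked Custom here.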
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch(Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::UINT_TO_FP);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);

  setSchedulingPreference(Sched::RegPressure);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
                                          EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

// FIXME: This really needs an address space argument. The immediate offset
// size is different for different sets of memory instruction sets.

// The single offset DS instructions have a 16-bit unsigned byte offset.
//
// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
// r + i with addr64. 32-bit has more addressing mode options. Depending on the
// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
//
// SMRD instructions have an 8-bit, dword offset.
//
bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                             Type *Ty) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // Allow a 16-bit unsigned immediate field, since this is what DS instructions
  // use.
  if (!isUInt<16>(AM.BaseOffs))
    return false;

  // Only support r+r.
  switch (AM.Scale) {
  case 0: // "r+i" or just "i", depending on HasBaseReg.
    break;
  case 1:
    if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
      return false;
    // Otherwise we have r+r or r+i.
    break;
  case 2:
    if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
      return false;
    // Allow 2*r as r+r.
    break;
  default: // Don't allow n * r
    return false;
  }

  return true;
}

bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  if (!VT.isSimple() || VT == MVT::Other)
    return false;

  // TODO - CI+ supports unaligned memory accesses, but this requires driver
  // support.

  // XXX - The only mention I see of this in the ISA manual is for LDS direct
  // reads the "byte address and must be dword aligned". Is it also true for the
  // normal loads and stores?
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    return Align % 4 == 0;
  }

  // Values smaller than a dword must be aligned.
  // FIXME: This should be allowed on CI+
  if (VT.bitsLT(MVT::i32))
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}

EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                          unsigned SrcAlign, bool IsMemset,
                                          bool ZeroMemset,
                                          bool MemcpyStrSrc,
                                          MachineFunction &MF) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type to
  // use. Make sure we switch these to 64-bit accesses.

  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
    return MVT::v4i32;

  if (Size >= 8 && DstAlign >= 4)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}

TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
  return TII->isInlineConstant(Imm);
}

SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         SDLoc SL, SDValue Chain,
                                         unsigned Offset, bool Signed) const {
  const DataLayout *DL = getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);

  Type *Ty = VT.getTypeForEVT(*DAG.getContext());

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
                                       MRI.getLiveInVirtReg(InputPtrReg), MVT::i64);
  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr,
                            DAG.getConstant(Offset, MVT::i64));
  SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  return DAG.getLoad(ISD::UNINDEXED, Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD,
                     VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
                     false, // isVolatile
                     true, // isNonTemporal
                     true, // isInvariant
                     DL->getABITypeAlignment(Ty)); // Alignment
}

SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  assert(CallConv == CallingConv::C);

  SmallVector<ISD::InputArg, 16> Splits;
  BitVector Skipped(Ins.size());

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr.
    if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal()) {

      assert((PSInputNum <= 15) && "Too many PS inputs!");

      if (!Arg.Used) {
        // We can safely skip PS inputs.
        Skipped.set(i);
        ++PSInputNum;
        continue;
      }

      Info->PSInputAddr |= 1 << PSInputNum++;
    }

    // Second, split vertices into their elements.
    if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
      ISD::InputArg NewArg = Arg;
      NewArg.Flags.setSplit();
      NewArg.VT = Arg.VT.getVectorElementType();

      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
      // three or five element vertex only needs three or five registers,
      // NOT four or eight.
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
      unsigned NumElements = ParamType->getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.VT.getStoreSize();
      }

    } else if (Info->getShaderType() != ShaderType::COMPUTE) {
      Splits.push_back(Arg);
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  if (Info->getShaderType() == ShaderType::PIXEL &&
      (Info->PSInputAddr & 0x7F) == 0) {
    Info->PSInputAddr |= 1;
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
  }

  // The pointer to the list of arguments is stored in SGPR0, SGPR1
  // The pointer to the scratch buffer is stored in SGPR2, SGPR3
  if (Info->getShaderType() == ShaderType::COMPUTE) {
    if (Subtarget->isAmdHsaOS())
      Info->NumUserSGPRs = 2;  // FIXME: Need to support scratch buffers.
    else
      Info->NumUserSGPRs = 4;

    unsigned InputPtrReg =
        TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
    unsigned InputPtrRegLo =
        TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
    unsigned InputPtrRegHi =
        TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);

    unsigned ScratchPtrReg =
        TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
    unsigned ScratchPtrRegLo =
        TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
    unsigned ScratchPtrRegHi =
        TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);

    CCInfo.AllocateReg(InputPtrRegLo);
    CCInfo.AllocateReg(InputPtrRegHi);
    CCInfo.AllocateReg(ScratchPtrRegLo);
    CCInfo.AllocateReg(ScratchPtrRegHi);
    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
    MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
  }

  if (Info->getShaderType() == ShaderType::COMPUTE) {
    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                            Splits);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped[i]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = Splits[i].VT;
      const unsigned Offset = 36 + VA.getLocMemOffset();
      // The first 36 bytes of the input buffer contain information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
                                   Offset, Ins[i].Flags.isSExt());

      const PointerType *ParamTy =
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16 bits.  On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
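        // The AssertZext below records that the value fits in 16 bits, which
        // lets later DAG combines drop redundant zero-extensions or masks of
        // this argument (explanatory note; the bound matches the MVT::i16
        // operand).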
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      Info->ABIArgOffset = Offset + MemVT.getStoreSize();
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);
        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      Regs.append(NumElements, DAG.getUNDEF(VT));

      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
      continue;
    }

    InVals.push_back(Val);
  }

  if (Info->getShaderType() != ShaderType::COMPUTE) {
    unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
        AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
    Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
  }
  return Chain;
}

MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {

  MachineBasicBlock::iterator I = *MI;
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH:
    return BB;
  case AMDGPU::SI_RegisterStorePseudo: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstrBuilder MIB =
        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
                Reg);
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
      MIB.addOperand(MI->getOperand(i));

    MI->eraseFromParent();
    break;
  }
  }
  return BB;
}

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
  // free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.

  // When fma is quarter rate, for f64 where add / sub are at best half rate,
  // most of these combines appear to be cycle neutral but save on instruction
  // count / code size.
  return true;
}

EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
  return MVT::i32;
}

// Answering this is somewhat tricky and depends on the specific device, since
// devices differ in their rates for fma and for f64 operations in general.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
// however does not support denormals, so we do report fma as faster if we have
// a fast fma device and require denormals.
//
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
    // This is as fast on some subtargets. However, we always have full rate f32
    // mad available which returns the same result as the separate operations
    // which we should prefer over fma. We can't use this if we want to support
    // denormals, so only report this in these cases.
    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::FDIV: return LowerFDIV(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}

SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
  unsigned FrameIndex = FINode->getIndex();

  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
}

/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {

  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SDNode *SetCC = Intr;
    assert(SetCC->getConstantOperandVal(1) == 1);
    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
           ISD::SETNE);
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);

  // Build the result and
  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(BRCOND.getOperand(0));
  Ops.append(Intr->op_begin() + 1, Intr->op_end());
  Ops.push_back(Target);

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res), Ops).getNode();

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
    BR = NewBR.getNode();
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}

SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);

  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  SDLoc DL(GSD);
  const GlobalValue *GV = GSD->getGlobal();
  MVT PtrVT = getPointerTy(GSD->getAddressSpace());

  SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);

  SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
                              DAG.getConstant(0, MVT::i32));
  SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
                              DAG.getConstant(1, MVT::i32));

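  // Fold the 32-bit global-address offset into the constant data pointer with
  // an add-with-carry pair: the low halves are added with ADDC and the carry
  // is propagated into the high half with ADDE (descriptive summary of the
  // sequence below).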
  SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
                           PtrLo, GA);
  SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
                           PtrHi, DAG.getConstant(0, MVT::i32),
                           SDValue(Lo.getNode(), 1));
  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
}

SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  switch (IntrinsicID) {
  case Intrinsic::r600_read_ngroups_x:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::NGROUPS_X, false);
  case Intrinsic::r600_read_ngroups_y:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::NGROUPS_Y, false);
  case Intrinsic::r600_read_ngroups_z:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::NGROUPS_Z, false);
  case Intrinsic::r600_read_global_size_x:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
  case Intrinsic::r600_read_global_size_y:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
  case Intrinsic::r600_read_global_size_z:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
  case Intrinsic::r600_read_local_size_x:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::LOCAL_SIZE_X, false);
  case Intrinsic::r600_read_local_size_y:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
  case Intrinsic::r600_read_local_size_z:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::LOCAL_SIZE_Z, false);

  case Intrinsic::AMDGPU_read_workdim:
    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
                          false);

  case Intrinsic::r600_read_tgid_x:
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
  case Intrinsic::r600_read_tgid_y:
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
  case Intrinsic::r600_read_tgid_z:
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
  case Intrinsic::r600_read_tidig_x:
    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
  case Intrinsic::r600_read_tidig_y:
    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
  case Intrinsic::r600_read_tidig_z:
    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
  case AMDGPUIntrinsic::SI_load_const: {
    SDValue Ops[] = {
      Op.getOperand(1),
      Op.getOperand(2)
    };

    MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
      VT.getStoreSize(), 4);
    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                   Op->getVTList(), Ops, VT, MMO);
  }
  case AMDGPUIntrinsic::SI_sample:
    return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
  case AMDGPUIntrinsic::SI_sampleb:
    return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
  case AMDGPUIntrinsic::SI_sampled:
    return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
  case AMDGPUIntrinsic::SI_samplel:
    return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
  case AMDGPUIntrinsic::SI_vs_load_input:
    return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
                       Op.getOperand(1),
                       Op.getOperand(2),
                       Op.getOperand(3));

  case AMDGPUIntrinsic::AMDGPU_fract:
  case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
    return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1),
                       DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1)));

  default:
    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  }
}

SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  switch (IntrinsicID) {
  case AMDGPUIntrinsic::SI_tbuffer_store: {
    SDLoc DL(Op);
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2),
      Op.getOperand(3),
      Op.getOperand(4),
      Op.getOperand(5),
      Op.getOperand(6),
      Op.getOperand(7),
      Op.getOperand(8),
      Op.getOperand(9),
      Op.getOperand(10),
      Op.getOperand(11),
      Op.getOperand(12),
      Op.getOperand(13),
      Op.getOperand(14)
    };

    EVT VT = Op.getOperand(3).getValueType();

    MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOStore,
      VT.getStoreSize(), 4);
    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
                                   Op->getVTList(), Ops, VT, MMO);
  }
  default:
    return SDValue();
  }
}

SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);

  if (Op.getValueType().isVector()) {
    assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
           "Custom lowering for non-i32 vectors hasn't been implemented.");
    unsigned NumElements = Op.getValueType().getVectorNumElements();
    assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
    switch (Load->getAddressSpace()) {
    default: break;
    case AMDGPUAS::GLOBAL_ADDRESS:
    case AMDGPUAS::PRIVATE_ADDRESS:
      // v4 loads are supported for private and global memory.
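      // Wider vectors (v8 / v16) fall through and are scalarized below, just
      // like loads from local memory.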
      if (NumElements <= 4)
        break;
      // fall-through
    case AMDGPUAS::LOCAL_ADDRESS:
      return ScalarizeVectorLoad(Op, DAG);
    }
  }

  return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
}

SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
                                               const SDValue &Op,
                                               SelectionDAG &DAG) const {
  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
                     Op.getOperand(2),
                     Op.getOperand(3),
                     Op.getOperand(4));
}

SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i64)
    return SDValue();

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Zero = DAG.getConstant(0, MVT::i32);
  SDValue One = DAG.getConstant(1, MVT::i32);

  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);

  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);

  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);

  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
}

// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
        CLHS->isExactlyValue(1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation have a worst case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
      // use it as long as we aren't trying to use denormals.

      // 1.0 / sqrt(x) -> rsq(x)
      //
      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
      // error seems really high at 2^29 ULP.
      if (RHS.getOpcode() == ISD::FSQRT)
        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));

      // 1.0 / x -> rcp(x)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    }
  }

  if (Unsafe) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
  }

  return SDValue();
}

SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  SDValue FastLowered = LowerFastFDIV(Op, DAG);
  if (FastLowered.getNode())
    return FastLowered;

  // This uses v_rcp_f32 which does not handle denormals. Let this hit a
  // selection error for now rather than do something incorrect.
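  // Sketch of the scaling trick used below (summary comment; the constants are
  // the ones encoded in the code): if |RHS| is greater than 2^96 (K0 =
  // 0x6f800000), it is pre-multiplied by 2^-32 (K1 = 0x2f800000) so that its
  // reciprocal cannot underflow into the flushed denormal range, and the final
  // multiply by the same select result (r3) undoes the scaling.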
  if (Subtarget->hasFP32Denormals())
    return SDValue();

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);

  const APFloat K0Val(BitsToFloat(0x6f800000));
  const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32);

  const APFloat K1Val(BitsToFloat(0x2f800000));
  const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, MVT::f32);

  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);

  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);

  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}

SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return LowerFastFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, MVT::f64);

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, MVT::i32);

    // Figure out which scale to use for div_fmas.
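    // The flag is reconstructed manually: bitcast the numerator, denominator
    // and both div_scale results to v2i32 and compare their high dwords (which
    // hold the exponent bits), effectively detecting whether either operand was
    // rescaled (descriptive summary of the sequence below).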
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}

SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}

SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  // These stores are legal.
  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    if (VT.isVector() && VT.getVectorNumElements() > 4)
      return ScalarizeVectorStore(Op, DAG);
    return SDValue();
  }

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode())
    return Ret;

  if (VT.isVector() && VT.getVectorNumElements() >= 8)
    return ScalarizeVectorStore(Op, DAG);

  if (VT == MVT::i1)
    return DAG.getTruncStore(Store->getChain(), DL,
                             DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
                             Store->getBasePtr(), MVT::i1, Store->getMemOperand());

  return SDValue();
}

SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
                                  DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
                                              DAG.getConstantFP(0.5 / M_PI, VT)));

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
  case ISD::FSIN:
    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
  default:
    llvm_unreachable("Wrong trig opcode");
  }
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
      DCI.AddToWorklist(Cvt.getNode());
      return Cvt;
    }
  }

  // We are primarily trying to catch operations on illegal vector types
  // before they are expanded.
  // For scalars, we can use the more flexible method of checking masked bits
  // after legalization.
  if (!DCI.isBeforeLegalize() ||
      !SrcVT.isVector() ||
      SrcVT.getVectorElementType() != MVT::i8) {
    return SDValue();
  }

  assert(DCI.isBeforeLegalize() && "Unexpected legal type");

  // Weird sized vectors are a pain to handle, but we know 3 is really the same
  // size as 4.
  unsigned NElts = SrcVT.getVectorNumElements();
  if (!SrcVT.isSimple() && NElts != 3)
    return SDValue();

  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
  // prevent a mess from expanding to v4i32 and repacking.
  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
    LoadSDNode *Load = cast<LoadSDNode>(Src);

    unsigned AS = Load->getAddressSpace();
    unsigned Align = Load->getAlignment();
    Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);

    // Don't try to replace the load if we have to expand it due to alignment
    // problems. Otherwise we will end up scalarizing the load, and trying to
    // repack into the vector for no real reason.
    if (Align < ABIAlignment &&
        !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
      return SDValue();
    }

    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
                                     Load->getChain(),
                                     Load->getBasePtr(),
                                     LoadVT,
                                     Load->getMemOperand());

    // Make sure successors of the original load stay after it by updating
    // them to use the new Chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));

    SmallVector<SDValue, 4> Elts;
    if (RegVT.isVector())
      DAG.ExtractVectorElements(NewLoad, Elts);
    else
      Elts.push_back(NewLoad);

    SmallVector<SDValue, 4> Ops;

    unsigned EltIdx = 0;
    for (SDValue Elt : Elts) {
      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
      for (unsigned I = 0; I < ComponentsInElt; ++I) {
        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
        DCI.AddToWorklist(Cvt.getNode());
        Ops.push_back(Cvt);
      }

      ++EltIdx;
    }

    assert(Ops.size() == NElts);

    return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
  }

  return SDValue();
}

/// \brief Return true if the given offset Size in bytes can be folded into
/// the immediate offsets of a memory instruction for the given address space.
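/// For example, a 4096-byte offset fits the 16-bit DS immediate but is just
/// past the 12-bit MUBUF limit (illustrative values; the exact ranges are
/// checked per address space below).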
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
                          const AMDGPUSubtarget &STI) {
  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS: {
    // MUBUF instructions have a 12-bit offset in bytes.
    return isUInt<12>(OffsetSize);
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    // SMRD instructions have an 8-bit offset in dwords on SI and
    // a 20-bit offset in bytes on VI.
    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return isUInt<20>(OffsetSize);
    else
      return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
  }
  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS: {
    // The single offset versions have a 16-bit offset in bytes.
    return isUInt<16>(OffsetSize);
  }
  case AMDGPUAS::PRIVATE_ADDRESS:
    // Indirect register addressing does not use any offsets.
  default:
    return false;
  }
}

// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)

// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of the new constant offset. This eliminates one of the
// uses, and may allow the remaining use to also be simplified.
//
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (N0.getOpcode() != ISD::ADD)
    return SDValue();

  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the addressing
  // mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
  SDValue COffset = DAG.getConstant(Offset, MVT::i32);

  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
}

SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::SETCC &&
      RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL |
                              SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO |
                              SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(((~(SIInstrFlags::S_NAN |
                          SIInstrFlags::Q_NAN |
                          SIInstrFlags::N_INFINITY |
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
                      "mask not equal");

        return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
                           X, DAG.getConstant(Mask, MVT::i32));
      }
    }
  }

  return SDValue();
}

SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
    SDValue Src = LHS.getOperand(0);
    if (Src != RHS.getOperand(0))
      return SDValue();

    const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if (!CLHS || !CRHS)
      return SDValue();

    // Only 10 bits are used.
    static const uint32_t MaxMask = 0x3ff;

    uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
    return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
                       Src, DAG.getConstant(NewMask, MVT::i32));
  }

  return SDValue();
}

SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Mask = N->getOperand(1);

  // fp_class x, 0 -> false
  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
    if (CMask->isNullValue())
      return DAG.getConstant(0, MVT::i1);
  }

  return SDValue();
}

static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
    return AMDGPUISD::FMAX3;
  case AMDGPUISD::SMAX:
    return AMDGPUISD::SMAX3;
  case AMDGPUISD::UMAX:
    return AMDGPUISD::UMAX3;
  case ISD::FMINNUM:
    return AMDGPUISD::FMIN3;
  case AMDGPUISD::SMIN:
    return AMDGPUISD::SMIN3;
  case AMDGPUISD::UMIN:
    return AMDGPUISD::UMIN3;
  default:
    llvm_unreachable("Not a min/max opcode");
  }
}

SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use; otherwise this will just increase
  // register pressure for no benefit.

  // max(max(a, b), c)
  if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
    SDLoc DL(N);
    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                       DL,
                       N->getValueType(0),
                       Op0.getOperand(0),
                       Op0.getOperand(1),
                       Op1);
  }

  // max(a, max(b, c))
  if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
    SDLoc DL(N);
    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                       DL,
                       N->getValueType(0),
                       Op0,
                       Op1.getOperand(0),
                       Op1.getOperand(1));
  }

  return SDValue();
}

SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();

  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Match isinf pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
                         LHS.getOperand(0), DAG.getConstant(Mask, MVT::i32));
    }
  }

  return SDValue();
}

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM: // TODO: What about fmax_legacy?

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM: // TODO: What about fmax_legacy?
  case ISD::FMINNUM:
  case AMDGPUISD::SMAX:
  case AMDGPUISD::SMIN:
  case AMDGPUISD::UMAX:
  case AMDGPUISD::UMIN: {
    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
        N->getValueType(0) != MVT::f64 &&
        getTargetMachine().getOptLevel() > CodeGenOpt::None)
      return performMin3Max3Combine(N, DCI);
    break;
  }

  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3: {
    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

    SDValue Src = N->getOperand(0);
    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }

    break;
  }

  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);

  case ISD::FADD: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);
    if (VT != MVT::f32)
      break;

    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (Subtarget->hasFP32Denormals())
      break;

    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);

    // These should really be instruction patterns, but writing patterns with
    // source modifiers is a pain.

    // fadd (fadd (a, a), b) -> mad 2.0, a, b
    if (LHS.getOpcode() == ISD::FADD) {
      SDValue A = LHS.getOperand(0);
      if (A == LHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
      }
    }

    // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    if (RHS.getOpcode() == ISD::FADD) {
      SDValue A = RHS.getOperand(0);
      if (A == RHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
      }
    }

    return SDValue();
  }
  case ISD::FSUB: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);

    // Try to get the fneg to fold into the source modifier. This undoes generic
    // DAG combines and folds them into the mad.
    //
    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
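    //
    // For example (a sketch of the intended folds, mirroring the FADD case
    // above):
    //   (x + x) - y  ==>  mad 2.0, x, (fneg y)
    //   y - (x + x)  ==>  mad -2.0, x, y
    // 2.0 and -2.0 can be encoded as inline constants, and the fneg is
    // expected to fold into a source modifier, so no extra instructions or
    // literal operands should be needed.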
    if (VT == MVT::f32 &&
        !Subtarget->hasFP32Denormals()) {
      SDValue LHS = N->getOperand(0);
      SDValue RHS = N->getOperand(1);
      if (LHS.getOpcode() == ISD::FADD) {
        // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)

        SDValue A = LHS.getOperand(0);
        if (A == LHS.getOperand(1)) {
          const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
          SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);

          return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
        }
      }

      if (RHS.getOpcode() == ISD::FADD) {
        // (fsub c, (fadd a, a)) -> mad -2.0, a, c

        SDValue A = RHS.getOperand(0);
        if (A == RHS.getOperand(1)) {
          const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32);
          return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
        }
      }

      return SDValue();
    }

    break;
  }
  case ISD::LOAD:
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;

    MemSDNode *MemNode = cast<MemSDNode>(N);
    SDValue Ptr = MemNode->getBasePtr();

    // TODO: We could also do this for multiplies.
    unsigned AS = MemNode->getAddressSpace();
    if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
      if (NewPtr) {
        SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());

        NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
        return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
      }
    }
    break;
  }
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
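
// Rough summary of the immediate classes referred to below, based on how
// SIInstrInfo::isInlineConstant is used here (consult that function for the
// authoritative rules): inline immediates are encoded directly in the
// instruction and cover small integers plus a handful of floating-point
// constants such as 0.0, +/-0.5, +/-1.0, +/-2.0 and +/-4.0, while any other
// 32-bit value is a literal immediate that takes an extra dword in the
// instruction stream.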

/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate,
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {

  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    if (TII->isInlineConstant(Node->getAPIntValue()))
      return 0;

    uint64_t Val = Node->getZExtValue();
    return isUInt<32>(Val) ? Val : -1;
  }

  if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
    if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
      return 0;

    if (Node->getValueType(0) == MVT::f32)
      return FloatToBits(Node->getValueAPF().convertToFloat());

    return -1;
  }

  return -1;
}

/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

/// \brief Adjust the writemask of MIMG instructions
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Lane = 0;
  unsigned OldDmask = Node->getConstantOperandVal(0);
  unsigned NewDmask = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      assert(Dmask);
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);

  // If we only got one lane, replace it with a copy
  // (if NewDmask has only one bit set...)
  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
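/// For example (an illustrative case, not an exhaustive list): a REG_SEQUENCE
/// or INSERT_SUBREG produced during selection may still carry a FrameIndex
/// operand; the loop below wraps any such operand in an S_MOV_B32 so that
/// every input the generic machinery sees is a register.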
void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)), 0));
  }

  DAG.UpdateNodeOperands(Node, Ops);
}

/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (TII->isMIMG(Node->getMachineOpcode()))
    adjustWritemask(Node, DAG);

  if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
      Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }
  return Node;
}

/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  TII->legalizeOperands(MI);

  if (TII->isMIMG(MI->getOpcode())) {
    unsigned VReg = MI->getOperand(0).getReg();
    unsigned Writemask = MI->getOperand(1).getImm();
    unsigned BitsSet = 0;
    for (unsigned i = 0; i < 4; ++i)
      BitsSet += Writemask & (1 << i) ? 1 : 0;

    const TargetRegisterClass *RC;
    switch (BitsSet) {
    default: return;
    case 1: RC = &AMDGPU::VGPR_32RegClass; break;
    case 2: RC = &AMDGPU::VReg_64RegClass; break;
    case 3: RC = &AMDGPU::VReg_96RegClass; break;
    }

    unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
    MI->setDesc(TII->get(NewOpcode));
    MRI.setRegClass(VReg, RC);
    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI->setDesc(TII->get(NoRetAtomicOp));
      MI->RemoveOperand(0);
    }

    return;
  }
}

static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}

MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                SDLoc DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
#if 1
  // XXX - Workaround for moveToVALU not handling different register class
  // inserts for REG_SEQUENCE.

  // Build the half of the subregister with the constants.
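  // Presumed layout, based on the REG_SEQUENCE built below: the 64-bit
  // pointer ends up in dwords 0-1 of the descriptor (sub0_sub1), and this
  // constant half ends up in dwords 2-3 (sub2_sub3), with dword 2 = 0 and
  // dword 3 = the high half of the default resource data format.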
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
#else
  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);

#endif
}

/// \brief Return a resource descriptor with the 'Add TID' bit enabled.
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to the
/// resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
                                           SDLoc DL,
                                           SDValue Ptr,
                                           uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                       DAG.getConstant(RsrcDword1, MVT::i32)), 0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}

MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
                                                  SDLoc DL,
                                                  SDValue Ptr) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
  uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
                  0xffffffff; // Size

  return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
}

SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}

//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//
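
// Example of what the constraint handling below accepts (a best-effort
// reading of the code, not an official reference): the generic "r"
// constraint is mapped onto scalar registers by value type (i32 -> SGPR_32,
// i64 -> SGPR_64), while multi-character constraints whose second character
// is 'v' or 's' are treated as an indexed VGPR_32 or SGPR_32 request, with
// the trailing digits selecting the register number.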

std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               const std::string &Constraint,
                                               MVT VT) const {
  if (Constraint == "r") {
    switch (VT.SimpleTy) {
    default: llvm_unreachable("Unhandled type for 'r' inline asm constraint");
    case MVT::i64:
      return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
    case MVT::i32:
      return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
    }
  }

  if (Constraint.size() > 1) {
    const TargetRegisterClass *RC = nullptr;
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    }

    if (RC) {
      unsigned Idx = std::atoi(Constraint.substr(2).c_str());
      if (Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}