//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"

const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;

using namespace llvm;

SITargetLowering::SITargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM) {

  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);

  addRegisterClass(MVT::v1i32, &AMDGPU::VSrc_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::ADD, MVT::i64, Legal);
  setOperationAction(ISD::ADD, MVT::i32, Legal);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);

  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setTargetDAGCombine(ISD::SELECT_CC);

  setTargetDAGCombine(ISD::SETCC);

  setSchedulingPreference(Sched::RegPressure);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                     bool *IsFast) const {
  // XXX: This depends on the address space, and we may also want to revisit
  // the alignment values we specify in the DataLayout.
  return VT.bitsGT(MVT::i32);
}

/// \brief Load an argument of type \p VT from the buffer addressed by
/// SGPR0/SGPR1 at byte offset \p Offset.
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT,
                                         SDLoc DL, SDValue Chain,
                                         unsigned Offset) const {
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                        AMDGPUAS::CONSTANT_ADDRESS);
  EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
  SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
                          MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                            DAG.getConstant(Offset, MVT::i64));
  return DAG.getLoad(VT, DL, Chain, Ptr,
                     MachinePointerInfo(UndefValue::get(PtrTy)),
                     false, false, false, ArgVT.getSizeInBits() >> 3);

}

SDValue SITargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {

  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  assert(CallConv == CallingConv::C);

  SmallVector<ISD::InputArg, 16> Splits;
  uint32_t Skipped = 0;

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First, check whether it's a PS input addr
    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {

      assert((PSInputNum <= 15) && "Too many PS inputs!");

      if (!Arg.Used) {
        // We can safely skip PS inputs
        Skipped |= 1 << i;
        ++PSInputNum;
        continue;
      }

      Info->PSInputAddr |= 1 << PSInputNum++;
    }

    // Second, split vertices into their elements
    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
      ISD::InputArg NewArg = Arg;
      NewArg.Flags.setSplit();
      NewArg.VT = Arg.VT.getVectorElementType();

      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
      // three or five element vertex only needs three or five registers,
      // NOT four or eight.
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.VT.getStoreSize();
      }

    } else {
      Splits.push_back(Arg);
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
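  // If no PS input was marked as used, enable the first one and reserve a
  // pair of VGPRs for it.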
  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
    Info->PSInputAddr |= 1;
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
  }

  // The pointer to the list of arguments is stored in SGPR0, SGPR1
  if (Info->ShaderType == ShaderType::COMPUTE) {
    CCInfo.AllocateReg(AMDGPU::SGPR0);
    CCInfo.AllocateReg(AMDGPU::SGPR1);
    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped & (1 << i)) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    EVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      // The first 36 bytes of the input buffer contain information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
                                   36 + VA.getLocMemOffset());
      InVals.push_back(Arg);
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now, assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);
        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      for (unsigned j = 0; j != NumElements; ++j)
        Regs.push_back(DAG.getUNDEF(VT));

      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
                                   Regs.data(), Regs.size()));
      continue;
    }

    InVals.push_back(Val);
  }
  return Chain;
}

MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {

  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH: return BB;
  case AMDGPU::SI_ADDR64_RSRC: {
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    unsigned SuperReg = MI->getOperand(0).getReg();
    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
            .addOperand(MI->getOperand(1));
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
            SubRegHiLo)
            .addImm(0);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
            .addImm(RSRC_DATA_FORMAT >> 32);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
            .addReg(SubRegHiLo)
            .addImm(AMDGPU::sub0)
            .addReg(SubRegHiHi)
            .addImm(AMDGPU::sub1);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
            .addReg(SubRegLo)
            .addImm(AMDGPU::sub0_sub1)
            .addReg(SubRegHi)
            .addImm(AMDGPU::sub2_sub3);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::V_SUB_F64: {
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
            MI->getOperand(0).getReg())
            .addReg(MI->getOperand(1).getReg())
            .addReg(MI->getOperand(2).getReg())
            .addImm(0)  /* src2 */
            .addImm(0)  /* ABS */
            .addImm(0)  /* CLAMP */
            .addImm(0)  /* OMOD */
            .addImm(2); /* NEG */
    MI->eraseFromParent();
    break;
  }
  }
  return BB;
}

EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
  return MVT::i32;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    // XXX: Hardcoded: we only use two user SGPRs to store the pointer to the
    // parameters.
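    // SGPR0/SGPR1 hold the pointer to the kernel arguments, so the tgid
    // registers handed out below start right after these two user SGPRs.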
    unsigned NumUserSGPRs = 2;
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case Intrinsic::r600_read_ngroups_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
    case Intrinsic::r600_read_ngroups_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
    case Intrinsic::r600_read_global_size_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
    case Intrinsic::r600_read_global_size_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
    case Intrinsic::r600_read_global_size_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
    case Intrinsic::r600_read_local_size_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
    case Intrinsic::r600_read_local_size_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
    case Intrinsic::r600_read_local_size_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR0, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR1, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR2, VT);

    }
  }
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return 0;
}

/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter, and also switches the branch target with BR if
/// needed.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {

  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = 0;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SDNode *SetCC = Intr;
    assert(SetCC->getConstantOperandVal(1) == 1);
    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
           ISD::SETNE);
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);

  // Build the result and
  SmallVector<EVT, 4> Res;
  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
    Res.push_back(Intr->getValueType(i));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(BRCOND.getOperand(0));
  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
    Ops.push_back(Intr->getOperand(i));
  Ops.push_back(Target);

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}

SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}

SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (VT != MVT::i64) {
    return SDValue();
  }

  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
                           DAG.getConstant(31, MVT::i32));

  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
}

SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (VT != MVT::i64) {
    return SDValue();
  }

  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0),
                     DAG.getConstant(0, MVT::i32));
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  switch (N->getOpcode()) {
  default: break;
  case ISD::SELECT_CC: {
    N->dump();
    ConstantSDNode *True, *False;
    // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
    if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
        && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
        && True->isAllOnesValue()
        && False->isNullValue()
        && VT == MVT::i1) {
      return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
                         N->getOperand(1), N->getOperand(4));

    }
    break;
  }
  case ISD::SETCC: {
    SDValue Arg0 = N->getOperand(0);
    SDValue Arg1 = N->getOperand(1);
    SDValue CC = N->getOperand(2);
    ConstantSDNode * C = NULL;
    ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();

    // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
    if (VT == MVT::i1
        && Arg0.getOpcode() == ISD::SIGN_EXTEND
        && Arg0.getOperand(0).getValueType() == MVT::i1
        && (C = dyn_cast<ConstantSDNode>(Arg1))
        && C->isNullValue()
        && CCOp == ISD::SETNE) {
      return SimplifySetCC(VT, Arg0.getOperand(0),
                           DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
    }
    break;
  }
  }
  return SDValue();
}

/// \brief Test if RegClass is one of the VSrc classes
static bool isVSrc(unsigned RegClass) {
  return AMDGPU::VSrc_32RegClassID == RegClass ||
         AMDGPU::VSrc_64RegClassID == RegClass;
}

/// \brief Test if RegClass is one of the SSrc classes
static bool isSSrc(unsigned RegClass) {
  return AMDGPU::SSrc_32RegClassID == RegClass ||
         AMDGPU::SSrc_64RegClassID == RegClass;
}

/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {

  union {
    int32_t I;
    float F;
  } Imm;

  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    if (Node->getZExtValue() >> 32) {
      return -1;
    }
    Imm.I = Node->getSExtValue();
  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
    Imm.F = Node->getValueAPF().convertToFloat();
  else
    return -1; // It isn't an immediate

  if ((Imm.I >= -16 && Imm.I <= 64) ||
      Imm.F == 0.5f || Imm.F == -0.5f ||
      Imm.F == 1.0f || Imm.F == -1.0f ||
      Imm.F == 2.0f || Imm.F == -2.0f ||
      Imm.F == 4.0f || Imm.F == -4.0f)
    return 0; // It's an inline immediate

  return Imm.I; // It's a literal immediate
}

/// \brief Try to fold an immediate directly into an instruction
bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
                               bool &ScalarSlotUsed) const {

  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
    return false;

  const SDValue &Op = Mov->getOperand(0);
  int32_t Value = analyzeImmediate(Op.getNode());
  if (Value == -1) {
    // Not an immediate at all
    return false;

  } else if (Value == 0) {
    // Inline immediates can always be folded
    Operand = Op;
    return true;

  } else if (Value == Immediate) {
    // Already folded literal immediate
    Operand = Op;
    return true;

  } else if (!ScalarSlotUsed && !Immediate) {
    // Fold this literal immediate
    ScalarSlotUsed = true;
    Immediate = Value;
    Operand = Op;
    return true;

  }

  return false;
}

const TargetRegisterClass *SITargetLowering::getRegClassForNode(
    SelectionDAG &DAG, const SDValue &Op) const {
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  if (!Op->isMachineOpcode()) {
    switch(Op->getOpcode()) {
    case ISD::CopyFromReg: {
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      unsigned Reg =
        cast<RegisterSDNode>(Op->getOperand(1))->getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        return MRI.getRegClass(Reg);
      }
      return TRI.getPhysRegClass(Reg);
    }
    default: return NULL;
    }
  }
  const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
  int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
  if (OpClassID != -1) {
    return TRI.getRegClass(OpClassID);
  }
  switch(Op.getMachineOpcode()) {
  case AMDGPU::COPY_TO_REGCLASS:
    // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
    OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();

    // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
    // class, then the register class for the value could be either a
    // VReg or an SReg.  In order to get a more accurate answer, look at
    // the source of the copy.
    if (OpClassID == AMDGPU::VSrc_32RegClassID ||
        OpClassID == AMDGPU::VSrc_64RegClassID) {
      return getRegClassForNode(DAG, Op.getOperand(0));
    }
    return TRI.getRegClass(OpClassID);
  case AMDGPU::EXTRACT_SUBREG: {
    int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    const TargetRegisterClass *SuperClass =
      getRegClassForNode(DAG, Op.getOperand(0));
    return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
  }
  case AMDGPU::REG_SEQUENCE:
    // Operand 0 is the register class id for REG_SEQUENCE instructions.
    return TRI.getRegClass(
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
  default:
    return getRegClassFor(Op.getSimpleValueType());
  }
}

/// \brief Does "Op" fit into register class "RegClass" ?
bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                    unsigned RegClass) const {
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
  if (!RC) {
    return false;
  }
  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
}

/// \brief Make sure that we don't exceed the number of allowed scalars
void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
                                       unsigned RegClass,
                                       bool &ScalarSlotUsed) const {

  // First, map the operand's register class to a destination class
  if (RegClass == AMDGPU::VSrc_32RegClassID)
    RegClass = AMDGPU::VReg_32RegClassID;
  else if (RegClass == AMDGPU::VSrc_64RegClassID)
    RegClass = AMDGPU::VReg_64RegClassID;
  else
    return;

  // Nothing to do if they fit naturally
  if (fitsRegClass(DAG, Operand, RegClass))
    return;

  // If the scalar slot isn't used yet, use it now
  if (!ScalarSlotUsed) {
    ScalarSlotUsed = true;
    return;
  }

  // This is a conservative approach; it is possible that we can't determine
  // the correct register class and copy too often, but better safe than sorry.
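  // Otherwise force the operand into the VGPR class by wrapping it in a
  // COPY_TO_REGCLASS node.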
  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
                                    Operand.getValueType(), Operand, RC);
  Operand = SDValue(Node, 0);
}

/// \returns true if \p Node's operands are different from the SDValue list
/// \p Ops
static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
  for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
    if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
      return true;
    }
  }
  return false;
}

/// \brief Try to fold the Node's operands into the Node
SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
                                       SelectionDAG &DAG) const {

  // Original encoding (either e32 or e64)
  int Opcode = Node->getMachineOpcode();
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const MCInstrDesc *Desc = &TII->get(Opcode);

  unsigned NumDefs = Desc->getNumDefs();
  unsigned NumOps = Desc->getNumOperands();

  // Commuted opcode if available
  int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
  const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev);

  assert(!DescRev || DescRev->getNumDefs() == NumDefs);
  assert(!DescRev || DescRev->getNumOperands() == NumOps);

  // e64 version if available, -1 otherwise
  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);

  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));

  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
  bool HaveVSrc = false, HaveSSrc = false;

  // First, figure out what we already have in this instruction
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass))
      HaveVSrc = true;
    else if (isSSrc(RegClass))
      HaveSSrc = true;
    else
      continue;

    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
    if (Imm != -1 && Imm != 0) {
      // Literal immediate
      Immediate = Imm;
    }
  }

  // If we have neither a VSrc nor an SSrc, it makes no sense to continue
  if (!HaveVSrc && !HaveSSrc)
    return Node;

  // No scalar allowed when we have both VSrc and SSrc
  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;

  // Second, go over the operands and try to fold them
  std::vector<SDValue> Ops;
  bool Promote2e64 = false;
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    const SDValue &Operand = Node->getOperand(i);
    Ops.push_back(Operand);

    // Already folded immediate?
    if (isa<ConstantSDNode>(Operand.getNode()) ||
        isa<ConstantFPSDNode>(Operand.getNode()))
      continue;

    // Is this a VSrc or SSrc operand?
    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass) || isSSrc(RegClass)) {
      // Try to fold the immediates
      if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
        // Folding didn't work; make sure we don't hit the SReg limit
        ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
      }
      continue;
    }

    if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {

      unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
      assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));

      // Test if it makes sense to swap operands
      if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[1], RegClass) &&
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {

        // Swap commutable operands
        SDValue Tmp = Ops[1];
        Ops[1] = Ops[0];
        Ops[0] = Tmp;

        Desc = DescRev;
        DescRev = 0;
        continue;
      }
    }

    if (DescE64 && !Immediate) {

      // Test if it makes sense to switch to e64 encoding
      unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
      if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
        continue;

      int32_t TmpImm = -1;
      if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[i], RegClass) &&
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {

        // Switch to e64 encoding
        Immediate = -1;
        Promote2e64 = true;
        Desc = DescE64;
        DescE64 = 0;
      }
    }
  }

  if (Promote2e64) {
    // Add the modifier flags while promoting
    for (unsigned i = 0; i < 4; ++i)
      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
  }

  // Add optional chain and glue
  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
    Ops.push_back(Node->getOperand(i));

  // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
  // this case a brand new node is always created, even if the operands
  // are the same as before.  So, manually check if anything has been changed.
  if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
    return Node;
  }

  // Create a completely new instruction
  return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
}

/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

/// \brief Adjust the writemask of MIMG instructions
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Writemask = 0, Lane = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    Writemask |= 1 << Lane;
  }

  // Abort if all components are used
  if (Writemask == 0xf)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
    Ops.push_back(Node->getOperand(i));
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());

  // If we only got one lane, replace it with a copy
  if (Writemask == (1U << Lane)) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

/// \brief Fold the instructions after selecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  Node = AdjustRegClass(Node, DAG);

  if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
    adjustWritemask(Node, DAG);

  return foldOperands(Node, DAG);
}

/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  if (AMDGPU::isMIMG(MI->getOpcode()) == -1)
    return;

  unsigned VReg = MI->getOperand(0).getReg();
  unsigned Writemask = MI->getOperand(1).getImm();
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)
    BitsSet += Writemask & (1 << i) ?
        1 : 0;

  const TargetRegisterClass *RC;
  switch (BitsSet) {
  default: return;
  case 1: RC = &AMDGPU::VReg_32RegClass; break;
  case 2: RC = &AMDGPU::VReg_64RegClass; break;
  case 3: RC = &AMDGPU::VReg_96RegClass; break;
  }

  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  MRI.setRegClass(VReg, RC);
}

MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
                                                SelectionDAG &DAG) const {

  SDLoc DL(N);
  unsigned NewOpcode = N->getMachineOpcode();

  switch (N->getMachineOpcode()) {
  default: return N;
  case AMDGPU::S_LOAD_DWORD_IMM:
    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX2_SGPR:
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
    }
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
    }
    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
      return N;
    }
    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
    SDValue Ops[] = {
      SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
                                 DAG.getConstant(0, MVT::i64)), 0),
      N->getOperand(0),
      DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
    };
    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
  }
  }
}

SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}