1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief This is the parent TargetLowering class for hardware code gen 12 /// targets. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPUISelLowering.h" 17 #include "AMDGPU.h" 18 #include "AMDGPUFrameLowering.h" 19 #include "AMDGPUIntrinsicInfo.h" 20 #include "AMDGPURegisterInfo.h" 21 #include "AMDGPUSubtarget.h" 22 #include "R600MachineFunctionInfo.h" 23 #include "SIMachineFunctionInfo.h" 24 #include "llvm/Analysis/ValueTracking.h" 25 #include "llvm/CodeGen/CallingConvLower.h" 26 #include "llvm/CodeGen/MachineFunction.h" 27 #include "llvm/CodeGen/MachineRegisterInfo.h" 28 #include "llvm/CodeGen/SelectionDAG.h" 29 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" 30 #include "llvm/IR/DataLayout.h" 31 #include "llvm/IR/DiagnosticInfo.h" 32 #include "llvm/IR/DiagnosticPrinter.h" 33 34 using namespace llvm; 35 36 namespace { 37 38 /// Diagnostic information for unimplemented or unsupported feature reporting. 39 class DiagnosticInfoUnsupported : public DiagnosticInfo { 40 private: 41 const Twine &Description; 42 const Function &Fn; 43 44 static int KindID; 45 46 static int getKindID() { 47 if (KindID == 0) 48 KindID = llvm::getNextAvailablePluginDiagnosticKind(); 49 return KindID; 50 } 51 52 public: 53 DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, 54 DiagnosticSeverity Severity = DS_Error) 55 : DiagnosticInfo(getKindID(), Severity), 56 Description(Desc), 57 Fn(Fn) { } 58 59 const Function &getFunction() const { return Fn; } 60 const Twine &getDescription() const { return Description; } 61 62 void print(DiagnosticPrinter &DP) const override { 63 DP << "unsupported " << getDescription() << " in " << Fn.getName(); 64 } 65 66 static bool classof(const DiagnosticInfo *DI) { 67 return DI->getKind() == getKindID(); 68 } 69 }; 70 71 int DiagnosticInfoUnsupported::KindID = 0; 72 } 73 74 75 static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, 76 CCValAssign::LocInfo LocInfo, 77 ISD::ArgFlagsTy ArgFlags, CCState &State) { 78 unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), 79 ArgFlags.getOrigAlign()); 80 State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); 81 82 return true; 83 } 84 85 #include "AMDGPUGenCallingConv.inc" 86 87 // Find a larger type to do a load / store of a vector with. 88 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { 89 unsigned StoreSize = VT.getStoreSizeInBits(); 90 if (StoreSize <= 32) 91 return EVT::getIntegerVT(Ctx, StoreSize); 92 93 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); 94 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); 95 } 96 97 // Type for a vector that will be loaded to. 
98 EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { 99 unsigned StoreSize = VT.getStoreSizeInBits(); 100 if (StoreSize <= 32) 101 return EVT::getIntegerVT(Ctx, 32); 102 103 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); 104 } 105 106 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : 107 TargetLowering(TM, new TargetLoweringObjectFileELF()) { 108 109 Subtarget = &TM.getSubtarget<AMDGPUSubtarget>(); 110 111 setOperationAction(ISD::Constant, MVT::i32, Legal); 112 setOperationAction(ISD::Constant, MVT::i64, Legal); 113 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 114 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 115 116 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 117 setOperationAction(ISD::BRIND, MVT::Other, Expand); 118 119 // We need to custom lower some of the intrinsics 120 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 121 122 // Library functions. These default to Expand, but we have instructions 123 // for them. 124 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 125 setOperationAction(ISD::FEXP2, MVT::f32, Legal); 126 setOperationAction(ISD::FPOW, MVT::f32, Legal); 127 setOperationAction(ISD::FLOG2, MVT::f32, Legal); 128 setOperationAction(ISD::FABS, MVT::f32, Legal); 129 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 130 setOperationAction(ISD::FRINT, MVT::f32, Legal); 131 setOperationAction(ISD::FROUND, MVT::f32, Legal); 132 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 133 134 // Lower floating point store/load to integer store/load to reduce the number 135 // of patterns in tablegen. 136 setOperationAction(ISD::STORE, MVT::f32, Promote); 137 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); 138 139 setOperationAction(ISD::STORE, MVT::v2f32, Promote); 140 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); 141 142 setOperationAction(ISD::STORE, MVT::i64, Promote); 143 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); 144 145 setOperationAction(ISD::STORE, MVT::v4f32, Promote); 146 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); 147 148 setOperationAction(ISD::STORE, MVT::v8f32, Promote); 149 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); 150 151 setOperationAction(ISD::STORE, MVT::v16f32, Promote); 152 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); 153 154 setOperationAction(ISD::STORE, MVT::f64, Promote); 155 AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); 156 157 setOperationAction(ISD::STORE, MVT::v2f64, Promote); 158 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); 159 160 // Custom lowering of vector stores is required for local address space 161 // stores. 162 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 163 // XXX: Native v2i32 local address space stores are possible, but not 164 // currently implemented. 165 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 166 167 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); 168 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); 169 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); 170 171 // XXX: This can be change to Custom, once ExpandVectorStores can 172 // handle 64-bit stores. 
173 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); 174 175 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 176 setTruncStoreAction(MVT::i64, MVT::i8, Expand); 177 setTruncStoreAction(MVT::i64, MVT::i1, Expand); 178 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); 179 setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); 180 181 182 setOperationAction(ISD::LOAD, MVT::f32, Promote); 183 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); 184 185 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 186 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); 187 188 setOperationAction(ISD::LOAD, MVT::i64, Promote); 189 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); 190 191 setOperationAction(ISD::LOAD, MVT::v4f32, Promote); 192 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); 193 194 setOperationAction(ISD::LOAD, MVT::v8f32, Promote); 195 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); 196 197 setOperationAction(ISD::LOAD, MVT::v16f32, Promote); 198 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); 199 200 setOperationAction(ISD::LOAD, MVT::f64, Promote); 201 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); 202 203 setOperationAction(ISD::LOAD, MVT::v2f64, Promote); 204 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); 205 206 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 207 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); 208 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); 209 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); 210 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); 211 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); 212 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); 213 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); 214 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); 215 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); 216 217 setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand); 218 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand); 219 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand); 220 setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand); 221 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand); 222 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand); 223 setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand); 224 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand); 225 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand); 226 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand); 227 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand); 228 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand); 229 230 setOperationAction(ISD::BR_CC, MVT::i1, Expand); 231 232 if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { 233 setOperationAction(ISD::FCEIL, MVT::f64, Custom); 234 setOperationAction(ISD::FTRUNC, MVT::f64, Custom); 235 setOperationAction(ISD::FRINT, MVT::f64, Custom); 236 setOperationAction(ISD::FFLOOR, MVT::f64, Custom); 237 } 238 239 if (!Subtarget->hasBFI()) { 240 // fcopysign can be done in a single instruction with BFI. 241 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 242 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 243 } 244 245 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 246 for (MVT VT : ScalarIntVTs) { 247 setOperationAction(ISD::SREM, VT, Expand); 248 setOperationAction(ISD::SDIV, VT, Expand); 249 250 // GPU does not have divrem function for signed or unsigned. 
251 setOperationAction(ISD::SDIVREM, VT, Custom); 252 setOperationAction(ISD::UDIVREM, VT, Custom); 253 254 // GPU does not have [S|U]MUL_LOHI functions as a single instruction. 255 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 256 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 257 258 setOperationAction(ISD::BSWAP, VT, Expand); 259 setOperationAction(ISD::CTTZ, VT, Expand); 260 setOperationAction(ISD::CTLZ, VT, Expand); 261 } 262 263 if (!Subtarget->hasBCNT(32)) 264 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 265 266 if (!Subtarget->hasBCNT(64)) 267 setOperationAction(ISD::CTPOP, MVT::i64, Expand); 268 269 // The hardware supports 32-bit ROTR, but not ROTL. 270 setOperationAction(ISD::ROTL, MVT::i32, Expand); 271 setOperationAction(ISD::ROTL, MVT::i64, Expand); 272 setOperationAction(ISD::ROTR, MVT::i64, Expand); 273 274 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); 275 setOperationAction(ISD::MUL, MVT::i64, Expand); 276 setOperationAction(ISD::MULHU, MVT::i64, Expand); 277 setOperationAction(ISD::MULHS, MVT::i64, Expand); 278 setOperationAction(ISD::UDIV, MVT::i32, Expand); 279 setOperationAction(ISD::UREM, MVT::i32, Expand); 280 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 281 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 282 283 static const MVT::SimpleValueType VectorIntTypes[] = { 284 MVT::v2i32, MVT::v4i32 285 }; 286 287 for (MVT VT : VectorIntTypes) { 288 // Expand the following operations for the current type by default. 289 setOperationAction(ISD::ADD, VT, Expand); 290 setOperationAction(ISD::AND, VT, Expand); 291 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 292 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 293 setOperationAction(ISD::MUL, VT, Expand); 294 setOperationAction(ISD::OR, VT, Expand); 295 setOperationAction(ISD::SHL, VT, Expand); 296 setOperationAction(ISD::SRA, VT, Expand); 297 setOperationAction(ISD::SRL, VT, Expand); 298 setOperationAction(ISD::ROTL, VT, Expand); 299 setOperationAction(ISD::ROTR, VT, Expand); 300 setOperationAction(ISD::SUB, VT, Expand); 301 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 302 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 303 // TODO: Implement custom UREM / SREM routines. 
304 setOperationAction(ISD::SDIV, VT, Expand); 305 setOperationAction(ISD::UDIV, VT, Expand); 306 setOperationAction(ISD::SREM, VT, Expand); 307 setOperationAction(ISD::UREM, VT, Expand); 308 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 309 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 310 setOperationAction(ISD::SDIVREM, VT, Custom); 311 setOperationAction(ISD::UDIVREM, VT, Custom); 312 setOperationAction(ISD::ADDC, VT, Expand); 313 setOperationAction(ISD::SUBC, VT, Expand); 314 setOperationAction(ISD::ADDE, VT, Expand); 315 setOperationAction(ISD::SUBE, VT, Expand); 316 setOperationAction(ISD::SELECT, VT, Expand); 317 setOperationAction(ISD::VSELECT, VT, Expand); 318 setOperationAction(ISD::SELECT_CC, VT, Expand); 319 setOperationAction(ISD::XOR, VT, Expand); 320 setOperationAction(ISD::BSWAP, VT, Expand); 321 setOperationAction(ISD::CTPOP, VT, Expand); 322 setOperationAction(ISD::CTTZ, VT, Expand); 323 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); 324 setOperationAction(ISD::CTLZ, VT, Expand); 325 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); 326 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 327 } 328 329 static const MVT::SimpleValueType FloatVectorTypes[] = { 330 MVT::v2f32, MVT::v4f32 331 }; 332 333 for (MVT VT : FloatVectorTypes) { 334 setOperationAction(ISD::FABS, VT, Expand); 335 setOperationAction(ISD::FADD, VT, Expand); 336 setOperationAction(ISD::FCEIL, VT, Expand); 337 setOperationAction(ISD::FCOS, VT, Expand); 338 setOperationAction(ISD::FDIV, VT, Expand); 339 setOperationAction(ISD::FEXP2, VT, Expand); 340 setOperationAction(ISD::FLOG2, VT, Expand); 341 setOperationAction(ISD::FPOW, VT, Expand); 342 setOperationAction(ISD::FFLOOR, VT, Expand); 343 setOperationAction(ISD::FTRUNC, VT, Expand); 344 setOperationAction(ISD::FMUL, VT, Expand); 345 setOperationAction(ISD::FMA, VT, Expand); 346 setOperationAction(ISD::FRINT, VT, Expand); 347 setOperationAction(ISD::FNEARBYINT, VT, Expand); 348 setOperationAction(ISD::FSQRT, VT, Expand); 349 setOperationAction(ISD::FSIN, VT, Expand); 350 setOperationAction(ISD::FSUB, VT, Expand); 351 setOperationAction(ISD::FNEG, VT, Expand); 352 setOperationAction(ISD::SELECT, VT, Expand); 353 setOperationAction(ISD::VSELECT, VT, Expand); 354 setOperationAction(ISD::SELECT_CC, VT, Expand); 355 setOperationAction(ISD::FCOPYSIGN, VT, Expand); 356 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 357 } 358 359 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); 360 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); 361 362 setTargetDAGCombine(ISD::MUL); 363 setTargetDAGCombine(ISD::SELECT_CC); 364 365 setSchedulingPreference(Sched::RegPressure); 366 setJumpIsExpensive(true); 367 368 setSelectIsExpensive(false); 369 PredictableSelectIsExpensive = false; 370 371 // There are no integer divide instructions, and these expand to a pretty 372 // large sequence of instructions. 373 setIntDivIsCheap(false); 374 setPow2DivIsCheap(false); 375 376 // TODO: Investigate this when 64-bit divides are implemented. 377 addBypassSlowDiv(64, 32); 378 379 // FIXME: Need to really handle these. 
380 MaxStoresPerMemcpy = 4096; 381 MaxStoresPerMemmove = 4096; 382 MaxStoresPerMemset = 4096; 383 } 384 385 //===----------------------------------------------------------------------===// 386 // Target Information 387 //===----------------------------------------------------------------------===// 388 389 MVT AMDGPUTargetLowering::getVectorIdxTy() const { 390 return MVT::i32; 391 } 392 393 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { 394 return true; 395 } 396 397 // The backend supports 32 and 64 bit floating point immediates. 398 // FIXME: Why are we reporting vectors of FP immediates as legal? 399 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 400 EVT ScalarVT = VT.getScalarType(); 401 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); 402 } 403 404 // We don't want to shrink f64 / f32 constants. 405 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { 406 EVT ScalarVT = VT.getScalarType(); 407 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); 408 } 409 410 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, 411 EVT CastTy) const { 412 if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) 413 return true; 414 415 unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); 416 unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); 417 418 return ((LScalarSize <= CastScalarSize) || 419 (CastScalarSize >= 32) || 420 (LScalarSize < 32)); 421 } 422 423 //===---------------------------------------------------------------------===// 424 // Target Properties 425 //===---------------------------------------------------------------------===// 426 427 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { 428 assert(VT.isFloatingPoint()); 429 return VT == MVT::f32; 430 } 431 432 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { 433 assert(VT.isFloatingPoint()); 434 return VT == MVT::f32; 435 } 436 437 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { 438 // Truncate is just accessing a subregister. 439 return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); 440 } 441 442 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { 443 // Truncate is just accessing a subregister. 444 return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && 445 (Dest->getPrimitiveSizeInBits() % 32 == 0); 446 } 447 448 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { 449 const DataLayout *DL = getDataLayout(); 450 unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); 451 unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); 452 453 return SrcSize == 32 && DestSize == 64; 454 } 455 456 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { 457 // Any register load of a 64-bit value really requires 2 32-bit moves. For all 458 // practical purposes, the extra mov 0 to load a 64-bit is free. As used, 459 // this will enable reducing 64-bit operations the 32-bit, which is always 460 // good. 461 return Src == MVT::i32 && Dest == MVT::i64; 462 } 463 464 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 465 return isZExtFree(Val.getValueType(), VT2); 466 } 467 468 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { 469 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a 470 // limited number of native 64-bit operations. Shrinking an operation to fit 471 // in a single 32-bit register should always be helpful. 
As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
                             const SmallVectorImpl<ISD::InputArg> &Ins) const {

  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}

SDValue AMDGPUTargetLowering::LowerReturn(
                                     SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     SDLoc DL, SelectionDAG &DAG) const {
  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = *DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
  DAG.getContext()->diagnose(NoCalls);
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op.getNode()->dump();
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::SDIV: return LowerSDIV(Op, DAG);
  case ISD::SREM: return LowerSREM(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type.
This results in trying to use 557 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do 558 // nothing here and let the illegal result integer be handled normally. 559 return; 560 case ISD::LOAD: { 561 SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); 562 if (!Node) 563 return; 564 565 Results.push_back(SDValue(Node, 0)); 566 Results.push_back(SDValue(Node, 1)); 567 // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode 568 // function 569 DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); 570 return; 571 } 572 case ISD::STORE: { 573 SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); 574 if (Lowered.getNode()) 575 Results.push_back(Lowered); 576 return; 577 } 578 default: 579 return; 580 } 581 } 582 583 // FIXME: This implements accesses to initialized globals in the constant 584 // address space by copying them to private and accessing that. It does not 585 // properly handle illegal types or vectors. The private vector loads are not 586 // scalarized, and the illegal scalars hit an assertion. This technique will not 587 // work well with large initializers, and this should eventually be 588 // removed. Initialized globals should be placed into a data section that the 589 // runtime will load into a buffer before the kernel is executed. Uses of the 590 // global need to be replaced with a pointer loaded from an implicit kernel 591 // argument into this buffer holding the copy of the data, which will remove the 592 // need for any of this. 593 SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, 594 const GlobalValue *GV, 595 const SDValue &InitPtr, 596 SDValue Chain, 597 SelectionDAG &DAG) const { 598 const DataLayout *TD = getTargetMachine().getDataLayout(); 599 SDLoc DL(InitPtr); 600 Type *InitTy = Init->getType(); 601 602 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { 603 EVT VT = EVT::getEVT(InitTy); 604 PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); 605 return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, 606 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, 607 TD->getPrefTypeAlignment(InitTy)); 608 } 609 610 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { 611 EVT VT = EVT::getEVT(CFP->getType()); 612 PointerType *PtrTy = PointerType::get(CFP->getType(), 0); 613 return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr, 614 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, 615 TD->getPrefTypeAlignment(CFP->getType())); 616 } 617 618 if (StructType *ST = dyn_cast<StructType>(InitTy)) { 619 const StructLayout *SL = TD->getStructLayout(ST); 620 621 EVT PtrVT = InitPtr.getValueType(); 622 SmallVector<SDValue, 8> Chains; 623 624 for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { 625 SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT); 626 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); 627 628 Constant *Elt = Init->getAggregateElement(I); 629 Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); 630 } 631 632 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 633 } 634 635 if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) { 636 EVT PtrVT = InitPtr.getValueType(); 637 638 unsigned NumElements; 639 if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy)) 640 NumElements = AT->getNumElements(); 641 else if (VectorType *VT = dyn_cast<VectorType>(SeqTy)) 642 NumElements = VT->getNumElements(); 643 else 644 llvm_unreachable("Unexpected type"); 645 646 
unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); 647 SmallVector<SDValue, 8> Chains; 648 for (unsigned i = 0; i < NumElements; ++i) { 649 SDValue Offset = DAG.getConstant(i * EltSize, PtrVT); 650 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); 651 652 Constant *Elt = Init->getAggregateElement(i); 653 Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); 654 } 655 656 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 657 } 658 659 if (isa<UndefValue>(Init)) { 660 EVT VT = EVT::getEVT(InitTy); 661 PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); 662 return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, 663 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, 664 TD->getPrefTypeAlignment(InitTy)); 665 } 666 667 Init->dump(); 668 llvm_unreachable("Unhandled constant initializer"); 669 } 670 671 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, 672 SDValue Op, 673 SelectionDAG &DAG) const { 674 675 const DataLayout *TD = getTargetMachine().getDataLayout(); 676 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); 677 const GlobalValue *GV = G->getGlobal(); 678 679 switch (G->getAddressSpace()) { 680 default: llvm_unreachable("Global Address lowering not implemented for this " 681 "address space"); 682 case AMDGPUAS::LOCAL_ADDRESS: { 683 // XXX: What does the value of G->getOffset() mean? 684 assert(G->getOffset() == 0 && 685 "Do not know what to do with an non-zero offset"); 686 687 unsigned Offset; 688 if (MFI->LocalMemoryObjects.count(GV) == 0) { 689 uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); 690 Offset = MFI->LDSSize; 691 MFI->LocalMemoryObjects[GV] = Offset; 692 // XXX: Account for alignment? 693 MFI->LDSSize += Size; 694 } else { 695 Offset = MFI->LocalMemoryObjects[GV]; 696 } 697 698 return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace())); 699 } 700 case AMDGPUAS::CONSTANT_ADDRESS: { 701 MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); 702 Type *EltType = GV->getType()->getElementType(); 703 unsigned Size = TD->getTypeAllocSize(EltType); 704 unsigned Alignment = TD->getPrefTypeAlignment(EltType); 705 706 MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); 707 MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); 708 709 int FI = FrameInfo->CreateStackObject(Size, Alignment, false); 710 SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); 711 712 const GlobalVariable *Var = cast<GlobalVariable>(GV); 713 if (!Var->hasInitializer()) { 714 // This has no use, but bugpoint will hit it. 
715 return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); 716 } 717 718 const Constant *Init = Var->getInitializer(); 719 SmallVector<SDNode*, 8> WorkList; 720 721 for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), 722 E = DAG.getEntryNode()->use_end(); I != E; ++I) { 723 if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) 724 continue; 725 WorkList.push_back(*I); 726 } 727 SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); 728 for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(), 729 E = WorkList.end(); I != E; ++I) { 730 SmallVector<SDValue, 8> Ops; 731 Ops.push_back(Chain); 732 for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { 733 Ops.push_back((*I)->getOperand(i)); 734 } 735 DAG.UpdateNodeOperands(*I, Ops); 736 } 737 return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); 738 } 739 } 740 } 741 742 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, 743 SelectionDAG &DAG) const { 744 SmallVector<SDValue, 8> Args; 745 SDValue A = Op.getOperand(0); 746 SDValue B = Op.getOperand(1); 747 748 DAG.ExtractVectorElements(A, Args); 749 DAG.ExtractVectorElements(B, Args); 750 751 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); 752 } 753 754 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 755 SelectionDAG &DAG) const { 756 757 SmallVector<SDValue, 8> Args; 758 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 759 EVT VT = Op.getValueType(); 760 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, 761 VT.getVectorNumElements()); 762 763 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); 764 } 765 766 SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, 767 SelectionDAG &DAG) const { 768 769 MachineFunction &MF = DAG.getMachineFunction(); 770 const AMDGPUFrameLowering *TFL = 771 static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); 772 773 FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); 774 775 unsigned FrameIndex = FIN->getIndex(); 776 unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); 777 return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), 778 Op.getValueType()); 779 } 780 781 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 782 SelectionDAG &DAG) const { 783 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 784 SDLoc DL(Op); 785 EVT VT = Op.getValueType(); 786 787 switch (IntrinsicID) { 788 default: return Op; 789 case AMDGPUIntrinsic::AMDGPU_abs: 790 case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. 791 return LowerIntrinsicIABS(Op, DAG); 792 case AMDGPUIntrinsic::AMDGPU_lrp: 793 return LowerIntrinsicLRP(Op, DAG); 794 case AMDGPUIntrinsic::AMDGPU_fract: 795 case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 796 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 797 798 case AMDGPUIntrinsic::AMDGPU_clamp: 799 case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. 800 return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, 801 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 802 803 case Intrinsic::AMDGPU_div_scale: { 804 // 3rd parameter required to be a constant. 805 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 806 if (!Param) 807 return DAG.getUNDEF(VT); 808 809 // Translate to the operands expected by the machine instruction. The 810 // first parameter must be the same as the first instruction. 
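    // If the constant selector is true (all ones), src0 is the numerator;
    // otherwise it is the denominator.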
811 SDValue Numerator = Op.getOperand(1); 812 SDValue Denominator = Op.getOperand(2); 813 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; 814 815 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT, 816 Src0, Denominator, Numerator); 817 } 818 819 case Intrinsic::AMDGPU_div_fmas: 820 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, 821 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 822 823 case Intrinsic::AMDGPU_div_fixup: 824 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, 825 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 826 827 case Intrinsic::AMDGPU_trig_preop: 828 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, 829 Op.getOperand(1), Op.getOperand(2)); 830 831 case Intrinsic::AMDGPU_rcp: 832 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 833 834 case Intrinsic::AMDGPU_rsq: 835 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 836 837 case AMDGPUIntrinsic::AMDGPU_legacy_rsq: 838 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 839 840 case Intrinsic::AMDGPU_rsq_clamped: 841 return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); 842 843 case AMDGPUIntrinsic::AMDGPU_imax: 844 return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), 845 Op.getOperand(2)); 846 case AMDGPUIntrinsic::AMDGPU_umax: 847 return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), 848 Op.getOperand(2)); 849 case AMDGPUIntrinsic::AMDGPU_imin: 850 return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), 851 Op.getOperand(2)); 852 case AMDGPUIntrinsic::AMDGPU_umin: 853 return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), 854 Op.getOperand(2)); 855 856 case AMDGPUIntrinsic::AMDGPU_umul24: 857 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, 858 Op.getOperand(1), Op.getOperand(2)); 859 860 case AMDGPUIntrinsic::AMDGPU_imul24: 861 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, 862 Op.getOperand(1), Op.getOperand(2)); 863 864 case AMDGPUIntrinsic::AMDGPU_umad24: 865 return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, 866 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 867 868 case AMDGPUIntrinsic::AMDGPU_imad24: 869 return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, 870 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 871 872 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: 873 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); 874 875 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: 876 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); 877 878 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: 879 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); 880 881 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: 882 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); 883 884 case AMDGPUIntrinsic::AMDGPU_bfe_i32: 885 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, 886 Op.getOperand(1), 887 Op.getOperand(2), 888 Op.getOperand(3)); 889 890 case AMDGPUIntrinsic::AMDGPU_bfe_u32: 891 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, 892 Op.getOperand(1), 893 Op.getOperand(2), 894 Op.getOperand(3)); 895 896 case AMDGPUIntrinsic::AMDGPU_bfi: 897 return DAG.getNode(AMDGPUISD::BFI, DL, VT, 898 Op.getOperand(1), 899 Op.getOperand(2), 900 Op.getOperand(3)); 901 902 case AMDGPUIntrinsic::AMDGPU_bfm: 903 return DAG.getNode(AMDGPUISD::BFM, DL, VT, 904 Op.getOperand(1), 905 Op.getOperand(2)); 906 907 case AMDGPUIntrinsic::AMDGPU_brev: 908 return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); 909 910 case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. 
911 return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); 912 913 case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. 914 return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); 915 case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. 916 return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); 917 } 918 } 919 920 ///IABS(a) = SMAX(sub(0, a), a) 921 SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, 922 SelectionDAG &DAG) const { 923 SDLoc DL(Op); 924 EVT VT = Op.getValueType(); 925 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), 926 Op.getOperand(1)); 927 928 return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1)); 929 } 930 931 /// Linear Interpolation 932 /// LRP(a, b, c) = muladd(a, b, (1 - a) * c) 933 SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, 934 SelectionDAG &DAG) const { 935 SDLoc DL(Op); 936 EVT VT = Op.getValueType(); 937 SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, 938 DAG.getConstantFP(1.0f, MVT::f32), 939 Op.getOperand(1)); 940 SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, 941 Op.getOperand(3)); 942 return DAG.getNode(ISD::FADD, DL, VT, 943 DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), 944 OneSubAC); 945 } 946 947 /// \brief Generate Min/Max node 948 SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, 949 SelectionDAG &DAG) const { 950 SDLoc DL(N); 951 EVT VT = N->getValueType(0); 952 953 SDValue LHS = N->getOperand(0); 954 SDValue RHS = N->getOperand(1); 955 SDValue True = N->getOperand(2); 956 SDValue False = N->getOperand(3); 957 SDValue CC = N->getOperand(4); 958 959 if (VT != MVT::f32 || 960 !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { 961 return SDValue(); 962 } 963 964 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 965 switch (CCOpcode) { 966 case ISD::SETOEQ: 967 case ISD::SETONE: 968 case ISD::SETUNE: 969 case ISD::SETNE: 970 case ISD::SETUEQ: 971 case ISD::SETEQ: 972 case ISD::SETFALSE: 973 case ISD::SETFALSE2: 974 case ISD::SETTRUE: 975 case ISD::SETTRUE2: 976 case ISD::SETUO: 977 case ISD::SETO: 978 llvm_unreachable("Operation should already be optimised!"); 979 case ISD::SETULE: 980 case ISD::SETULT: 981 case ISD::SETOLE: 982 case ISD::SETOLT: 983 case ISD::SETLE: 984 case ISD::SETLT: { 985 unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX; 986 return DAG.getNode(Opc, DL, VT, LHS, RHS); 987 } 988 case ISD::SETGT: 989 case ISD::SETGE: 990 case ISD::SETUGE: 991 case ISD::SETOGE: 992 case ISD::SETUGT: 993 case ISD::SETOGT: { 994 unsigned Opc = (LHS == True) ? 
                              AMDGPUISD::FMAX : AMDGPUISD::FMIN;
    return DAG.getNode(Opc, DL, VT, LHS, RHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
  EVT LoadVT = Op.getValueType();
  EVT EltVT = Op.getValueType().getVectorElementType();
  EVT PtrVT = Load->getBasePtr().getValueType();

  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
  SmallVector<SDValue, 8> Loads;
  SmallVector<SDValue, 8> Chains;

  SDLoc SL(Op);

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));

    SDValue NewLoad
      = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
                       Load->getChain(), Ptr,
                       MachinePointerInfo(Load->getMemOperand()->getValue()),
                       MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
                       Load->getAlignment());
    Loads.push_back(NewLoad.getValue(0));
    Chains.push_back(NewLoad.getValue(1));
  }

  SDValue Ops[] = {
    DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
    DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
  };

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemVT = Store->getMemoryVT();
  unsigned MemBits = MemVT.getSizeInBits();

  // Byte stores are really expensive, so if possible, try to pack a 32-bit
  // vector truncating store into an i32 store.
  // XXX: We could also handle other vector bit widths.
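  // For example, a <2 x i16> truncating store is emitted as a single i32
  // store: each element is zero-extended to i32, masked to its memory width,
  // shifted into its lane, and OR'd into one packed word.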
1048 if (!MemVT.isVector() || MemBits > 32) { 1049 return SDValue(); 1050 } 1051 1052 SDLoc DL(Op); 1053 SDValue Value = Store->getValue(); 1054 EVT VT = Value.getValueType(); 1055 EVT ElemVT = VT.getVectorElementType(); 1056 SDValue Ptr = Store->getBasePtr(); 1057 EVT MemEltVT = MemVT.getVectorElementType(); 1058 unsigned MemEltBits = MemEltVT.getSizeInBits(); 1059 unsigned MemNumElements = MemVT.getVectorNumElements(); 1060 unsigned PackedSize = MemVT.getStoreSizeInBits(); 1061 SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32); 1062 1063 assert(Value.getValueType().getScalarSizeInBits() >= 32); 1064 1065 SDValue PackedValue; 1066 for (unsigned i = 0; i < MemNumElements; ++i) { 1067 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, 1068 DAG.getConstant(i, MVT::i32)); 1069 Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); 1070 Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg 1071 1072 SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32); 1073 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); 1074 1075 if (i == 0) { 1076 PackedValue = Elt; 1077 } else { 1078 PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); 1079 } 1080 } 1081 1082 if (PackedSize < 32) { 1083 EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); 1084 return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, 1085 Store->getMemOperand()->getPointerInfo(), 1086 PackedVT, 1087 Store->isNonTemporal(), Store->isVolatile(), 1088 Store->getAlignment()); 1089 } 1090 1091 return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, 1092 Store->getMemOperand()->getPointerInfo(), 1093 Store->isVolatile(), Store->isNonTemporal(), 1094 Store->getAlignment()); 1095 } 1096 1097 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, 1098 SelectionDAG &DAG) const { 1099 StoreSDNode *Store = cast<StoreSDNode>(Op); 1100 EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); 1101 EVT EltVT = Store->getValue().getValueType().getVectorElementType(); 1102 EVT PtrVT = Store->getBasePtr().getValueType(); 1103 unsigned NumElts = Store->getMemoryVT().getVectorNumElements(); 1104 SDLoc SL(Op); 1105 1106 SmallVector<SDValue, 8> Chains; 1107 1108 for (unsigned i = 0, e = NumElts; i != e; ++i) { 1109 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, 1110 Store->getValue(), DAG.getConstant(i, MVT::i32)); 1111 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, 1112 Store->getBasePtr(), 1113 DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), 1114 PtrVT)); 1115 Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, 1116 MachinePointerInfo(Store->getMemOperand()->getValue()), 1117 MemEltVT, Store->isVolatile(), Store->isNonTemporal(), 1118 Store->getAlignment())); 1119 } 1120 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); 1121 } 1122 1123 SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 1124 SDLoc DL(Op); 1125 LoadSDNode *Load = cast<LoadSDNode>(Op); 1126 ISD::LoadExtType ExtType = Load->getExtensionType(); 1127 EVT VT = Op.getValueType(); 1128 EVT MemVT = Load->getMemoryVT(); 1129 1130 if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) { 1131 // We can do the extload to 32-bits, and then need to separately extend to 1132 // 64-bits. 
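    // For example, a sextload from i8 to i64 becomes a sextload from i8 to
    // i32 followed by a separate sign_extend of the 32-bit result to i64.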
1133 1134 SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32, 1135 Load->getChain(), 1136 Load->getBasePtr(), 1137 MemVT, 1138 Load->getMemOperand()); 1139 1140 SDValue Ops[] = { 1141 DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32), 1142 ExtLoad32.getValue(1) 1143 }; 1144 1145 return DAG.getMergeValues(Ops, DL); 1146 } 1147 1148 if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { 1149 assert(VT == MVT::i1 && "Only i1 non-extloads expected"); 1150 // FIXME: Copied from PPC 1151 // First, load into 32 bits, then truncate to 1 bit. 1152 1153 SDValue Chain = Load->getChain(); 1154 SDValue BasePtr = Load->getBasePtr(); 1155 MachineMemOperand *MMO = Load->getMemOperand(); 1156 1157 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, 1158 BasePtr, MVT::i8, MMO); 1159 1160 SDValue Ops[] = { 1161 DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), 1162 NewLD.getValue(1) 1163 }; 1164 1165 return DAG.getMergeValues(Ops, DL); 1166 } 1167 1168 // Lower loads constant address space global variable loads 1169 if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && 1170 isa<GlobalVariable>( 1171 GetUnderlyingObject(Load->getMemOperand()->getValue()))) { 1172 1173 1174 SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL, 1175 getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); 1176 Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, 1177 DAG.getConstant(2, MVT::i32)); 1178 return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), 1179 Load->getChain(), Ptr, 1180 DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2)); 1181 } 1182 1183 if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || 1184 ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) 1185 return SDValue(); 1186 1187 1188 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), 1189 DAG.getConstant(2, MVT::i32)); 1190 SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), 1191 Load->getChain(), Ptr, 1192 DAG.getTargetConstant(0, MVT::i32), 1193 Op.getOperand(2)); 1194 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, 1195 Load->getBasePtr(), 1196 DAG.getConstant(0x3, MVT::i32)); 1197 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 1198 DAG.getConstant(3, MVT::i32)); 1199 1200 Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); 1201 1202 EVT MemEltVT = MemVT.getScalarType(); 1203 if (ExtType == ISD::SEXTLOAD) { 1204 SDValue MemEltVTNode = DAG.getValueType(MemEltVT); 1205 1206 SDValue Ops[] = { 1207 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), 1208 Load->getChain() 1209 }; 1210 1211 return DAG.getMergeValues(Ops, DL); 1212 } 1213 1214 SDValue Ops[] = { 1215 DAG.getZeroExtendInReg(Ret, DL, MemEltVT), 1216 Load->getChain() 1217 }; 1218 1219 return DAG.getMergeValues(Ops, DL); 1220 } 1221 1222 SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1223 SDLoc DL(Op); 1224 SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); 1225 if (Result.getNode()) { 1226 return Result; 1227 } 1228 1229 StoreSDNode *Store = cast<StoreSDNode>(Op); 1230 SDValue Chain = Store->getChain(); 1231 if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 1232 Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && 1233 Store->getValue().getValueType().isVector()) { 1234 return SplitVectorStore(Op, DAG); 1235 } 1236 1237 EVT MemVT = Store->getMemoryVT(); 1238 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && 1239 MemVT.bitsLT(MVT::i32)) { 1240 unsigned Mask = 0; 1241 if 
(Store->getMemoryVT() == MVT::i8) {
      Mask = 0xff;
    } else if (Store->getMemoryVT() == MVT::i16) {
      Mask = 0xffff;
    }
    SDValue BasePtr = Store->getBasePtr();
    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
                              DAG.getConstant(2, MVT::i32));
    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));

    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
                                  DAG.getConstant(0x3, MVT::i32));

    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                   DAG.getConstant(3, MVT::i32));

    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                    Store->getValue());

    SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                       MaskedValue, ShiftAmt);

    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                  DAG.getConstant(Mask, MVT::i32), ShiftAmt);
    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
                          DAG.getConstant(0xffffffff, MVT::i32));
    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT OVT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT INTTY;
  MVT FLTTY;
  if (!OVT.isVector()) {
    INTTY = MVT::i32;
    FLTTY = MVT::f32;
  } else if (OVT.getVectorNumElements() == 2) {
    INTTY = MVT::v2i32;
    FLTTY = MVT::v2f32;
  } else if (OVT.getVectorNumElements() == 4) {
    INTTY = MVT::v4i32;
    FLTTY = MVT::v4f32;
  }
  unsigned bitsize = OVT.getScalarType().getSizeInBits();
  // char|short jq = ia ^ ib;
  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);

  // jq = jq >> (bitsize - 2)
  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));

  // jq = jq | 0x1
  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));

  // jq = (int)jq
  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);

  // int ia = (int)LHS;
  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);

  // int ib = (int)RHS;
  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);

  // float fq = native_divide(fa, fb);
  SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);

  // float fr = mad(fqneg, fb, fa);
  SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
                           DAG.getNode(ISD::FMUL, DL, FLTTY, fqneg, fb), fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);

  // int cv = fr >= fb;
  SDValue cv;
  if (INTTY
== MVT::i32) { 1347 cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); 1348 } else { 1349 cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); 1350 } 1351 // jq = (cv ? jq : 0); 1352 jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, 1353 DAG.getConstant(0, OVT)); 1354 // dst = iq + jq; 1355 iq = DAG.getSExtOrTrunc(iq, DL, OVT); 1356 iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); 1357 return iq; 1358 } 1359 1360 SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { 1361 SDLoc DL(Op); 1362 EVT OVT = Op.getValueType(); 1363 SDValue LHS = Op.getOperand(0); 1364 SDValue RHS = Op.getOperand(1); 1365 // The LowerSDIV32 function generates equivalent to the following IL. 1366 // mov r0, LHS 1367 // mov r1, RHS 1368 // ilt r10, r0, 0 1369 // ilt r11, r1, 0 1370 // iadd r0, r0, r10 1371 // iadd r1, r1, r11 1372 // ixor r0, r0, r10 1373 // ixor r1, r1, r11 1374 // udiv r0, r0, r1 1375 // ixor r10, r10, r11 1376 // iadd r0, r0, r10 1377 // ixor DST, r0, r10 1378 1379 // mov r0, LHS 1380 SDValue r0 = LHS; 1381 1382 // mov r1, RHS 1383 SDValue r1 = RHS; 1384 1385 // ilt r10, r0, 0 1386 SDValue r10 = DAG.getSelectCC(DL, 1387 r0, DAG.getConstant(0, OVT), 1388 DAG.getConstant(-1, OVT), 1389 DAG.getConstant(0, OVT), 1390 ISD::SETLT); 1391 1392 // ilt r11, r1, 0 1393 SDValue r11 = DAG.getSelectCC(DL, 1394 r1, DAG.getConstant(0, OVT), 1395 DAG.getConstant(-1, OVT), 1396 DAG.getConstant(0, OVT), 1397 ISD::SETLT); 1398 1399 // iadd r0, r0, r10 1400 r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); 1401 1402 // iadd r1, r1, r11 1403 r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); 1404 1405 // ixor r0, r0, r10 1406 r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); 1407 1408 // ixor r1, r1, r11 1409 r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); 1410 1411 // udiv r0, r0, r1 1412 r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); 1413 1414 // ixor r10, r10, r11 1415 r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); 1416 1417 // iadd r0, r0, r10 1418 r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); 1419 1420 // ixor DST, r0, r10 1421 SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); 1422 return DST; 1423 } 1424 1425 SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { 1426 return SDValue(Op.getNode(), 0); 1427 } 1428 1429 SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { 1430 EVT OVT = Op.getValueType().getScalarType(); 1431 1432 if (OVT == MVT::i64) 1433 return LowerSDIV64(Op, DAG); 1434 1435 if (OVT.getScalarType() == MVT::i32) 1436 return LowerSDIV32(Op, DAG); 1437 1438 if (OVT == MVT::i16 || OVT == MVT::i8) { 1439 // FIXME: We should be checking for the masked bits. This isn't reached 1440 // because i8 and i16 are not legal types. 1441 return LowerSDIV24(Op, DAG); 1442 } 1443 1444 return SDValue(Op.getNode(), 0); 1445 } 1446 1447 SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { 1448 SDLoc DL(Op); 1449 EVT OVT = Op.getValueType(); 1450 SDValue LHS = Op.getOperand(0); 1451 SDValue RHS = Op.getOperand(1); 1452 // The LowerSREM32 function generates equivalent to the following IL. 
  // mov r0, LHS
  // mov r1, RHS
  // ilt r10, r0, 0
  // ilt r11, r1, 0
  // iadd r0, r0, r10
  // iadd r1, r1, r11
  // ixor r0, r0, r10
  // ixor r1, r1, r11
  // udiv r20, r0, r1
  // umul r20, r20, r1
  // sub r0, r0, r20
  // iadd r0, r0, r10
  // ixor DST, r0, r10

  // mov r0, LHS
  SDValue r0 = LHS;

  // mov r1, RHS
  SDValue r1 = RHS;

  // ilt r10, r0, 0
  SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);

  // ilt r11, r1, 0
  SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);

  // iadd r0, r0, r10
  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);

  // iadd r1, r1, r11
  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);

  // ixor r0, r0, r10
  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);

  // ixor r1, r1, r11
  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);

  // udiv r20, r0, r1
  SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);

  // umul r20, r20, r1
  r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);

  // sub r0, r0, r20
  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);

  // iadd r0, r0, r10
  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);

  // ixor DST, r0, r10
  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
  return DST;
}

SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
  return SDValue(Op.getNode(), 0);
}

SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
  EVT OVT = Op.getValueType();

  if (OVT.getScalarType() == MVT::i64)
    return LowerSREM64(Op, DAG);

  if (OVT.getScalarType() == MVT::i32)
    return LowerSREM32(Op, DAG);

  return SDValue(Op.getNode(), 0);
}

SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);

  // RCP_LO = umulo(RCP, Den)
  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);

  // RCP_HI = mulhu(RCP, Den)
  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                   RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
                                       NEG_RCP_LO, RCP_LO,
                                       ISD::SETEQ);
  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);

  // RCP_S_E = RCP - E
  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);

  // Tmp0 = (RCP_HI == 0 ?
RCP_A_E : RCP_SUB_E) 1561 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), 1562 RCP_A_E, RCP_S_E, 1563 ISD::SETEQ); 1564 // Quotient = mulhu(Tmp0, Num) 1565 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); 1566 1567 // Num_S_Remainder = Quotient * Den 1568 SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den); 1569 1570 // Remainder = Num - Num_S_Remainder 1571 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); 1572 1573 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) 1574 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, 1575 DAG.getConstant(-1, VT), 1576 DAG.getConstant(0, VT), 1577 ISD::SETUGE); 1578 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) 1579 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, 1580 Num_S_Remainder, 1581 DAG.getConstant(-1, VT), 1582 DAG.getConstant(0, VT), 1583 ISD::SETUGE); 1584 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 1585 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, 1586 Remainder_GE_Zero); 1587 1588 // Calculate Division result: 1589 1590 // Quotient_A_One = Quotient + 1 1591 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, 1592 DAG.getConstant(1, VT)); 1593 1594 // Quotient_S_One = Quotient - 1 1595 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, 1596 DAG.getConstant(1, VT)); 1597 1598 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) 1599 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), 1600 Quotient, Quotient_A_One, ISD::SETEQ); 1601 1602 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) 1603 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), 1604 Quotient_S_One, Div, ISD::SETEQ); 1605 1606 // Calculate Rem result: 1607 1608 // Remainder_S_Den = Remainder - Den 1609 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); 1610 1611 // Remainder_A_Den = Remainder + Den 1612 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); 1613 1614 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) 1615 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), 1616 Remainder, Remainder_S_Den, ISD::SETEQ); 1617 1618 // Rem = (Remainder_GE_Zero == 0 ? 
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Zero = DAG.getConstant(0, VT);
  SDValue NegOne = DAG.getConstant(-1, VT);

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS.

  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);

  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);

  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
  SDValue Rem = Div.getValue(1);

  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);

  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Res, DL);
}

SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
  const SDValue One = DAG.getConstantFP(1.0, MVT::f64);

  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);

  SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

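// f64 trunc by bit manipulation: the fraction bits that fall below the
// binary point for the value's unbiased exponent are masked off. An exponent
// below zero means |Src| < 1.0, so only the sign bit survives (giving +/-0.0);
// an exponent above 51 means the value is already integral, so Src is
// returned unchanged.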
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(0, MVT::i32);
  const SDValue One = DAG.getConstant(1, MVT::i32);

  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);

  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  // Extract the exponent as an unsigned bitfield.
  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                Hi,
                                DAG.getConstant(FractBits - 32, MVT::i32),
                                DAG.getConstant(ExpBits, MVT::i32));
  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
                            DAG.getConstant(1023, MVT::i32));

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                  Zero, SignBit);
  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);

  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
  const SDValue FractMask
    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);

  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);

  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);

  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);

  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
}

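// f64 rint via the add/subtract-2^52 trick: adding and then subtracting a
// copysigned 2^52 forces the fractional bits to be rounded away at the
// integer boundary in the current (nearest-even) rounding mode. Magnitudes
// above 0x1.fffffffffffffp+51 are already integral and are passed through
// unchanged.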
SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
  SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);

  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);

  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);

  APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
  SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);

  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);

  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
}

SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
                                              SelectionDAG &DAG) const {
  // FNEARBYINT and FRINT are the same, except in their handling of FP
  // exceptions. Those aren't really meaningful for us, and OpenCL only has
  // rint, so just treat them as equivalent.
  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(),
                     Op.getOperand(0));
}

SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
  const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64);

  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);

  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue S0 = Op.getOperand(0);
  SDLoc DL(Op);
  if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
    return SDValue();

  // f32 uint_to_fp i64: convert each 32-bit half and combine as
  // float(lo) + float(hi) * 2^32.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
                           DAG.getConstant(0, MVT::i32));
  SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
                           DAG.getConstant(1, MVT::i32));
  SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
  FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
                        DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
  return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
}

SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
                                                      unsigned BitsDiff,
                                                      SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue Shift = DAG.getConstant(BitsDiff, VT);
  // Shift left by 'Shift' bits.
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
  // Signed shift right by 'Shift' bits.
  return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
}

SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();

  if (!VT.isVector())
    return SDValue();

  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Don't scalarize on Evergreen?
  unsigned NElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> Args;
  DAG.ExtractVectorElements(Src, Args, 0, NElts);

  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (unsigned I = 0; I < NElts; ++I)
    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);

  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

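// Helpers for the 24-bit multiply combine below. performMulCombine() may only
// form MUL_U24 / MUL_I24 (guarded by Subtarget->hasMulU24() / hasMulI24())
// when each operand is provably representable in 24 bits; these predicates
// establish that from the known bits and the known sign bits respectively.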
static bool isU24(SDValue Op, SelectionDAG &DAG) {
  APInt KnownZero, KnownOne;
  EVT VT = Op.getValueType();
  DAG.computeKnownBits(Op, KnownZero, KnownOne);

  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
}

static bool isI24(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be
                                     // treated as unsigned 24-bit values.
         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
}

static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = Op.getValueType();

  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
  APInt KnownZero, KnownOne;
  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
  if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
    DCI.CommitTargetLoweringOpt(TLO);
}

template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
                               uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
    return DAG.getConstant(Result, MVT::i32);
  }

  return DAG.getConstant(Src0 >> Offset, MVT::i32);
}

SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (VT.isVector() || VT.getSizeInBits() > 32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Mul;

  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
  } else {
    return SDValue();
  }

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Mul, DL, VT);
}

SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  default: break;
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_I24:
  case AMDGPUISD::MUL_U24: {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    simplifyI24(N0, DCI);
    simplifyI24(N1, DCI);
    return SDValue();
  }
  case ISD::SELECT_CC: {
    return CombineMinMax(N, DAG);
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(0, MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
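      // A BFE_I32 of width W at offset 0 leaves bits [W-1, 31] all equal to
      // bit W-1, i.e. 32 - W + 1 sign bits; a BFE_U32 zeroes bits [W, 31],
      // which counts as 32 - W sign bits.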
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of
        // existing DAG Combines. If not eliminated, we will match back to BFE
        // during selection.

        // TODO: The sext_inreg of extended types ends up as multiple
        // operations, although we could handle them in a single BFE.
        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
                           DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    }

    if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        Val->getSExtValue(),
                                        OffsetVal,
                                        WidthVal);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       Val->getZExtValue(),
                                       OffsetVal,
                                       WidthVal);
    }

    APInt Demanded = APInt::getBitsSet(32,
                                       OffsetVal,
                                       OffsetVal + WidthVal);

    if ((OffsetVal + WidthVal) >= 32) {
      SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                         BitsFrom, ShiftVal);
    }

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
        TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne,
                                 TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }

    break;
  }
  }
  return SDValue();
}

//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//

void AMDGPUTargetLowering::getOriginalFunctionArgs(
                               SelectionDAG &DAG,
                               const Function *F,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
                               SmallVectorImpl<ISD::InputArg> &OrigIns) const {

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    if (Ins[i].ArgVT == Ins[i].VT) {
      OrigIns.push_back(Ins[i]);
      continue;
    }

    EVT VT;
    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
      // Vector has been split into scalars.
      VT = Ins[i].ArgVT.getVectorElementType();
    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
               Ins[i].ArgVT.getVectorElementType() !=
               Ins[i].VT.getVectorElementType()) {
      // Vector elements have been promoted.
      VT = Ins[i].ArgVT;
    } else {
      // Vector has been split into smaller vectors.
      VT = Ins[i].VT;
    }

    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
    OrigIns.push_back(Arg);
  }
}

bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->isExactlyValue(1.0);
  }
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
    return C->isAllOnesValue();
  }
  return false;
}

bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->getValueAPF().isZero();
  }
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
    return C->isNullValue();
  }
  return false;
}

SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                 const TargetRegisterClass *RC,
                                                   unsigned Reg, EVT VT) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned VirtualRegister;
  if (!MRI.isLiveIn(Reg)) {
    VirtualRegister = MRI.createVirtualRegister(RC);
    MRI.addLiveIn(Reg, VirtualRegister);
  } else {
    VirtualRegister = MRI.getLiveInVirtReg(Reg);
  }
  return DAG.getRegister(VirtualRegister, VT);
}

#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return nullptr;
  // AMDIL DAG nodes
  NODE_NAME_CASE(CALL);
  NODE_NAME_CASE(UMUL);
  NODE_NAME_CASE(RET_FLAG);
  NODE_NAME_CASE(BRANCH_COND);

  // AMDGPU DAG nodes
  NODE_NAME_CASE(DWORDADDR)
  NODE_NAME_CASE(FRACT)
  NODE_NAME_CASE(CLAMP)
  NODE_NAME_CASE(FMAX)
  NODE_NAME_CASE(SMAX)
  NODE_NAME_CASE(UMAX)
  NODE_NAME_CASE(FMIN)
  NODE_NAME_CASE(SMIN)
  NODE_NAME_CASE(UMIN)
  NODE_NAME_CASE(URECIP)
  NODE_NAME_CASE(DIV_SCALE)
  NODE_NAME_CASE(DIV_FMAS)
  NODE_NAME_CASE(DIV_FIXUP)
  NODE_NAME_CASE(TRIG_PREOP)
  NODE_NAME_CASE(RCP)
  NODE_NAME_CASE(RSQ)
  NODE_NAME_CASE(RSQ_LEGACY)
  NODE_NAME_CASE(RSQ_CLAMPED)
  NODE_NAME_CASE(DOT4)
  NODE_NAME_CASE(BFE_U32)
  NODE_NAME_CASE(BFE_I32)
  NODE_NAME_CASE(BFI)
  NODE_NAME_CASE(BFM)
  NODE_NAME_CASE(BREV)
  NODE_NAME_CASE(MUL_U24)
  NODE_NAME_CASE(MUL_I24)
  NODE_NAME_CASE(MAD_U24)
  NODE_NAME_CASE(MAD_I24)
  NODE_NAME_CASE(EXPORT)
  NODE_NAME_CASE(CONST_ADDRESS)
  NODE_NAME_CASE(REGISTER_LOAD)
  NODE_NAME_CASE(REGISTER_STORE)
  NODE_NAME_CASE(LOAD_CONSTANT)
  NODE_NAME_CASE(LOAD_INPUT)
  NODE_NAME_CASE(SAMPLE)
  NODE_NAME_CASE(SAMPLEB)
  NODE_NAME_CASE(SAMPLED)
  NODE_NAME_CASE(SAMPLEL)
  NODE_NAME_CASE(CVT_F32_UBYTE0)
  NODE_NAME_CASE(CVT_F32_UBYTE1)
  NODE_NAME_CASE(CVT_F32_UBYTE2)
  NODE_NAME_CASE(CVT_F32_UBYTE3)
  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
  NODE_NAME_CASE(STORE_MSKOR)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
  }
}

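// min/max always returns one of its operands, so any bit known to hold the
// same value in both operands holds that value in the result; intersecting
// the operands' known bits is therefore a sound, if conservative, answer.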
static void computeKnownBitsForMinMax(const SDValue Op0,
                                      const SDValue Op1,
                                      APInt &KnownZero,
                                      APInt &KnownOne,
                                      const SelectionDAG &DAG,
                                      unsigned Depth) {
  APInt Op0Zero, Op0One;
  APInt Op1Zero, Op1One;
  DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
  DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);

  KnownZero = Op0Zero & Op1Zero;
  KnownOne = Op0One & Op1One;
}

void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
  const SDValue Op,
  APInt &KnownZero,
  APInt &KnownOne,
  const SelectionDAG &DAG,
  unsigned Depth) const {

  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.

  APInt KnownZero2;
  APInt KnownOne2;
  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case ISD::INTRINSIC_WO_CHAIN: {
    // FIXME: The intrinsic should just use the node.
    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
    case AMDGPUIntrinsic::AMDGPU_imax:
    case AMDGPUIntrinsic::AMDGPU_umax:
    case AMDGPUIntrinsic::AMDGPU_imin:
    case AMDGPUIntrinsic::AMDGPU_umin:
      computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
                                KnownZero, KnownOne, DAG, Depth);
      break;
    default:
      break;
    }

    break;
  }
  case AMDGPUISD::SMAX:
  case AMDGPUISD::UMAX:
  case AMDGPUISD::SMIN:
  case AMDGPUISD::UMIN:
    computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
                              KnownZero, KnownOne, DAG, Depth);
    break;

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CWidth)
      return;

    unsigned BitWidth = 32;
    uint32_t Width = CWidth->getZExtValue() & 0x1f;
    if (Width == 0) {
      KnownZero = APInt::getAllOnesValue(BitWidth);
      KnownOne = APInt::getNullValue(BitWidth);
      return;
    }

    // FIXME: This could do a lot more. If offset is 0, should be the same as
    // sign_extend_inreg implementation, but that involves duplicating it.
    if (Opc == AMDGPUISD::BFE_I32)
      KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
    else
      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);

    break;
  }
  }
}

unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
  SDValue Op,
  const SelectionDAG &DAG,
  unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!Width)
      return 1;

    unsigned SignBits = 32 - Width->getZExtValue() + 1;
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
    if (!Offset || !Offset->isNullValue())
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    return std::max(SignBits, Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  default:
    return 1;
  }
}