1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief This is the parent TargetLowering class for hardware code gen 12 /// targets. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPUISelLowering.h" 17 #include "AMDGPU.h" 18 #include "AMDGPUFrameLowering.h" 19 #include "AMDGPUIntrinsicInfo.h" 20 #include "AMDGPURegisterInfo.h" 21 #include "AMDGPUSubtarget.h" 22 #include "R600MachineFunctionInfo.h" 23 #include "SIMachineFunctionInfo.h" 24 #include "llvm/CodeGen/CallingConvLower.h" 25 #include "llvm/CodeGen/MachineFunction.h" 26 #include "llvm/CodeGen/MachineRegisterInfo.h" 27 #include "llvm/CodeGen/SelectionDAG.h" 28 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" 29 #include "llvm/IR/DataLayout.h" 30 #include "llvm/IR/DiagnosticInfo.h" 31 #include "llvm/IR/DiagnosticPrinter.h" 32 33 using namespace llvm; 34 35 namespace { 36 37 /// Diagnostic information for unimplemented or unsupported feature reporting. 38 class DiagnosticInfoUnsupported : public DiagnosticInfo { 39 private: 40 const Twine &Description; 41 const Function &Fn; 42 43 static int KindID; 44 45 static int getKindID() { 46 if (KindID == 0) 47 KindID = llvm::getNextAvailablePluginDiagnosticKind(); 48 return KindID; 49 } 50 51 public: 52 DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, 53 DiagnosticSeverity Severity = DS_Error) 54 : DiagnosticInfo(getKindID(), Severity), 55 Description(Desc), 56 Fn(Fn) { } 57 58 const Function &getFunction() const { return Fn; } 59 const Twine &getDescription() const { return Description; } 60 61 void print(DiagnosticPrinter &DP) const override { 62 DP << "unsupported " << getDescription() << " in " << Fn.getName(); 63 } 64 65 static bool classof(const DiagnosticInfo *DI) { 66 return DI->getKind() == getKindID(); 67 } 68 }; 69 70 int DiagnosticInfoUnsupported::KindID = 0; 71 } 72 73 74 static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, 75 CCValAssign::LocInfo LocInfo, 76 ISD::ArgFlagsTy ArgFlags, CCState &State) { 77 unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), 78 ArgFlags.getOrigAlign()); 79 State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); 80 81 return true; 82 } 83 84 #include "AMDGPUGenCallingConv.inc" 85 86 // Find a larger type to do a load / store of a vector with. 87 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { 88 unsigned StoreSize = VT.getStoreSizeInBits(); 89 if (StoreSize <= 32) 90 return EVT::getIntegerVT(Ctx, StoreSize); 91 92 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); 93 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); 94 } 95 96 // Type for a vector that will be loaded to. 97 EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { 98 unsigned StoreSize = VT.getStoreSizeInBits(); 99 if (StoreSize <= 32) 100 return EVT::getIntegerVT(Ctx, 32); 101 102 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); 103 } 104 105 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, 106 const AMDGPUSubtarget &STI) 107 : TargetLowering(TM), Subtarget(&STI) { 108 setOperationAction(ISD::Constant, MVT::i32, Legal); 109 setOperationAction(ISD::Constant, MVT::i64, Legal); 110 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 111 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 112 113 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 114 setOperationAction(ISD::BRIND, MVT::Other, Expand); 115 116 // We need to custom lower some of the intrinsics 117 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 118 119 // Library functions. These default to Expand, but we have instructions 120 // for them. 121 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 122 setOperationAction(ISD::FEXP2, MVT::f32, Legal); 123 setOperationAction(ISD::FPOW, MVT::f32, Legal); 124 setOperationAction(ISD::FLOG2, MVT::f32, Legal); 125 setOperationAction(ISD::FABS, MVT::f32, Legal); 126 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 127 setOperationAction(ISD::FRINT, MVT::f32, Legal); 128 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 129 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 130 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 131 132 setOperationAction(ISD::FROUND, MVT::f32, Custom); 133 setOperationAction(ISD::FROUND, MVT::f64, Custom); 134 135 setOperationAction(ISD::FREM, MVT::f32, Custom); 136 setOperationAction(ISD::FREM, MVT::f64, Custom); 137 138 // v_mad_f32 does not support denormals according to some sources. 139 if (!Subtarget->hasFP32Denormals()) 140 setOperationAction(ISD::FMAD, MVT::f32, Legal); 141 142 // Expand to fneg + fadd. 143 setOperationAction(ISD::FSUB, MVT::f64, Expand); 144 145 // Lower floating point store/load to integer store/load to reduce the number 146 // of patterns in tablegen. 147 setOperationAction(ISD::STORE, MVT::f32, Promote); 148 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); 149 150 setOperationAction(ISD::STORE, MVT::v2f32, Promote); 151 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); 152 153 setOperationAction(ISD::STORE, MVT::v4f32, Promote); 154 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); 155 156 setOperationAction(ISD::STORE, MVT::v8f32, Promote); 157 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); 158 159 setOperationAction(ISD::STORE, MVT::v16f32, Promote); 160 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); 161 162 setOperationAction(ISD::STORE, MVT::f64, Promote); 163 AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); 164 165 setOperationAction(ISD::STORE, MVT::v2f64, Promote); 166 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); 167 168 // Custom lowering of vector stores is required for local address space 169 // stores. 170 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 171 172 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); 173 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); 174 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); 175 176 // XXX: This can be change to Custom, once ExpandVectorStores can 177 // handle 64-bit stores. 178 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); 179 180 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 181 setTruncStoreAction(MVT::i64, MVT::i8, Expand); 182 setTruncStoreAction(MVT::i64, MVT::i1, Expand); 183 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); 184 setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); 185 186 187 setOperationAction(ISD::LOAD, MVT::f32, Promote); 188 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); 189 190 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 191 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); 192 193 setOperationAction(ISD::LOAD, MVT::v4f32, Promote); 194 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); 195 196 setOperationAction(ISD::LOAD, MVT::v8f32, Promote); 197 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); 198 199 setOperationAction(ISD::LOAD, MVT::v16f32, Promote); 200 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); 201 202 setOperationAction(ISD::LOAD, MVT::f64, Promote); 203 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); 204 205 setOperationAction(ISD::LOAD, MVT::v2f64, Promote); 206 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); 207 208 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 209 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); 210 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); 211 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); 212 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); 213 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); 214 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); 215 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); 216 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); 217 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); 218 219 // There are no 64-bit extloads. These should be done as a 32-bit extload and 220 // an extension to 64-bit. 221 for (MVT VT : MVT::integer_valuetypes()) { 222 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); 223 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); 224 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); 225 } 226 227 for (MVT VT : MVT::integer_vector_valuetypes()) { 228 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); 229 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); 230 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); 231 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); 232 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); 233 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); 234 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); 235 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); 236 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); 237 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); 238 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); 239 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); 240 } 241 242 setOperationAction(ISD::BR_CC, MVT::i1, Expand); 243 244 if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { 245 setOperationAction(ISD::FCEIL, MVT::f64, Custom); 246 setOperationAction(ISD::FTRUNC, MVT::f64, Custom); 247 setOperationAction(ISD::FRINT, MVT::f64, Custom); 248 setOperationAction(ISD::FFLOOR, MVT::f64, Custom); 249 } 250 251 if (!Subtarget->hasBFI()) { 252 // fcopysign can be done in a single instruction with BFI. 253 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 254 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 255 } 256 257 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 258 259 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 260 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 261 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 262 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 263 264 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 265 for (MVT VT : ScalarIntVTs) { 266 setOperationAction(ISD::SREM, VT, Expand); 267 setOperationAction(ISD::SDIV, VT, Expand); 268 269 // GPU does not have divrem function for signed or unsigned. 270 setOperationAction(ISD::SDIVREM, VT, Custom); 271 setOperationAction(ISD::UDIVREM, VT, Custom); 272 273 // GPU does not have [S|U]MUL_LOHI functions as a single instruction. 274 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 275 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 276 277 setOperationAction(ISD::BSWAP, VT, Expand); 278 setOperationAction(ISD::CTTZ, VT, Expand); 279 setOperationAction(ISD::CTLZ, VT, Expand); 280 } 281 282 if (!Subtarget->hasBCNT(32)) 283 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 284 285 if (!Subtarget->hasBCNT(64)) 286 setOperationAction(ISD::CTPOP, MVT::i64, Expand); 287 288 // The hardware supports 32-bit ROTR, but not ROTL. 289 setOperationAction(ISD::ROTL, MVT::i32, Expand); 290 setOperationAction(ISD::ROTL, MVT::i64, Expand); 291 setOperationAction(ISD::ROTR, MVT::i64, Expand); 292 293 setOperationAction(ISD::MUL, MVT::i64, Expand); 294 setOperationAction(ISD::MULHU, MVT::i64, Expand); 295 setOperationAction(ISD::MULHS, MVT::i64, Expand); 296 setOperationAction(ISD::UDIV, MVT::i32, Expand); 297 setOperationAction(ISD::UREM, MVT::i32, Expand); 298 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 299 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 300 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 301 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 302 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 303 304 if (!Subtarget->hasFFBH()) 305 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); 306 307 if (!Subtarget->hasFFBL()) 308 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); 309 310 static const MVT::SimpleValueType VectorIntTypes[] = { 311 MVT::v2i32, MVT::v4i32 312 }; 313 314 for (MVT VT : VectorIntTypes) { 315 // Expand the following operations for the current type by default. 316 setOperationAction(ISD::ADD, VT, Expand); 317 setOperationAction(ISD::AND, VT, Expand); 318 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 319 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 320 setOperationAction(ISD::MUL, VT, Expand); 321 setOperationAction(ISD::OR, VT, Expand); 322 setOperationAction(ISD::SHL, VT, Expand); 323 setOperationAction(ISD::SRA, VT, Expand); 324 setOperationAction(ISD::SRL, VT, Expand); 325 setOperationAction(ISD::ROTL, VT, Expand); 326 setOperationAction(ISD::ROTR, VT, Expand); 327 setOperationAction(ISD::SUB, VT, Expand); 328 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 329 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 330 setOperationAction(ISD::SDIV, VT, Expand); 331 setOperationAction(ISD::UDIV, VT, Expand); 332 setOperationAction(ISD::SREM, VT, Expand); 333 setOperationAction(ISD::UREM, VT, Expand); 334 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 335 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 336 setOperationAction(ISD::SDIVREM, VT, Custom); 337 setOperationAction(ISD::UDIVREM, VT, Custom); 338 setOperationAction(ISD::ADDC, VT, Expand); 339 setOperationAction(ISD::SUBC, VT, Expand); 340 setOperationAction(ISD::ADDE, VT, Expand); 341 setOperationAction(ISD::SUBE, VT, Expand); 342 setOperationAction(ISD::SELECT, VT, Expand); 343 setOperationAction(ISD::VSELECT, VT, Expand); 344 setOperationAction(ISD::SELECT_CC, VT, Expand); 345 setOperationAction(ISD::XOR, VT, Expand); 346 setOperationAction(ISD::BSWAP, VT, Expand); 347 setOperationAction(ISD::CTPOP, VT, Expand); 348 setOperationAction(ISD::CTTZ, VT, Expand); 349 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); 350 setOperationAction(ISD::CTLZ, VT, Expand); 351 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); 352 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 353 } 354 355 static const MVT::SimpleValueType FloatVectorTypes[] = { 356 MVT::v2f32, MVT::v4f32 357 }; 358 359 for (MVT VT : FloatVectorTypes) { 360 setOperationAction(ISD::FABS, VT, Expand); 361 setOperationAction(ISD::FMINNUM, VT, Expand); 362 setOperationAction(ISD::FMAXNUM, VT, Expand); 363 setOperationAction(ISD::FADD, VT, Expand); 364 setOperationAction(ISD::FCEIL, VT, Expand); 365 setOperationAction(ISD::FCOS, VT, Expand); 366 setOperationAction(ISD::FDIV, VT, Expand); 367 setOperationAction(ISD::FEXP2, VT, Expand); 368 setOperationAction(ISD::FLOG2, VT, Expand); 369 setOperationAction(ISD::FREM, VT, Expand); 370 setOperationAction(ISD::FPOW, VT, Expand); 371 setOperationAction(ISD::FFLOOR, VT, Expand); 372 setOperationAction(ISD::FTRUNC, VT, Expand); 373 setOperationAction(ISD::FMUL, VT, Expand); 374 setOperationAction(ISD::FMA, VT, Expand); 375 setOperationAction(ISD::FRINT, VT, Expand); 376 setOperationAction(ISD::FNEARBYINT, VT, Expand); 377 setOperationAction(ISD::FSQRT, VT, Expand); 378 setOperationAction(ISD::FSIN, VT, Expand); 379 setOperationAction(ISD::FSUB, VT, Expand); 380 setOperationAction(ISD::FNEG, VT, Expand); 381 setOperationAction(ISD::SELECT, VT, Expand); 382 setOperationAction(ISD::VSELECT, VT, Expand); 383 setOperationAction(ISD::SELECT_CC, VT, Expand); 384 setOperationAction(ISD::FCOPYSIGN, VT, Expand); 385 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 386 } 387 388 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); 389 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); 390 391 setTargetDAGCombine(ISD::MUL); 392 setTargetDAGCombine(ISD::SELECT); 393 setTargetDAGCombine(ISD::SELECT_CC); 394 setTargetDAGCombine(ISD::STORE); 395 396 setTargetDAGCombine(ISD::FADD); 397 setTargetDAGCombine(ISD::FSUB); 398 399 setBooleanContents(ZeroOrNegativeOneBooleanContent); 400 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 401 402 setSchedulingPreference(Sched::RegPressure); 403 setJumpIsExpensive(true); 404 405 // SI at least has hardware support for floating point exceptions, but no way 406 // of using or handling them is implemented. They are also optional in OpenCL 407 // (Section 7.3) 408 setHasFloatingPointExceptions(false); 409 410 setSelectIsExpensive(false); 411 PredictableSelectIsExpensive = false; 412 413 // There are no integer divide instructions, and these expand to a pretty 414 // large sequence of instructions. 415 setIntDivIsCheap(false); 416 setPow2SDivIsCheap(false); 417 setFsqrtIsCheap(true); 418 419 // FIXME: Need to really handle these. 420 MaxStoresPerMemcpy = 4096; 421 MaxStoresPerMemmove = 4096; 422 MaxStoresPerMemset = 4096; 423 } 424 425 //===----------------------------------------------------------------------===// 426 // Target Information 427 //===----------------------------------------------------------------------===// 428 429 MVT AMDGPUTargetLowering::getVectorIdxTy() const { 430 return MVT::i32; 431 } 432 433 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { 434 return true; 435 } 436 437 // The backend supports 32 and 64 bit floating point immediates. 438 // FIXME: Why are we reporting vectors of FP immediates as legal? 439 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 440 EVT ScalarVT = VT.getScalarType(); 441 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); 442 } 443 444 // We don't want to shrink f64 / f32 constants. 445 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { 446 EVT ScalarVT = VT.getScalarType(); 447 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); 448 } 449 450 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, 451 ISD::LoadExtType, 452 EVT NewVT) const { 453 454 unsigned NewSize = NewVT.getStoreSizeInBits(); 455 456 // If we are reducing to a 32-bit load, this is always better. 457 if (NewSize == 32) 458 return true; 459 460 EVT OldVT = N->getValueType(0); 461 unsigned OldSize = OldVT.getStoreSizeInBits(); 462 463 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar 464 // extloads, so doing one requires using a buffer_load. In cases where we 465 // still couldn't use a scalar load, using the wider load shouldn't really 466 // hurt anything. 467 468 // If the old size already had to be an extload, there's no harm in continuing 469 // to reduce the width. 470 return (OldSize < 32); 471 } 472 473 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, 474 EVT CastTy) const { 475 if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) 476 return true; 477 478 unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); 479 unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); 480 481 return ((LScalarSize <= CastScalarSize) || 482 (CastScalarSize >= 32) || 483 (LScalarSize < 32)); 484 } 485 486 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also 487 // profitable with the expansion for 64-bit since it's generally good to 488 // speculate things. 489 // FIXME: These should really have the size as a parameter. 490 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { 491 return true; 492 } 493 494 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { 495 return true; 496 } 497 498 //===---------------------------------------------------------------------===// 499 // Target Properties 500 //===---------------------------------------------------------------------===// 501 502 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { 503 assert(VT.isFloatingPoint()); 504 return VT == MVT::f32 || VT == MVT::f64; 505 } 506 507 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { 508 assert(VT.isFloatingPoint()); 509 return VT == MVT::f32 || VT == MVT::f64; 510 } 511 512 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { 513 // Truncate is just accessing a subregister. 514 return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); 515 } 516 517 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { 518 // Truncate is just accessing a subregister. 519 return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && 520 (Dest->getPrimitiveSizeInBits() % 32 == 0); 521 } 522 523 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { 524 const DataLayout *DL = getDataLayout(); 525 unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); 526 unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); 527 528 return SrcSize == 32 && DestSize == 64; 529 } 530 531 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { 532 // Any register load of a 64-bit value really requires 2 32-bit moves. For all 533 // practical purposes, the extra mov 0 to load a 64-bit is free. As used, 534 // this will enable reducing 64-bit operations the 32-bit, which is always 535 // good. 536 return Src == MVT::i32 && Dest == MVT::i64; 537 } 538 539 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 540 return isZExtFree(Val.getValueType(), VT2); 541 } 542 543 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { 544 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a 545 // limited number of native 64-bit operations. Shrinking an operation to fit 546 // in a single 32-bit register should always be helpful. As currently used, 547 // this is much less general than the name suggests, and is only used in 548 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is 549 // not profitable, and may actually be harmful. 550 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; 551 } 552 553 //===---------------------------------------------------------------------===// 554 // TargetLowering Callbacks 555 //===---------------------------------------------------------------------===// 556 557 void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, 558 const SmallVectorImpl<ISD::InputArg> &Ins) const { 559 560 State.AnalyzeFormalArguments(Ins, CC_AMDGPU); 561 } 562 563 SDValue AMDGPUTargetLowering::LowerReturn( 564 SDValue Chain, 565 CallingConv::ID CallConv, 566 bool isVarArg, 567 const SmallVectorImpl<ISD::OutputArg> &Outs, 568 const SmallVectorImpl<SDValue> &OutVals, 569 SDLoc DL, SelectionDAG &DAG) const { 570 return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); 571 } 572 573 //===---------------------------------------------------------------------===// 574 // Target specific lowering 575 //===---------------------------------------------------------------------===// 576 577 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, 578 SmallVectorImpl<SDValue> &InVals) const { 579 SDValue Callee = CLI.Callee; 580 SelectionDAG &DAG = CLI.DAG; 581 582 const Function &Fn = *DAG.getMachineFunction().getFunction(); 583 584 StringRef FuncName("<unknown>"); 585 586 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) 587 FuncName = G->getSymbol(); 588 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 589 FuncName = G->getGlobal()->getName(); 590 591 DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); 592 DAG.getContext()->diagnose(NoCalls); 593 return SDValue(); 594 } 595 596 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, 597 SelectionDAG &DAG) const { 598 switch (Op.getOpcode()) { 599 default: 600 Op.getNode()->dump(); 601 llvm_unreachable("Custom lowering code for this" 602 "instruction is not implemented yet!"); 603 break; 604 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); 605 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 606 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 607 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); 608 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 609 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); 610 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); 611 case ISD::FREM: return LowerFREM(Op, DAG); 612 case ISD::FCEIL: return LowerFCEIL(Op, DAG); 613 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); 614 case ISD::FRINT: return LowerFRINT(Op, DAG); 615 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); 616 case ISD::FROUND: return LowerFROUND(Op, DAG); 617 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); 618 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 619 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 620 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 621 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 622 } 623 return Op; 624 } 625 626 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, 627 SmallVectorImpl<SDValue> &Results, 628 SelectionDAG &DAG) const { 629 switch (N->getOpcode()) { 630 case ISD::SIGN_EXTEND_INREG: 631 // Different parts of legalization seem to interpret which type of 632 // sign_extend_inreg is the one to check for custom lowering. The extended 633 // from type is what really matters, but some places check for custom 634 // lowering of the result type. This results in trying to use 635 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do 636 // nothing here and let the illegal result integer be handled normally. 637 return; 638 case ISD::LOAD: { 639 SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); 640 if (!Node) 641 return; 642 643 Results.push_back(SDValue(Node, 0)); 644 Results.push_back(SDValue(Node, 1)); 645 // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode 646 // function 647 DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); 648 return; 649 } 650 case ISD::STORE: { 651 SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); 652 if (Lowered.getNode()) 653 Results.push_back(Lowered); 654 return; 655 } 656 default: 657 return; 658 } 659 } 660 661 // FIXME: This implements accesses to initialized globals in the constant 662 // address space by copying them to private and accessing that. It does not 663 // properly handle illegal types or vectors. The private vector loads are not 664 // scalarized, and the illegal scalars hit an assertion. This technique will not 665 // work well with large initializers, and this should eventually be 666 // removed. Initialized globals should be placed into a data section that the 667 // runtime will load into a buffer before the kernel is executed. Uses of the 668 // global need to be replaced with a pointer loaded from an implicit kernel 669 // argument into this buffer holding the copy of the data, which will remove the 670 // need for any of this. 671 SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, 672 const GlobalValue *GV, 673 const SDValue &InitPtr, 674 SDValue Chain, 675 SelectionDAG &DAG) const { 676 const DataLayout *TD = getDataLayout(); 677 SDLoc DL(InitPtr); 678 Type *InitTy = Init->getType(); 679 680 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { 681 EVT VT = EVT::getEVT(InitTy); 682 PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); 683 return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, 684 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, 685 TD->getPrefTypeAlignment(InitTy)); 686 } 687 688 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { 689 EVT VT = EVT::getEVT(CFP->getType()); 690 PointerType *PtrTy = PointerType::get(CFP->getType(), 0); 691 return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr, 692 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, 693 TD->getPrefTypeAlignment(CFP->getType())); 694 } 695 696 if (StructType *ST = dyn_cast<StructType>(InitTy)) { 697 const StructLayout *SL = TD->getStructLayout(ST); 698 699 EVT PtrVT = InitPtr.getValueType(); 700 SmallVector<SDValue, 8> Chains; 701 702 for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { 703 SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT); 704 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); 705 706 Constant *Elt = Init->getAggregateElement(I); 707 Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); 708 } 709 710 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 711 } 712 713 if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) { 714 EVT PtrVT = InitPtr.getValueType(); 715 716 unsigned NumElements; 717 if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy)) 718 NumElements = AT->getNumElements(); 719 else if (VectorType *VT = dyn_cast<VectorType>(SeqTy)) 720 NumElements = VT->getNumElements(); 721 else 722 llvm_unreachable("Unexpected type"); 723 724 unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); 725 SmallVector<SDValue, 8> Chains; 726 for (unsigned i = 0; i < NumElements; ++i) { 727 SDValue Offset = DAG.getConstant(i * EltSize, PtrVT); 728 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); 729 730 Constant *Elt = Init->getAggregateElement(i); 731 Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); 732 } 733 734 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 735 } 736 737 if (isa<UndefValue>(Init)) { 738 EVT VT = EVT::getEVT(InitTy); 739 PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); 740 return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, 741 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, 742 TD->getPrefTypeAlignment(InitTy)); 743 } 744 745 Init->dump(); 746 llvm_unreachable("Unhandled constant initializer"); 747 } 748 749 static bool hasDefinedInitializer(const GlobalValue *GV) { 750 const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); 751 if (!GVar || !GVar->hasInitializer()) 752 return false; 753 754 if (isa<UndefValue>(GVar->getInitializer())) 755 return false; 756 757 return true; 758 } 759 760 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, 761 SDValue Op, 762 SelectionDAG &DAG) const { 763 764 const DataLayout *TD = getDataLayout(); 765 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); 766 const GlobalValue *GV = G->getGlobal(); 767 768 switch (G->getAddressSpace()) { 769 case AMDGPUAS::LOCAL_ADDRESS: { 770 // XXX: What does the value of G->getOffset() mean? 771 assert(G->getOffset() == 0 && 772 "Do not know what to do with an non-zero offset"); 773 774 // TODO: We could emit code to handle the initialization somewhere. 775 if (hasDefinedInitializer(GV)) 776 break; 777 778 unsigned Offset; 779 if (MFI->LocalMemoryObjects.count(GV) == 0) { 780 uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); 781 Offset = MFI->LDSSize; 782 MFI->LocalMemoryObjects[GV] = Offset; 783 // XXX: Account for alignment? 784 MFI->LDSSize += Size; 785 } else { 786 Offset = MFI->LocalMemoryObjects[GV]; 787 } 788 789 return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); 790 } 791 case AMDGPUAS::CONSTANT_ADDRESS: { 792 MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); 793 Type *EltType = GV->getType()->getElementType(); 794 unsigned Size = TD->getTypeAllocSize(EltType); 795 unsigned Alignment = TD->getPrefTypeAlignment(EltType); 796 797 MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); 798 MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); 799 800 int FI = FrameInfo->CreateStackObject(Size, Alignment, false); 801 SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); 802 803 const GlobalVariable *Var = cast<GlobalVariable>(GV); 804 if (!Var->hasInitializer()) { 805 // This has no use, but bugpoint will hit it. 806 return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); 807 } 808 809 const Constant *Init = Var->getInitializer(); 810 SmallVector<SDNode*, 8> WorkList; 811 812 for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), 813 E = DAG.getEntryNode()->use_end(); I != E; ++I) { 814 if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) 815 continue; 816 WorkList.push_back(*I); 817 } 818 SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); 819 for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(), 820 E = WorkList.end(); I != E; ++I) { 821 SmallVector<SDValue, 8> Ops; 822 Ops.push_back(Chain); 823 for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { 824 Ops.push_back((*I)->getOperand(i)); 825 } 826 DAG.UpdateNodeOperands(*I, Ops); 827 } 828 return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); 829 } 830 } 831 832 const Function &Fn = *DAG.getMachineFunction().getFunction(); 833 DiagnosticInfoUnsupported BadInit(Fn, 834 "initializer for address space"); 835 DAG.getContext()->diagnose(BadInit); 836 return SDValue(); 837 } 838 839 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, 840 SelectionDAG &DAG) const { 841 SmallVector<SDValue, 8> Args; 842 SDValue A = Op.getOperand(0); 843 SDValue B = Op.getOperand(1); 844 845 DAG.ExtractVectorElements(A, Args); 846 DAG.ExtractVectorElements(B, Args); 847 848 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); 849 } 850 851 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 852 SelectionDAG &DAG) const { 853 854 SmallVector<SDValue, 8> Args; 855 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 856 EVT VT = Op.getValueType(); 857 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, 858 VT.getVectorNumElements()); 859 860 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); 861 } 862 863 SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, 864 SelectionDAG &DAG) const { 865 866 MachineFunction &MF = DAG.getMachineFunction(); 867 const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); 868 869 FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); 870 871 unsigned FrameIndex = FIN->getIndex(); 872 unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); 873 return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), 874 Op.getValueType()); 875 } 876 877 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 878 SelectionDAG &DAG) const { 879 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 880 SDLoc DL(Op); 881 EVT VT = Op.getValueType(); 882 883 switch (IntrinsicID) { 884 default: return Op; 885 case AMDGPUIntrinsic::AMDGPU_abs: 886 case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. 887 return LowerIntrinsicIABS(Op, DAG); 888 case AMDGPUIntrinsic::AMDGPU_lrp: 889 return LowerIntrinsicLRP(Op, DAG); 890 891 case AMDGPUIntrinsic::AMDGPU_clamp: 892 case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. 893 return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, 894 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 895 896 case Intrinsic::AMDGPU_div_scale: { 897 // 3rd parameter required to be a constant. 898 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 899 if (!Param) 900 return DAG.getUNDEF(VT); 901 902 // Translate to the operands expected by the machine instruction. The 903 // first parameter must be the same as the first instruction. 904 SDValue Numerator = Op.getOperand(1); 905 SDValue Denominator = Op.getOperand(2); 906 907 // Note this order is opposite of the machine instruction's operations, 908 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The 909 // intrinsic has the numerator as the first operand to match a normal 910 // division operation. 911 912 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; 913 914 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 915 Denominator, Numerator); 916 } 917 918 case Intrinsic::AMDGPU_div_fmas: 919 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, 920 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), 921 Op.getOperand(4)); 922 923 case Intrinsic::AMDGPU_div_fixup: 924 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, 925 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 926 927 case Intrinsic::AMDGPU_trig_preop: 928 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, 929 Op.getOperand(1), Op.getOperand(2)); 930 931 case Intrinsic::AMDGPU_rcp: 932 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 933 934 case Intrinsic::AMDGPU_rsq: 935 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 936 937 case AMDGPUIntrinsic::AMDGPU_legacy_rsq: 938 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 939 940 case Intrinsic::AMDGPU_rsq_clamped: 941 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 942 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 943 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 944 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 945 946 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 947 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, 948 DAG.getConstantFP(Max, VT)); 949 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 950 DAG.getConstantFP(Min, VT)); 951 } else { 952 return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); 953 } 954 955 case Intrinsic::AMDGPU_ldexp: 956 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), 957 Op.getOperand(2)); 958 959 case AMDGPUIntrinsic::AMDGPU_imax: 960 return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), 961 Op.getOperand(2)); 962 case AMDGPUIntrinsic::AMDGPU_umax: 963 return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), 964 Op.getOperand(2)); 965 case AMDGPUIntrinsic::AMDGPU_imin: 966 return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), 967 Op.getOperand(2)); 968 case AMDGPUIntrinsic::AMDGPU_umin: 969 return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), 970 Op.getOperand(2)); 971 972 case AMDGPUIntrinsic::AMDGPU_umul24: 973 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, 974 Op.getOperand(1), Op.getOperand(2)); 975 976 case AMDGPUIntrinsic::AMDGPU_imul24: 977 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, 978 Op.getOperand(1), Op.getOperand(2)); 979 980 case AMDGPUIntrinsic::AMDGPU_umad24: 981 return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, 982 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 983 984 case AMDGPUIntrinsic::AMDGPU_imad24: 985 return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, 986 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 987 988 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: 989 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); 990 991 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: 992 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); 993 994 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: 995 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); 996 997 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: 998 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); 999 1000 case AMDGPUIntrinsic::AMDGPU_bfe_i32: 1001 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, 1002 Op.getOperand(1), 1003 Op.getOperand(2), 1004 Op.getOperand(3)); 1005 1006 case AMDGPUIntrinsic::AMDGPU_bfe_u32: 1007 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, 1008 Op.getOperand(1), 1009 Op.getOperand(2), 1010 Op.getOperand(3)); 1011 1012 case AMDGPUIntrinsic::AMDGPU_bfi: 1013 return DAG.getNode(AMDGPUISD::BFI, DL, VT, 1014 Op.getOperand(1), 1015 Op.getOperand(2), 1016 Op.getOperand(3)); 1017 1018 case AMDGPUIntrinsic::AMDGPU_bfm: 1019 return DAG.getNode(AMDGPUISD::BFM, DL, VT, 1020 Op.getOperand(1), 1021 Op.getOperand(2)); 1022 1023 case AMDGPUIntrinsic::AMDGPU_brev: 1024 return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); 1025 1026 case Intrinsic::AMDGPU_class: 1027 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, 1028 Op.getOperand(1), Op.getOperand(2)); 1029 1030 case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. 1031 return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); 1032 1033 case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. 1034 return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); 1035 case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. 1036 return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); 1037 } 1038 } 1039 1040 ///IABS(a) = SMAX(sub(0, a), a) 1041 SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, 1042 SelectionDAG &DAG) const { 1043 SDLoc DL(Op); 1044 EVT VT = Op.getValueType(); 1045 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), 1046 Op.getOperand(1)); 1047 1048 return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1)); 1049 } 1050 1051 /// Linear Interpolation 1052 /// LRP(a, b, c) = muladd(a, b, (1 - a) * c) 1053 SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, 1054 SelectionDAG &DAG) const { 1055 SDLoc DL(Op); 1056 EVT VT = Op.getValueType(); 1057 SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, 1058 DAG.getConstantFP(1.0f, MVT::f32), 1059 Op.getOperand(1)); 1060 SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, 1061 Op.getOperand(3)); 1062 return DAG.getNode(ISD::FADD, DL, VT, 1063 DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), 1064 OneSubAC); 1065 } 1066 1067 /// \brief Generate Min/Max node 1068 SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, 1069 EVT VT, 1070 SDValue LHS, 1071 SDValue RHS, 1072 SDValue True, 1073 SDValue False, 1074 SDValue CC, 1075 DAGCombinerInfo &DCI) const { 1076 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1077 return SDValue(); 1078 1079 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) 1080 return SDValue(); 1081 1082 SelectionDAG &DAG = DCI.DAG; 1083 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1084 switch (CCOpcode) { 1085 case ISD::SETOEQ: 1086 case ISD::SETONE: 1087 case ISD::SETUNE: 1088 case ISD::SETNE: 1089 case ISD::SETUEQ: 1090 case ISD::SETEQ: 1091 case ISD::SETFALSE: 1092 case ISD::SETFALSE2: 1093 case ISD::SETTRUE: 1094 case ISD::SETTRUE2: 1095 case ISD::SETUO: 1096 case ISD::SETO: 1097 break; 1098 case ISD::SETULE: 1099 case ISD::SETULT: { 1100 if (LHS == True) 1101 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1102 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1103 } 1104 case ISD::SETOLE: 1105 case ISD::SETOLT: 1106 case ISD::SETLE: 1107 case ISD::SETLT: { 1108 // Ordered. Assume ordered for undefined. 1109 1110 // Only do this after legalization to avoid interfering with other combines 1111 // which might occur. 1112 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1113 !DCI.isCalledByLegalizer()) 1114 return SDValue(); 1115 1116 // We need to permute the operands to get the correct NaN behavior. The 1117 // selected operand is the second one based on the failing compare with NaN, 1118 // so permute it based on the compare type the hardware uses. 1119 if (LHS == True) 1120 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1121 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1122 } 1123 case ISD::SETUGE: 1124 case ISD::SETUGT: { 1125 if (LHS == True) 1126 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1127 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1128 } 1129 case ISD::SETGT: 1130 case ISD::SETGE: 1131 case ISD::SETOGE: 1132 case ISD::SETOGT: { 1133 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1134 !DCI.isCalledByLegalizer()) 1135 return SDValue(); 1136 1137 if (LHS == True) 1138 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1139 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1140 } 1141 case ISD::SETCC_INVALID: 1142 llvm_unreachable("Invalid setcc condcode!"); 1143 } 1144 return SDValue(); 1145 } 1146 1147 /// \brief Generate Min/Max node 1148 SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, 1149 EVT VT, 1150 SDValue LHS, 1151 SDValue RHS, 1152 SDValue True, 1153 SDValue False, 1154 SDValue CC, 1155 SelectionDAG &DAG) const { 1156 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) 1157 return SDValue(); 1158 1159 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1160 switch (CCOpcode) { 1161 case ISD::SETULE: 1162 case ISD::SETULT: { 1163 unsigned Opc = (LHS == True) ? AMDGPUISD::UMIN : AMDGPUISD::UMAX; 1164 return DAG.getNode(Opc, DL, VT, LHS, RHS); 1165 } 1166 case ISD::SETLE: 1167 case ISD::SETLT: { 1168 unsigned Opc = (LHS == True) ? AMDGPUISD::SMIN : AMDGPUISD::SMAX; 1169 return DAG.getNode(Opc, DL, VT, LHS, RHS); 1170 } 1171 case ISD::SETGT: 1172 case ISD::SETGE: { 1173 unsigned Opc = (LHS == True) ? AMDGPUISD::SMAX : AMDGPUISD::SMIN; 1174 return DAG.getNode(Opc, DL, VT, LHS, RHS); 1175 } 1176 case ISD::SETUGE: 1177 case ISD::SETUGT: { 1178 unsigned Opc = (LHS == True) ? AMDGPUISD::UMAX : AMDGPUISD::UMIN; 1179 return DAG.getNode(Opc, DL, VT, LHS, RHS); 1180 } 1181 default: 1182 return SDValue(); 1183 } 1184 } 1185 1186 SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, 1187 SelectionDAG &DAG) const { 1188 LoadSDNode *Load = cast<LoadSDNode>(Op); 1189 EVT MemVT = Load->getMemoryVT(); 1190 EVT MemEltVT = MemVT.getVectorElementType(); 1191 1192 EVT LoadVT = Op.getValueType(); 1193 EVT EltVT = LoadVT.getVectorElementType(); 1194 EVT PtrVT = Load->getBasePtr().getValueType(); 1195 1196 unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); 1197 SmallVector<SDValue, 8> Loads; 1198 SmallVector<SDValue, 8> Chains; 1199 1200 SDLoc SL(Op); 1201 unsigned MemEltSize = MemEltVT.getStoreSize(); 1202 MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); 1203 1204 for (unsigned i = 0; i < NumElts; ++i) { 1205 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), 1206 DAG.getConstant(i * MemEltSize, PtrVT)); 1207 1208 SDValue NewLoad 1209 = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, 1210 Load->getChain(), Ptr, 1211 SrcValue.getWithOffset(i * MemEltSize), 1212 MemEltVT, Load->isVolatile(), Load->isNonTemporal(), 1213 Load->isInvariant(), Load->getAlignment()); 1214 Loads.push_back(NewLoad.getValue(0)); 1215 Chains.push_back(NewLoad.getValue(1)); 1216 } 1217 1218 SDValue Ops[] = { 1219 DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), 1220 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) 1221 }; 1222 1223 return DAG.getMergeValues(Ops, SL); 1224 } 1225 1226 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, 1227 SelectionDAG &DAG) const { 1228 EVT VT = Op.getValueType(); 1229 1230 // If this is a 2 element vector, we really want to scalarize and not create 1231 // weird 1 element vectors. 1232 if (VT.getVectorNumElements() == 2) 1233 return ScalarizeVectorLoad(Op, DAG); 1234 1235 LoadSDNode *Load = cast<LoadSDNode>(Op); 1236 SDValue BasePtr = Load->getBasePtr(); 1237 EVT PtrVT = BasePtr.getValueType(); 1238 EVT MemVT = Load->getMemoryVT(); 1239 SDLoc SL(Op); 1240 MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); 1241 1242 EVT LoVT, HiVT; 1243 EVT LoMemVT, HiMemVT; 1244 SDValue Lo, Hi; 1245 1246 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); 1247 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); 1248 std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); 1249 SDValue LoLoad 1250 = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, 1251 Load->getChain(), BasePtr, 1252 SrcValue, 1253 LoMemVT, Load->isVolatile(), Load->isNonTemporal(), 1254 Load->isInvariant(), Load->getAlignment()); 1255 1256 SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 1257 DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); 1258 1259 SDValue HiLoad 1260 = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, 1261 Load->getChain(), HiPtr, 1262 SrcValue.getWithOffset(LoMemVT.getStoreSize()), 1263 HiMemVT, Load->isVolatile(), Load->isNonTemporal(), 1264 Load->isInvariant(), Load->getAlignment()); 1265 1266 SDValue Ops[] = { 1267 DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), 1268 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 1269 LoLoad.getValue(1), HiLoad.getValue(1)) 1270 }; 1271 1272 return DAG.getMergeValues(Ops, SL); 1273 } 1274 1275 SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, 1276 SelectionDAG &DAG) const { 1277 StoreSDNode *Store = cast<StoreSDNode>(Op); 1278 EVT MemVT = Store->getMemoryVT(); 1279 unsigned MemBits = MemVT.getSizeInBits(); 1280 1281 // Byte stores are really expensive, so if possible, try to pack 32-bit vector 1282 // truncating store into an i32 store. 1283 // XXX: We could also handle optimize other vector bitwidths. 1284 if (!MemVT.isVector() || MemBits > 32) { 1285 return SDValue(); 1286 } 1287 1288 SDLoc DL(Op); 1289 SDValue Value = Store->getValue(); 1290 EVT VT = Value.getValueType(); 1291 EVT ElemVT = VT.getVectorElementType(); 1292 SDValue Ptr = Store->getBasePtr(); 1293 EVT MemEltVT = MemVT.getVectorElementType(); 1294 unsigned MemEltBits = MemEltVT.getSizeInBits(); 1295 unsigned MemNumElements = MemVT.getVectorNumElements(); 1296 unsigned PackedSize = MemVT.getStoreSizeInBits(); 1297 SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32); 1298 1299 assert(Value.getValueType().getScalarSizeInBits() >= 32); 1300 1301 SDValue PackedValue; 1302 for (unsigned i = 0; i < MemNumElements; ++i) { 1303 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, 1304 DAG.getConstant(i, MVT::i32)); 1305 Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); 1306 Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg 1307 1308 SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32); 1309 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); 1310 1311 if (i == 0) { 1312 PackedValue = Elt; 1313 } else { 1314 PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); 1315 } 1316 } 1317 1318 if (PackedSize < 32) { 1319 EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); 1320 return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, 1321 Store->getMemOperand()->getPointerInfo(), 1322 PackedVT, 1323 Store->isNonTemporal(), Store->isVolatile(), 1324 Store->getAlignment()); 1325 } 1326 1327 return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, 1328 Store->getMemOperand()->getPointerInfo(), 1329 Store->isVolatile(), Store->isNonTemporal(), 1330 Store->getAlignment()); 1331 } 1332 1333 SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, 1334 SelectionDAG &DAG) const { 1335 StoreSDNode *Store = cast<StoreSDNode>(Op); 1336 EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); 1337 EVT EltVT = Store->getValue().getValueType().getVectorElementType(); 1338 EVT PtrVT = Store->getBasePtr().getValueType(); 1339 unsigned NumElts = Store->getMemoryVT().getVectorNumElements(); 1340 SDLoc SL(Op); 1341 1342 SmallVector<SDValue, 8> Chains; 1343 1344 unsigned EltSize = MemEltVT.getStoreSize(); 1345 MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); 1346 1347 for (unsigned i = 0, e = NumElts; i != e; ++i) { 1348 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, 1349 Store->getValue(), 1350 DAG.getConstant(i, MVT::i32)); 1351 1352 SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT); 1353 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); 1354 SDValue NewStore = 1355 DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, 1356 SrcValue.getWithOffset(i * EltSize), 1357 MemEltVT, Store->isNonTemporal(), Store->isVolatile(), 1358 Store->getAlignment()); 1359 Chains.push_back(NewStore); 1360 } 1361 1362 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); 1363 } 1364 1365 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, 1366 SelectionDAG &DAG) const { 1367 StoreSDNode *Store = cast<StoreSDNode>(Op); 1368 SDValue Val = Store->getValue(); 1369 EVT VT = Val.getValueType(); 1370 1371 // If this is a 2 element vector, we really want to scalarize and not create 1372 // weird 1 element vectors. 1373 if (VT.getVectorNumElements() == 2) 1374 return ScalarizeVectorStore(Op, DAG); 1375 1376 EVT MemVT = Store->getMemoryVT(); 1377 SDValue Chain = Store->getChain(); 1378 SDValue BasePtr = Store->getBasePtr(); 1379 SDLoc SL(Op); 1380 1381 EVT LoVT, HiVT; 1382 EVT LoMemVT, HiMemVT; 1383 SDValue Lo, Hi; 1384 1385 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); 1386 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); 1387 std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); 1388 1389 EVT PtrVT = BasePtr.getValueType(); 1390 SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 1391 DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); 1392 1393 MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); 1394 SDValue LoStore 1395 = DAG.getTruncStore(Chain, SL, Lo, 1396 BasePtr, 1397 SrcValue, 1398 LoMemVT, 1399 Store->isNonTemporal(), 1400 Store->isVolatile(), 1401 Store->getAlignment()); 1402 SDValue HiStore 1403 = DAG.getTruncStore(Chain, SL, Hi, 1404 HiPtr, 1405 SrcValue.getWithOffset(LoMemVT.getStoreSize()), 1406 HiMemVT, 1407 Store->isNonTemporal(), 1408 Store->isVolatile(), 1409 Store->getAlignment()); 1410 1411 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); 1412 } 1413 1414 1415 SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 1416 SDLoc DL(Op); 1417 LoadSDNode *Load = cast<LoadSDNode>(Op); 1418 ISD::LoadExtType ExtType = Load->getExtensionType(); 1419 EVT VT = Op.getValueType(); 1420 EVT MemVT = Load->getMemoryVT(); 1421 1422 if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { 1423 assert(VT == MVT::i1 && "Only i1 non-extloads expected"); 1424 // FIXME: Copied from PPC 1425 // First, load into 32 bits, then truncate to 1 bit. 1426 1427 SDValue Chain = Load->getChain(); 1428 SDValue BasePtr = Load->getBasePtr(); 1429 MachineMemOperand *MMO = Load->getMemOperand(); 1430 1431 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, 1432 BasePtr, MVT::i8, MMO); 1433 1434 SDValue Ops[] = { 1435 DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), 1436 NewLD.getValue(1) 1437 }; 1438 1439 return DAG.getMergeValues(Ops, DL); 1440 } 1441 1442 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS || 1443 Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || 1444 ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) 1445 return SDValue(); 1446 1447 1448 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), 1449 DAG.getConstant(2, MVT::i32)); 1450 SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), 1451 Load->getChain(), Ptr, 1452 DAG.getTargetConstant(0, MVT::i32), 1453 Op.getOperand(2)); 1454 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, 1455 Load->getBasePtr(), 1456 DAG.getConstant(0x3, MVT::i32)); 1457 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 1458 DAG.getConstant(3, MVT::i32)); 1459 1460 Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); 1461 1462 EVT MemEltVT = MemVT.getScalarType(); 1463 if (ExtType == ISD::SEXTLOAD) { 1464 SDValue MemEltVTNode = DAG.getValueType(MemEltVT); 1465 1466 SDValue Ops[] = { 1467 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), 1468 Load->getChain() 1469 }; 1470 1471 return DAG.getMergeValues(Ops, DL); 1472 } 1473 1474 SDValue Ops[] = { 1475 DAG.getZeroExtendInReg(Ret, DL, MemEltVT), 1476 Load->getChain() 1477 }; 1478 1479 return DAG.getMergeValues(Ops, DL); 1480 } 1481 1482 SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1483 SDLoc DL(Op); 1484 SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); 1485 if (Result.getNode()) { 1486 return Result; 1487 } 1488 1489 StoreSDNode *Store = cast<StoreSDNode>(Op); 1490 SDValue Chain = Store->getChain(); 1491 if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 1492 Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && 1493 Store->getValue().getValueType().isVector()) { 1494 return ScalarizeVectorStore(Op, DAG); 1495 } 1496 1497 EVT MemVT = Store->getMemoryVT(); 1498 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && 1499 MemVT.bitsLT(MVT::i32)) { 1500 unsigned Mask = 0; 1501 if (Store->getMemoryVT() == MVT::i8) { 1502 Mask = 0xff; 1503 } else if (Store->getMemoryVT() == MVT::i16) { 1504 Mask = 0xffff; 1505 } 1506 SDValue BasePtr = Store->getBasePtr(); 1507 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, 1508 DAG.getConstant(2, MVT::i32)); 1509 SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, 1510 Chain, Ptr, DAG.getTargetConstant(0, MVT::i32)); 1511 1512 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, 1513 DAG.getConstant(0x3, MVT::i32)); 1514 1515 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 1516 DAG.getConstant(3, MVT::i32)); 1517 1518 SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, 1519 Store->getValue()); 1520 1521 SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); 1522 1523 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, 1524 MaskedValue, ShiftAmt); 1525 1526 SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32), 1527 ShiftAmt); 1528 DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, 1529 DAG.getConstant(0xffffffff, MVT::i32)); 1530 Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); 1531 1532 SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); 1533 return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 1534 Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32)); 1535 } 1536 return SDValue(); 1537 } 1538 1539 // This is a shortcut for integer division because we have fast i32<->f32 1540 // conversions, and fast f32 reciprocal instructions. The fractional part of a 1541 // float is enough to accurately represent up to a 24-bit integer. 1542 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { 1543 SDLoc DL(Op); 1544 EVT VT = Op.getValueType(); 1545 SDValue LHS = Op.getOperand(0); 1546 SDValue RHS = Op.getOperand(1); 1547 MVT IntVT = MVT::i32; 1548 MVT FltVT = MVT::f32; 1549 1550 ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; 1551 ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; 1552 1553 if (VT.isVector()) { 1554 unsigned NElts = VT.getVectorNumElements(); 1555 IntVT = MVT::getVectorVT(MVT::i32, NElts); 1556 FltVT = MVT::getVectorVT(MVT::f32, NElts); 1557 } 1558 1559 unsigned BitSize = VT.getScalarType().getSizeInBits(); 1560 1561 SDValue jq = DAG.getConstant(1, IntVT); 1562 1563 if (sign) { 1564 // char|short jq = ia ^ ib; 1565 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); 1566 1567 // jq = jq >> (bitsize - 2) 1568 jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT)); 1569 1570 // jq = jq | 0x1 1571 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT)); 1572 1573 // jq = (int)jq 1574 jq = DAG.getSExtOrTrunc(jq, DL, IntVT); 1575 } 1576 1577 // int ia = (int)LHS; 1578 SDValue ia = sign ? 1579 DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); 1580 1581 // int ib, (int)RHS; 1582 SDValue ib = sign ? 1583 DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); 1584 1585 // float fa = (float)ia; 1586 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); 1587 1588 // float fb = (float)ib; 1589 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); 1590 1591 // float fq = native_divide(fa, fb); 1592 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, 1593 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); 1594 1595 // fq = trunc(fq); 1596 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); 1597 1598 // float fqneg = -fq; 1599 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); 1600 1601 // float fr = mad(fqneg, fb, fa); 1602 SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, 1603 DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); 1604 1605 // int iq = (int)fq; 1606 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); 1607 1608 // fr = fabs(fr); 1609 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); 1610 1611 // fb = fabs(fb); 1612 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); 1613 1614 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); 1615 1616 // int cv = fr >= fb; 1617 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); 1618 1619 // jq = (cv ? jq : 0); 1620 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT)); 1621 1622 // dst = trunc/extend to legal type 1623 iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); 1624 1625 // dst = iq + jq; 1626 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); 1627 1628 // Rem needs compensation, it's easier to recompute it 1629 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); 1630 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); 1631 1632 SDValue Res[2] = { 1633 Div, 1634 Rem 1635 }; 1636 return DAG.getMergeValues(Res, DL); 1637 } 1638 1639 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, 1640 SelectionDAG &DAG, 1641 SmallVectorImpl<SDValue> &Results) const { 1642 assert(Op.getValueType() == MVT::i64); 1643 1644 SDLoc DL(Op); 1645 EVT VT = Op.getValueType(); 1646 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 1647 1648 SDValue one = DAG.getConstant(1, HalfVT); 1649 SDValue zero = DAG.getConstant(0, HalfVT); 1650 1651 //HiLo split 1652 SDValue LHS = Op.getOperand(0); 1653 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); 1654 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); 1655 1656 SDValue RHS = Op.getOperand(1); 1657 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); 1658 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); 1659 1660 if (VT == MVT::i64 && 1661 DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && 1662 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { 1663 1664 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 1665 LHS_Lo, RHS_Lo); 1666 1667 SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); 1668 SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); 1669 Results.push_back(DIV); 1670 Results.push_back(REM); 1671 return; 1672 } 1673 1674 // Get Speculative values 1675 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); 1676 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); 1677 1678 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); 1679 SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); 1680 1681 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); 1682 SDValue DIV_Lo = zero; 1683 1684 const unsigned halfBitWidth = HalfVT.getSizeInBits(); 1685 1686 for (unsigned i = 0; i < halfBitWidth; ++i) { 1687 const unsigned bitPos = halfBitWidth - i - 1; 1688 SDValue POS = DAG.getConstant(bitPos, HalfVT); 1689 // Get value of high bit 1690 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); 1691 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); 1692 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); 1693 1694 // Shift 1695 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, VT)); 1696 // Add LHS high bit 1697 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); 1698 1699 SDValue BIT = DAG.getConstant(1 << bitPos, HalfVT); 1700 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); 1701 1702 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); 1703 1704 // Update REM 1705 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); 1706 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); 1707 } 1708 1709 SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); 1710 Results.push_back(DIV); 1711 Results.push_back(REM); 1712 } 1713 1714 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, 1715 SelectionDAG &DAG) const { 1716 SDLoc DL(Op); 1717 EVT VT = Op.getValueType(); 1718 1719 if (VT == MVT::i64) { 1720 SmallVector<SDValue, 2> Results; 1721 LowerUDIVREM64(Op, DAG, Results); 1722 return DAG.getMergeValues(Results, DL); 1723 } 1724 1725 SDValue Num = Op.getOperand(0); 1726 SDValue Den = Op.getOperand(1); 1727 1728 if (VT == MVT::i32) { 1729 if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && 1730 DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) { 1731 // TODO: We technically could do this for i64, but shouldn't that just be 1732 // handled by something generally reducing 64-bit division on 32-bit 1733 // values to 32-bit? 1734 return LowerDIVREM24(Op, DAG, false); 1735 } 1736 } 1737 1738 // RCP = URECIP(Den) = 2^32 / Den + e 1739 // e is rounding error. 1740 SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); 1741 1742 // RCP_LO = mul(RCP, Den) */ 1743 SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); 1744 1745 // RCP_HI = mulhu (RCP, Den) */ 1746 SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); 1747 1748 // NEG_RCP_LO = -RCP_LO 1749 SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), 1750 RCP_LO); 1751 1752 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 1753 SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), 1754 NEG_RCP_LO, RCP_LO, 1755 ISD::SETEQ); 1756 // Calculate the rounding error from the URECIP instruction 1757 // E = mulhu(ABS_RCP_LO, RCP) 1758 SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); 1759 1760 // RCP_A_E = RCP + E 1761 SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); 1762 1763 // RCP_S_E = RCP - E 1764 SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); 1765 1766 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) 1767 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), 1768 RCP_A_E, RCP_S_E, 1769 ISD::SETEQ); 1770 // Quotient = mulhu(Tmp0, Num) 1771 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); 1772 1773 // Num_S_Remainder = Quotient * Den 1774 SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); 1775 1776 // Remainder = Num - Num_S_Remainder 1777 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); 1778 1779 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) 1780 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, 1781 DAG.getConstant(-1, VT), 1782 DAG.getConstant(0, VT), 1783 ISD::SETUGE); 1784 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) 1785 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, 1786 Num_S_Remainder, 1787 DAG.getConstant(-1, VT), 1788 DAG.getConstant(0, VT), 1789 ISD::SETUGE); 1790 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 1791 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, 1792 Remainder_GE_Zero); 1793 1794 // Calculate Division result: 1795 1796 // Quotient_A_One = Quotient + 1 1797 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, 1798 DAG.getConstant(1, VT)); 1799 1800 // Quotient_S_One = Quotient - 1 1801 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, 1802 DAG.getConstant(1, VT)); 1803 1804 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) 1805 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), 1806 Quotient, Quotient_A_One, ISD::SETEQ); 1807 1808 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) 1809 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), 1810 Quotient_S_One, Div, ISD::SETEQ); 1811 1812 // Calculate Rem result: 1813 1814 // Remainder_S_Den = Remainder - Den 1815 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); 1816 1817 // Remainder_A_Den = Remainder + Den 1818 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); 1819 1820 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) 1821 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), 1822 Remainder, Remainder_S_Den, ISD::SETEQ); 1823 1824 // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) 1825 Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), 1826 Remainder_A_Den, Rem, ISD::SETEQ); 1827 SDValue Ops[2] = { 1828 Div, 1829 Rem 1830 }; 1831 return DAG.getMergeValues(Ops, DL); 1832 } 1833 1834 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, 1835 SelectionDAG &DAG) const { 1836 SDLoc DL(Op); 1837 EVT VT = Op.getValueType(); 1838 1839 SDValue LHS = Op.getOperand(0); 1840 SDValue RHS = Op.getOperand(1); 1841 1842 SDValue Zero = DAG.getConstant(0, VT); 1843 SDValue NegOne = DAG.getConstant(-1, VT); 1844 1845 if (VT == MVT::i32 && 1846 DAG.ComputeNumSignBits(LHS) > 8 && 1847 DAG.ComputeNumSignBits(RHS) > 8) { 1848 return LowerDIVREM24(Op, DAG, true); 1849 } 1850 if (VT == MVT::i64 && 1851 DAG.ComputeNumSignBits(LHS) > 32 && 1852 DAG.ComputeNumSignBits(RHS) > 32) { 1853 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 1854 1855 //HiLo split 1856 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); 1857 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); 1858 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 1859 LHS_Lo, RHS_Lo); 1860 SDValue Res[2] = { 1861 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), 1862 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) 1863 }; 1864 return DAG.getMergeValues(Res, DL); 1865 } 1866 1867 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); 1868 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); 1869 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); 1870 SDValue RSign = LHSign; // Remainder sign is the same as LHS 1871 1872 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); 1873 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); 1874 1875 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); 1876 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); 1877 1878 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); 1879 SDValue Rem = Div.getValue(1); 1880 1881 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); 1882 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); 1883 1884 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); 1885 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); 1886 1887 SDValue Res[2] = { 1888 Div, 1889 Rem 1890 }; 1891 return DAG.getMergeValues(Res, DL); 1892 } 1893 1894 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) 1895 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { 1896 SDLoc SL(Op); 1897 EVT VT = Op.getValueType(); 1898 SDValue X = Op.getOperand(0); 1899 SDValue Y = Op.getOperand(1); 1900 1901 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); 1902 SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); 1903 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); 1904 1905 return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); 1906 } 1907 1908 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { 1909 SDLoc SL(Op); 1910 SDValue Src = Op.getOperand(0); 1911 1912 // result = trunc(src) 1913 // if (src > 0.0 && src != result) 1914 // result += 1.0 1915 1916 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 1917 1918 const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); 1919 const SDValue One = DAG.getConstantFP(1.0, MVT::f64); 1920 1921 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); 1922 1923 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); 1924 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 1925 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 1926 1927 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); 1928 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 1929 } 1930 1931 static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { 1932 const unsigned FractBits = 52; 1933 const unsigned ExpBits = 11; 1934 1935 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, 1936 Hi, 1937 DAG.getConstant(FractBits - 32, MVT::i32), 1938 DAG.getConstant(ExpBits, MVT::i32)); 1939 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, 1940 DAG.getConstant(1023, MVT::i32)); 1941 1942 return Exp; 1943 } 1944 1945 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { 1946 SDLoc SL(Op); 1947 SDValue Src = Op.getOperand(0); 1948 1949 assert(Op.getValueType() == MVT::f64); 1950 1951 const SDValue Zero = DAG.getConstant(0, MVT::i32); 1952 const SDValue One = DAG.getConstant(1, MVT::i32); 1953 1954 SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); 1955 1956 // Extract the upper half, since this is where we will find the sign and 1957 // exponent. 1958 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); 1959 1960 SDValue Exp = extractF64Exponent(Hi, SL, DAG); 1961 1962 const unsigned FractBits = 52; 1963 1964 // Extract the sign bit. 1965 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32); 1966 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); 1967 1968 // Extend back to to 64-bits. 1969 SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 1970 Zero, SignBit); 1971 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); 1972 1973 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); 1974 const SDValue FractMask 1975 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64); 1976 1977 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); 1978 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); 1979 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); 1980 1981 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); 1982 1983 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32); 1984 1985 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); 1986 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); 1987 1988 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); 1989 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); 1990 1991 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); 1992 } 1993 1994 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { 1995 SDLoc SL(Op); 1996 SDValue Src = Op.getOperand(0); 1997 1998 assert(Op.getValueType() == MVT::f64); 1999 2000 APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); 2001 SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64); 2002 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); 2003 2004 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); 2005 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); 2006 2007 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); 2008 2009 APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); 2010 SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64); 2011 2012 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); 2013 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); 2014 2015 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); 2016 } 2017 2018 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { 2019 // FNEARBYINT and FRINT are the same, except in their handling of FP 2020 // exceptions. Those aren't really meaningful for us, and OpenCL only has 2021 // rint, so just treat them as equivalent. 2022 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); 2023 } 2024 2025 // XXX - May require not supporting f32 denormals? 2026 SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { 2027 SDLoc SL(Op); 2028 SDValue X = Op.getOperand(0); 2029 2030 SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); 2031 2032 SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); 2033 2034 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); 2035 2036 const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32); 2037 const SDValue One = DAG.getConstantFP(1.0, MVT::f32); 2038 const SDValue Half = DAG.getConstantFP(0.5, MVT::f32); 2039 2040 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); 2041 2042 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); 2043 2044 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); 2045 2046 SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); 2047 2048 return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); 2049 } 2050 2051 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { 2052 SDLoc SL(Op); 2053 SDValue X = Op.getOperand(0); 2054 2055 SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); 2056 2057 const SDValue Zero = DAG.getConstant(0, MVT::i32); 2058 const SDValue One = DAG.getConstant(1, MVT::i32); 2059 const SDValue NegOne = DAG.getConstant(-1, MVT::i32); 2060 const SDValue FiftyOne = DAG.getConstant(51, MVT::i32); 2061 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); 2062 2063 2064 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 2065 2066 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); 2067 2068 SDValue Exp = extractF64Exponent(Hi, SL, DAG); 2069 2070 const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64); 2071 2072 SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); 2073 SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, 2074 DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64), 2075 Exp); 2076 2077 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); 2078 SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, 2079 DAG.getConstant(0, MVT::i64), Tmp0, 2080 ISD::SETNE); 2081 2082 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, 2083 D, DAG.getConstant(0, MVT::i64)); 2084 SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); 2085 2086 K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); 2087 K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); 2088 2089 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); 2090 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); 2091 SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); 2092 2093 SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, 2094 ExpEqNegOne, 2095 DAG.getConstantFP(1.0, MVT::f64), 2096 DAG.getConstantFP(0.0, MVT::f64)); 2097 2098 SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); 2099 2100 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); 2101 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); 2102 2103 return K; 2104 } 2105 2106 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2107 EVT VT = Op.getValueType(); 2108 2109 if (VT == MVT::f32) 2110 return LowerFROUND32(Op, DAG); 2111 2112 if (VT == MVT::f64) 2113 return LowerFROUND64(Op, DAG); 2114 2115 llvm_unreachable("unhandled type"); 2116 } 2117 2118 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { 2119 SDLoc SL(Op); 2120 SDValue Src = Op.getOperand(0); 2121 2122 // result = trunc(src); 2123 // if (src < 0.0 && src != result) 2124 // result += -1.0. 2125 2126 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2127 2128 const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); 2129 const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64); 2130 2131 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); 2132 2133 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); 2134 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2135 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2136 2137 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); 2138 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2139 } 2140 2141 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, 2142 bool Signed) const { 2143 SDLoc SL(Op); 2144 SDValue Src = Op.getOperand(0); 2145 2146 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); 2147 2148 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, 2149 DAG.getConstant(0, MVT::i32)); 2150 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, 2151 DAG.getConstant(1, MVT::i32)); 2152 2153 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, 2154 SL, MVT::f64, Hi); 2155 2156 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); 2157 2158 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, 2159 DAG.getConstant(32, MVT::i32)); 2160 2161 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); 2162 } 2163 2164 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, 2165 SelectionDAG &DAG) const { 2166 SDValue S0 = Op.getOperand(0); 2167 if (S0.getValueType() != MVT::i64) 2168 return SDValue(); 2169 2170 EVT DestVT = Op.getValueType(); 2171 if (DestVT == MVT::f64) 2172 return LowerINT_TO_FP64(Op, DAG, false); 2173 2174 assert(DestVT == MVT::f32); 2175 2176 SDLoc DL(Op); 2177 2178 // f32 uint_to_fp i64 2179 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, 2180 DAG.getConstant(0, MVT::i32)); 2181 SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); 2182 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, 2183 DAG.getConstant(1, MVT::i32)); 2184 SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); 2185 FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, 2186 DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32 2187 return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); 2188 } 2189 2190 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, 2191 SelectionDAG &DAG) const { 2192 SDValue Src = Op.getOperand(0); 2193 if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) 2194 return LowerINT_TO_FP64(Op, DAG, true); 2195 2196 return SDValue(); 2197 } 2198 2199 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, 2200 bool Signed) const { 2201 SDLoc SL(Op); 2202 2203 SDValue Src = Op.getOperand(0); 2204 2205 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2206 2207 SDValue K0 2208 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), MVT::f64); 2209 SDValue K1 2210 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), MVT::f64); 2211 2212 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); 2213 2214 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); 2215 2216 2217 SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); 2218 2219 SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, 2220 MVT::i32, FloorMul); 2221 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); 2222 2223 SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); 2224 2225 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); 2226 } 2227 2228 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, 2229 SelectionDAG &DAG) const { 2230 SDValue Src = Op.getOperand(0); 2231 2232 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) 2233 return LowerFP64_TO_INT(Op, DAG, true); 2234 2235 return SDValue(); 2236 } 2237 2238 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, 2239 SelectionDAG &DAG) const { 2240 SDValue Src = Op.getOperand(0); 2241 2242 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) 2243 return LowerFP64_TO_INT(Op, DAG, false); 2244 2245 return SDValue(); 2246 } 2247 2248 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 2249 SelectionDAG &DAG) const { 2250 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 2251 MVT VT = Op.getSimpleValueType(); 2252 MVT ScalarVT = VT.getScalarType(); 2253 2254 if (!VT.isVector()) 2255 return SDValue(); 2256 2257 SDValue Src = Op.getOperand(0); 2258 SDLoc DL(Op); 2259 2260 // TODO: Don't scalarize on Evergreen? 2261 unsigned NElts = VT.getVectorNumElements(); 2262 SmallVector<SDValue, 8> Args; 2263 DAG.ExtractVectorElements(Src, Args, 0, NElts); 2264 2265 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); 2266 for (unsigned I = 0; I < NElts; ++I) 2267 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); 2268 2269 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); 2270 } 2271 2272 //===----------------------------------------------------------------------===// 2273 // Custom DAG optimizations 2274 //===----------------------------------------------------------------------===// 2275 2276 static bool isU24(SDValue Op, SelectionDAG &DAG) { 2277 APInt KnownZero, KnownOne; 2278 EVT VT = Op.getValueType(); 2279 DAG.computeKnownBits(Op, KnownZero, KnownOne); 2280 2281 return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; 2282 } 2283 2284 static bool isI24(SDValue Op, SelectionDAG &DAG) { 2285 EVT VT = Op.getValueType(); 2286 2287 // In order for this to be a signed 24-bit value, bit 23, must 2288 // be a sign bit. 2289 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated 2290 // as unsigned 24-bit values. 2291 (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; 2292 } 2293 2294 static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { 2295 2296 SelectionDAG &DAG = DCI.DAG; 2297 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 2298 EVT VT = Op.getValueType(); 2299 2300 APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); 2301 APInt KnownZero, KnownOne; 2302 TargetLowering::TargetLoweringOpt TLO(DAG, true, true); 2303 if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) 2304 DCI.CommitTargetLoweringOpt(TLO); 2305 } 2306 2307 template <typename IntTy> 2308 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, 2309 uint32_t Offset, uint32_t Width) { 2310 if (Width + Offset < 32) { 2311 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); 2312 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); 2313 return DAG.getConstant(Result, MVT::i32); 2314 } 2315 2316 return DAG.getConstant(Src0 >> Offset, MVT::i32); 2317 } 2318 2319 static bool usesAllNormalStores(SDNode *LoadVal) { 2320 for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { 2321 if (!ISD::isNormalStore(*I)) 2322 return false; 2323 } 2324 2325 return true; 2326 } 2327 2328 // If we have a copy of an illegal type, replace it with a load / store of an 2329 // equivalently sized legal type. This avoids intermediate bit pack / unpack 2330 // instructions emitted when handling extloads and truncstores. Ideally we could 2331 // recognize the pack / unpack pattern to eliminate it. 2332 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, 2333 DAGCombinerInfo &DCI) const { 2334 if (!DCI.isBeforeLegalize()) 2335 return SDValue(); 2336 2337 StoreSDNode *SN = cast<StoreSDNode>(N); 2338 SDValue Value = SN->getValue(); 2339 EVT VT = Value.getValueType(); 2340 2341 if (isTypeLegal(VT) || SN->isVolatile() || 2342 !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) 2343 return SDValue(); 2344 2345 LoadSDNode *LoadVal = cast<LoadSDNode>(Value); 2346 if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) 2347 return SDValue(); 2348 2349 EVT MemVT = LoadVal->getMemoryVT(); 2350 2351 SDLoc SL(N); 2352 SelectionDAG &DAG = DCI.DAG; 2353 EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); 2354 2355 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, 2356 LoadVT, SL, 2357 LoadVal->getChain(), 2358 LoadVal->getBasePtr(), 2359 LoadVal->getOffset(), 2360 LoadVT, 2361 LoadVal->getMemOperand()); 2362 2363 SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); 2364 DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); 2365 2366 return DAG.getStore(SN->getChain(), SL, NewLoad, 2367 SN->getBasePtr(), SN->getMemOperand()); 2368 } 2369 2370 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, 2371 DAGCombinerInfo &DCI) const { 2372 EVT VT = N->getValueType(0); 2373 2374 if (VT.isVector() || VT.getSizeInBits() > 32) 2375 return SDValue(); 2376 2377 SelectionDAG &DAG = DCI.DAG; 2378 SDLoc DL(N); 2379 2380 SDValue N0 = N->getOperand(0); 2381 SDValue N1 = N->getOperand(1); 2382 SDValue Mul; 2383 2384 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 2385 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 2386 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 2387 Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); 2388 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 2389 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 2390 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 2391 Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); 2392 } else { 2393 return SDValue(); 2394 } 2395 2396 // We need to use sext even for MUL_U24, because MUL_U24 is used 2397 // for signed multiply of 8 and 16-bit types. 2398 return DAG.getSExtOrTrunc(Mul, DL, VT); 2399 } 2400 2401 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, 2402 DAGCombinerInfo &DCI) const { 2403 SelectionDAG &DAG = DCI.DAG; 2404 SDLoc DL(N); 2405 2406 switch(N->getOpcode()) { 2407 default: break; 2408 case ISD::MUL: 2409 return performMulCombine(N, DCI); 2410 case AMDGPUISD::MUL_I24: 2411 case AMDGPUISD::MUL_U24: { 2412 SDValue N0 = N->getOperand(0); 2413 SDValue N1 = N->getOperand(1); 2414 simplifyI24(N0, DCI); 2415 simplifyI24(N1, DCI); 2416 return SDValue(); 2417 } 2418 case ISD::SELECT: { 2419 SDValue Cond = N->getOperand(0); 2420 if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { 2421 SDLoc DL(N); 2422 EVT VT = N->getValueType(0); 2423 SDValue LHS = Cond.getOperand(0); 2424 SDValue RHS = Cond.getOperand(1); 2425 SDValue CC = Cond.getOperand(2); 2426 2427 SDValue True = N->getOperand(1); 2428 SDValue False = N->getOperand(2); 2429 2430 if (VT == MVT::f32) 2431 return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); 2432 2433 // TODO: Implement min / max Evergreen instructions. 2434 if (VT == MVT::i32 && 2435 Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { 2436 return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); 2437 } 2438 } 2439 2440 break; 2441 } 2442 case AMDGPUISD::BFE_I32: 2443 case AMDGPUISD::BFE_U32: { 2444 assert(!N->getValueType(0).isVector() && 2445 "Vector handling of BFE not implemented"); 2446 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); 2447 if (!Width) 2448 break; 2449 2450 uint32_t WidthVal = Width->getZExtValue() & 0x1f; 2451 if (WidthVal == 0) 2452 return DAG.getConstant(0, MVT::i32); 2453 2454 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 2455 if (!Offset) 2456 break; 2457 2458 SDValue BitsFrom = N->getOperand(0); 2459 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; 2460 2461 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; 2462 2463 if (OffsetVal == 0) { 2464 // This is already sign / zero extended, so try to fold away extra BFEs. 2465 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); 2466 2467 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); 2468 if (OpSignBits >= SignBits) 2469 return BitsFrom; 2470 2471 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); 2472 if (Signed) { 2473 // This is a sign_extend_inreg. Replace it to take advantage of existing 2474 // DAG Combines. If not eliminated, we will match back to BFE during 2475 // selection. 2476 2477 // TODO: The sext_inreg of extended types ends, although we can could 2478 // handle them in a single BFE. 2479 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, 2480 DAG.getValueType(SmallVT)); 2481 } 2482 2483 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); 2484 } 2485 2486 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { 2487 if (Signed) { 2488 return constantFoldBFE<int32_t>(DAG, 2489 CVal->getSExtValue(), 2490 OffsetVal, 2491 WidthVal); 2492 } 2493 2494 return constantFoldBFE<uint32_t>(DAG, 2495 CVal->getZExtValue(), 2496 OffsetVal, 2497 WidthVal); 2498 } 2499 2500 if ((OffsetVal + WidthVal) >= 32) { 2501 SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32); 2502 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, 2503 BitsFrom, ShiftVal); 2504 } 2505 2506 if (BitsFrom.hasOneUse()) { 2507 APInt Demanded = APInt::getBitsSet(32, 2508 OffsetVal, 2509 OffsetVal + WidthVal); 2510 2511 APInt KnownZero, KnownOne; 2512 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 2513 !DCI.isBeforeLegalizeOps()); 2514 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 2515 if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || 2516 TLI.SimplifyDemandedBits(BitsFrom, Demanded, 2517 KnownZero, KnownOne, TLO)) { 2518 DCI.CommitTargetLoweringOpt(TLO); 2519 } 2520 } 2521 2522 break; 2523 } 2524 2525 case ISD::STORE: 2526 return performStoreCombine(N, DCI); 2527 } 2528 return SDValue(); 2529 } 2530 2531 //===----------------------------------------------------------------------===// 2532 // Helper functions 2533 //===----------------------------------------------------------------------===// 2534 2535 void AMDGPUTargetLowering::getOriginalFunctionArgs( 2536 SelectionDAG &DAG, 2537 const Function *F, 2538 const SmallVectorImpl<ISD::InputArg> &Ins, 2539 SmallVectorImpl<ISD::InputArg> &OrigIns) const { 2540 2541 for (unsigned i = 0, e = Ins.size(); i < e; ++i) { 2542 if (Ins[i].ArgVT == Ins[i].VT) { 2543 OrigIns.push_back(Ins[i]); 2544 continue; 2545 } 2546 2547 EVT VT; 2548 if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { 2549 // Vector has been split into scalars. 2550 VT = Ins[i].ArgVT.getVectorElementType(); 2551 } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && 2552 Ins[i].ArgVT.getVectorElementType() != 2553 Ins[i].VT.getVectorElementType()) { 2554 // Vector elements have been promoted 2555 VT = Ins[i].ArgVT; 2556 } else { 2557 // Vector has been spilt into smaller vectors. 2558 VT = Ins[i].VT; 2559 } 2560 2561 ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, 2562 Ins[i].OrigArgIndex, Ins[i].PartOffset); 2563 OrigIns.push_back(Arg); 2564 } 2565 } 2566 2567 bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { 2568 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 2569 return CFP->isExactlyValue(1.0); 2570 } 2571 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 2572 return C->isAllOnesValue(); 2573 } 2574 return false; 2575 } 2576 2577 bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { 2578 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 2579 return CFP->getValueAPF().isZero(); 2580 } 2581 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 2582 return C->isNullValue(); 2583 } 2584 return false; 2585 } 2586 2587 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 2588 const TargetRegisterClass *RC, 2589 unsigned Reg, EVT VT) const { 2590 MachineFunction &MF = DAG.getMachineFunction(); 2591 MachineRegisterInfo &MRI = MF.getRegInfo(); 2592 unsigned VirtualRegister; 2593 if (!MRI.isLiveIn(Reg)) { 2594 VirtualRegister = MRI.createVirtualRegister(RC); 2595 MRI.addLiveIn(Reg, VirtualRegister); 2596 } else { 2597 VirtualRegister = MRI.getLiveInVirtReg(Reg); 2598 } 2599 return DAG.getRegister(VirtualRegister, VT); 2600 } 2601 2602 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; 2603 2604 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { 2605 switch (Opcode) { 2606 default: return nullptr; 2607 // AMDIL DAG nodes 2608 NODE_NAME_CASE(CALL); 2609 NODE_NAME_CASE(UMUL); 2610 NODE_NAME_CASE(RET_FLAG); 2611 NODE_NAME_CASE(BRANCH_COND); 2612 2613 // AMDGPU DAG nodes 2614 NODE_NAME_CASE(DWORDADDR) 2615 NODE_NAME_CASE(FRACT) 2616 NODE_NAME_CASE(CLAMP) 2617 NODE_NAME_CASE(FMAX_LEGACY) 2618 NODE_NAME_CASE(SMAX) 2619 NODE_NAME_CASE(UMAX) 2620 NODE_NAME_CASE(FMIN_LEGACY) 2621 NODE_NAME_CASE(SMIN) 2622 NODE_NAME_CASE(UMIN) 2623 NODE_NAME_CASE(FMAX3) 2624 NODE_NAME_CASE(SMAX3) 2625 NODE_NAME_CASE(UMAX3) 2626 NODE_NAME_CASE(FMIN3) 2627 NODE_NAME_CASE(SMIN3) 2628 NODE_NAME_CASE(UMIN3) 2629 NODE_NAME_CASE(URECIP) 2630 NODE_NAME_CASE(DIV_SCALE) 2631 NODE_NAME_CASE(DIV_FMAS) 2632 NODE_NAME_CASE(DIV_FIXUP) 2633 NODE_NAME_CASE(TRIG_PREOP) 2634 NODE_NAME_CASE(RCP) 2635 NODE_NAME_CASE(RSQ) 2636 NODE_NAME_CASE(RSQ_LEGACY) 2637 NODE_NAME_CASE(RSQ_CLAMPED) 2638 NODE_NAME_CASE(LDEXP) 2639 NODE_NAME_CASE(FP_CLASS) 2640 NODE_NAME_CASE(DOT4) 2641 NODE_NAME_CASE(BFE_U32) 2642 NODE_NAME_CASE(BFE_I32) 2643 NODE_NAME_CASE(BFI) 2644 NODE_NAME_CASE(BFM) 2645 NODE_NAME_CASE(BREV) 2646 NODE_NAME_CASE(MUL_U24) 2647 NODE_NAME_CASE(MUL_I24) 2648 NODE_NAME_CASE(MAD_U24) 2649 NODE_NAME_CASE(MAD_I24) 2650 NODE_NAME_CASE(EXPORT) 2651 NODE_NAME_CASE(CONST_ADDRESS) 2652 NODE_NAME_CASE(REGISTER_LOAD) 2653 NODE_NAME_CASE(REGISTER_STORE) 2654 NODE_NAME_CASE(LOAD_CONSTANT) 2655 NODE_NAME_CASE(LOAD_INPUT) 2656 NODE_NAME_CASE(SAMPLE) 2657 NODE_NAME_CASE(SAMPLEB) 2658 NODE_NAME_CASE(SAMPLED) 2659 NODE_NAME_CASE(SAMPLEL) 2660 NODE_NAME_CASE(CVT_F32_UBYTE0) 2661 NODE_NAME_CASE(CVT_F32_UBYTE1) 2662 NODE_NAME_CASE(CVT_F32_UBYTE2) 2663 NODE_NAME_CASE(CVT_F32_UBYTE3) 2664 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) 2665 NODE_NAME_CASE(CONST_DATA_PTR) 2666 NODE_NAME_CASE(STORE_MSKOR) 2667 NODE_NAME_CASE(TBUFFER_STORE_FORMAT) 2668 } 2669 } 2670 2671 SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, 2672 DAGCombinerInfo &DCI, 2673 unsigned &RefinementSteps, 2674 bool &UseOneConstNR) const { 2675 SelectionDAG &DAG = DCI.DAG; 2676 EVT VT = Operand.getValueType(); 2677 2678 if (VT == MVT::f32) { 2679 RefinementSteps = 0; 2680 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); 2681 } 2682 2683 // TODO: There is also f64 rsq instruction, but the documentation is less 2684 // clear on its precision. 2685 2686 return SDValue(); 2687 } 2688 2689 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, 2690 DAGCombinerInfo &DCI, 2691 unsigned &RefinementSteps) const { 2692 SelectionDAG &DAG = DCI.DAG; 2693 EVT VT = Operand.getValueType(); 2694 2695 if (VT == MVT::f32) { 2696 // Reciprocal, < 1 ulp error. 2697 // 2698 // This reciprocal approximation converges to < 0.5 ulp error with one 2699 // newton rhapson performed with two fused multiple adds (FMAs). 2700 2701 RefinementSteps = 0; 2702 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); 2703 } 2704 2705 // TODO: There is also f64 rcp instruction, but the documentation is less 2706 // clear on its precision. 2707 2708 return SDValue(); 2709 } 2710 2711 static void computeKnownBitsForMinMax(const SDValue Op0, 2712 const SDValue Op1, 2713 APInt &KnownZero, 2714 APInt &KnownOne, 2715 const SelectionDAG &DAG, 2716 unsigned Depth) { 2717 APInt Op0Zero, Op0One; 2718 APInt Op1Zero, Op1One; 2719 DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); 2720 DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); 2721 2722 KnownZero = Op0Zero & Op1Zero; 2723 KnownOne = Op0One & Op1One; 2724 } 2725 2726 void AMDGPUTargetLowering::computeKnownBitsForTargetNode( 2727 const SDValue Op, 2728 APInt &KnownZero, 2729 APInt &KnownOne, 2730 const SelectionDAG &DAG, 2731 unsigned Depth) const { 2732 2733 KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. 2734 2735 APInt KnownZero2; 2736 APInt KnownOne2; 2737 unsigned Opc = Op.getOpcode(); 2738 2739 switch (Opc) { 2740 default: 2741 break; 2742 case ISD::INTRINSIC_WO_CHAIN: { 2743 // FIXME: The intrinsic should just use the node. 2744 switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { 2745 case AMDGPUIntrinsic::AMDGPU_imax: 2746 case AMDGPUIntrinsic::AMDGPU_umax: 2747 case AMDGPUIntrinsic::AMDGPU_imin: 2748 case AMDGPUIntrinsic::AMDGPU_umin: 2749 computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), 2750 KnownZero, KnownOne, DAG, Depth); 2751 break; 2752 default: 2753 break; 2754 } 2755 2756 break; 2757 } 2758 case AMDGPUISD::SMAX: 2759 case AMDGPUISD::UMAX: 2760 case AMDGPUISD::SMIN: 2761 case AMDGPUISD::UMIN: 2762 computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), 2763 KnownZero, KnownOne, DAG, Depth); 2764 break; 2765 2766 case AMDGPUISD::BFE_I32: 2767 case AMDGPUISD::BFE_U32: { 2768 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 2769 if (!CWidth) 2770 return; 2771 2772 unsigned BitWidth = 32; 2773 uint32_t Width = CWidth->getZExtValue() & 0x1f; 2774 2775 if (Opc == AMDGPUISD::BFE_U32) 2776 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); 2777 2778 break; 2779 } 2780 } 2781 } 2782 2783 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( 2784 SDValue Op, 2785 const SelectionDAG &DAG, 2786 unsigned Depth) const { 2787 switch (Op.getOpcode()) { 2788 case AMDGPUISD::BFE_I32: { 2789 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 2790 if (!Width) 2791 return 1; 2792 2793 unsigned SignBits = 32 - Width->getZExtValue() + 1; 2794 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 2795 if (!Offset || !Offset->isNullValue()) 2796 return SignBits; 2797 2798 // TODO: Could probably figure something out with non-0 offsets. 2799 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 2800 return std::max(SignBits, Op0SignBits); 2801 } 2802 2803 case AMDGPUISD::BFE_U32: { 2804 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 2805 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; 2806 } 2807 2808 default: 2809 return 1; 2810 } 2811 } 2812