//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <sstream>

#undef DEBUG_TYPE
#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static unsigned int uniqueCallSite = 0;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return
/// the same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
                               SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    if (VT.isVector())
      for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
        ValueVTs.push_back(VT.getVectorElementType());
        if (Offsets)
          Offsets->push_back(Off + j * VT.getVectorElementType().getStoreSize());
      }
    else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
    : TargetLowering(TM, new NVPTXTargetObjectFile()), nvTM(&TM),
      nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {

  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // By default, use Source scheduling.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);

  // Operations not directly supported by NVPTX.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
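  // For illustration only (not part of the original source): on sm_20+, a
  // sign_extend_inreg of the low 8 bits of a 32-bit register maps to a single
  // PTX convert, roughly
  //   cvt.s32.s8 %r1, %r0;
  // while the i1 case (set to Expand below) is rewritten by the generic
  // legalizer into the SHL/SRA pair mentioned above, e.g.
  //   shl.b32 %r1, %r0, 31;
  //   shr.s32 %r1, %r1, 31;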
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  if (nvptxSubtarget.hasROT64()) {
    setOperationAction(ISD::ROTL, MVT::i64, Legal);
    setOperationAction(ISD::ROTR, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::ROTL, MVT::i64, Expand);
    setOperationAction(ISD::ROTR, MVT::i64, Expand);
  }
  if (nvptxSubtarget.hasROT32()) {
    setOperationAction(ISD::ROTL, MVT::i32, Legal);
    setOperationAction(ISD::ROTR, MVT::i32, Legal);
  } else {
    setOperationAction(ISD::ROTL, MVT::i32, Expand);
    setOperationAction(ISD::ROTR, MVT::i32, Expand);
  }

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant related memmove and memcopy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
  // Turn FP truncstore into trunc + store.
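  // For illustration only (not part of the original source): with the
  // f64->f32 truncating store marked Expand below, IR such as
  //   %t = fptrunc double %d to float
  //   store float %t, float* %p
  // is legalized into an FP_ROUND followed by a plain f32 store, which ends
  // up in PTX roughly as
  //   cvt.rn.f32.f64 %f1, %fd1;
  //   st.f32 [%rd1], %f1;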
202 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 203 204 // PTX does not support load / store predicate registers 205 setOperationAction(ISD::LOAD, MVT::i1, Custom); 206 setOperationAction(ISD::STORE, MVT::i1, Custom); 207 208 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 209 setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); 210 setTruncStoreAction(MVT::i64, MVT::i1, Expand); 211 setTruncStoreAction(MVT::i32, MVT::i1, Expand); 212 setTruncStoreAction(MVT::i16, MVT::i1, Expand); 213 setTruncStoreAction(MVT::i8, MVT::i1, Expand); 214 215 // This is legal in NVPTX 216 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 217 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 218 219 // TRAP can be lowered to PTX trap 220 setOperationAction(ISD::TRAP, MVT::Other, Legal); 221 222 setOperationAction(ISD::ADDC, MVT::i64, Expand); 223 setOperationAction(ISD::ADDE, MVT::i64, Expand); 224 225 // Register custom handling for vector loads/stores 226 for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE; 227 ++i) { 228 MVT VT = (MVT::SimpleValueType) i; 229 if (IsPTXVectorType(VT)) { 230 setOperationAction(ISD::LOAD, VT, Custom); 231 setOperationAction(ISD::STORE, VT, Custom); 232 setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); 233 } 234 } 235 236 // Custom handling for i8 intrinsics 237 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 238 239 setOperationAction(ISD::CTLZ, MVT::i16, Legal); 240 setOperationAction(ISD::CTLZ, MVT::i32, Legal); 241 setOperationAction(ISD::CTLZ, MVT::i64, Legal); 242 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal); 243 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal); 244 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal); 245 setOperationAction(ISD::CTTZ, MVT::i16, Expand); 246 setOperationAction(ISD::CTTZ, MVT::i32, Expand); 247 setOperationAction(ISD::CTTZ, MVT::i64, Expand); 248 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand); 249 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); 250 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); 251 setOperationAction(ISD::CTPOP, MVT::i16, Legal); 252 setOperationAction(ISD::CTPOP, MVT::i32, Legal); 253 setOperationAction(ISD::CTPOP, MVT::i64, Legal); 254 255 // We have some custom DAG combine patterns for these nodes 256 setTargetDAGCombine(ISD::ADD); 257 setTargetDAGCombine(ISD::AND); 258 setTargetDAGCombine(ISD::FADD); 259 setTargetDAGCombine(ISD::MUL); 260 setTargetDAGCombine(ISD::SHL); 261 262 // Now deduce the information based on the above mentioned 263 // actions 264 computeRegisterProperties(); 265 } 266 267 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 268 switch (Opcode) { 269 default: 270 return nullptr; 271 case NVPTXISD::CALL: 272 return "NVPTXISD::CALL"; 273 case NVPTXISD::RET_FLAG: 274 return "NVPTXISD::RET_FLAG"; 275 case NVPTXISD::Wrapper: 276 return "NVPTXISD::Wrapper"; 277 case NVPTXISD::DeclareParam: 278 return "NVPTXISD::DeclareParam"; 279 case NVPTXISD::DeclareScalarParam: 280 return "NVPTXISD::DeclareScalarParam"; 281 case NVPTXISD::DeclareRet: 282 return "NVPTXISD::DeclareRet"; 283 case NVPTXISD::DeclareRetParam: 284 return "NVPTXISD::DeclareRetParam"; 285 case NVPTXISD::PrintCall: 286 return "NVPTXISD::PrintCall"; 287 case NVPTXISD::LoadParam: 288 return "NVPTXISD::LoadParam"; 289 case NVPTXISD::LoadParamV2: 290 return "NVPTXISD::LoadParamV2"; 291 case NVPTXISD::LoadParamV4: 292 return "NVPTXISD::LoadParamV4"; 293 case NVPTXISD::StoreParam: 294 return 
"NVPTXISD::StoreParam"; 295 case NVPTXISD::StoreParamV2: 296 return "NVPTXISD::StoreParamV2"; 297 case NVPTXISD::StoreParamV4: 298 return "NVPTXISD::StoreParamV4"; 299 case NVPTXISD::StoreParamS32: 300 return "NVPTXISD::StoreParamS32"; 301 case NVPTXISD::StoreParamU32: 302 return "NVPTXISD::StoreParamU32"; 303 case NVPTXISD::CallArgBegin: 304 return "NVPTXISD::CallArgBegin"; 305 case NVPTXISD::CallArg: 306 return "NVPTXISD::CallArg"; 307 case NVPTXISD::LastCallArg: 308 return "NVPTXISD::LastCallArg"; 309 case NVPTXISD::CallArgEnd: 310 return "NVPTXISD::CallArgEnd"; 311 case NVPTXISD::CallVoid: 312 return "NVPTXISD::CallVoid"; 313 case NVPTXISD::CallVal: 314 return "NVPTXISD::CallVal"; 315 case NVPTXISD::CallSymbol: 316 return "NVPTXISD::CallSymbol"; 317 case NVPTXISD::Prototype: 318 return "NVPTXISD::Prototype"; 319 case NVPTXISD::MoveParam: 320 return "NVPTXISD::MoveParam"; 321 case NVPTXISD::StoreRetval: 322 return "NVPTXISD::StoreRetval"; 323 case NVPTXISD::StoreRetvalV2: 324 return "NVPTXISD::StoreRetvalV2"; 325 case NVPTXISD::StoreRetvalV4: 326 return "NVPTXISD::StoreRetvalV4"; 327 case NVPTXISD::PseudoUseParam: 328 return "NVPTXISD::PseudoUseParam"; 329 case NVPTXISD::RETURN: 330 return "NVPTXISD::RETURN"; 331 case NVPTXISD::CallSeqBegin: 332 return "NVPTXISD::CallSeqBegin"; 333 case NVPTXISD::CallSeqEnd: 334 return "NVPTXISD::CallSeqEnd"; 335 case NVPTXISD::CallPrototype: 336 return "NVPTXISD::CallPrototype"; 337 case NVPTXISD::LoadV2: 338 return "NVPTXISD::LoadV2"; 339 case NVPTXISD::LoadV4: 340 return "NVPTXISD::LoadV4"; 341 case NVPTXISD::LDGV2: 342 return "NVPTXISD::LDGV2"; 343 case NVPTXISD::LDGV4: 344 return "NVPTXISD::LDGV4"; 345 case NVPTXISD::LDUV2: 346 return "NVPTXISD::LDUV2"; 347 case NVPTXISD::LDUV4: 348 return "NVPTXISD::LDUV4"; 349 case NVPTXISD::StoreV2: 350 return "NVPTXISD::StoreV2"; 351 case NVPTXISD::StoreV4: 352 return "NVPTXISD::StoreV4"; 353 case NVPTXISD::FUN_SHFL_CLAMP: 354 return "NVPTXISD::FUN_SHFL_CLAMP"; 355 case NVPTXISD::FUN_SHFR_CLAMP: 356 return "NVPTXISD::FUN_SHFR_CLAMP"; 357 case NVPTXISD::IMAD: 358 return "NVPTXISD::IMAD"; 359 case NVPTXISD::MUL_WIDE_SIGNED: 360 return "NVPTXISD::MUL_WIDE_SIGNED"; 361 case NVPTXISD::MUL_WIDE_UNSIGNED: 362 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 363 case NVPTXISD::Tex1DFloatI32: return "NVPTXISD::Tex1DFloatI32"; 364 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 365 case NVPTXISD::Tex1DFloatFloatLevel: 366 return "NVPTXISD::Tex1DFloatFloatLevel"; 367 case NVPTXISD::Tex1DFloatFloatGrad: 368 return "NVPTXISD::Tex1DFloatFloatGrad"; 369 case NVPTXISD::Tex1DI32I32: return "NVPTXISD::Tex1DI32I32"; 370 case NVPTXISD::Tex1DI32Float: return "NVPTXISD::Tex1DI32Float"; 371 case NVPTXISD::Tex1DI32FloatLevel: 372 return "NVPTXISD::Tex1DI32FloatLevel"; 373 case NVPTXISD::Tex1DI32FloatGrad: 374 return "NVPTXISD::Tex1DI32FloatGrad"; 375 case NVPTXISD::Tex1DArrayFloatI32: return "NVPTXISD::Tex2DArrayFloatI32"; 376 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 377 case NVPTXISD::Tex1DArrayFloatFloatLevel: 378 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 379 case NVPTXISD::Tex1DArrayFloatFloatGrad: 380 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 381 case NVPTXISD::Tex1DArrayI32I32: return "NVPTXISD::Tex2DArrayI32I32"; 382 case NVPTXISD::Tex1DArrayI32Float: return "NVPTXISD::Tex2DArrayI32Float"; 383 case NVPTXISD::Tex1DArrayI32FloatLevel: 384 return "NVPTXISD::Tex2DArrayI32FloatLevel"; 385 case NVPTXISD::Tex1DArrayI32FloatGrad: 386 return "NVPTXISD::Tex2DArrayI32FloatGrad"; 387 case 
NVPTXISD::Tex2DFloatI32: return "NVPTXISD::Tex2DFloatI32"; 388 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 389 case NVPTXISD::Tex2DFloatFloatLevel: 390 return "NVPTXISD::Tex2DFloatFloatLevel"; 391 case NVPTXISD::Tex2DFloatFloatGrad: 392 return "NVPTXISD::Tex2DFloatFloatGrad"; 393 case NVPTXISD::Tex2DI32I32: return "NVPTXISD::Tex2DI32I32"; 394 case NVPTXISD::Tex2DI32Float: return "NVPTXISD::Tex2DI32Float"; 395 case NVPTXISD::Tex2DI32FloatLevel: 396 return "NVPTXISD::Tex2DI32FloatLevel"; 397 case NVPTXISD::Tex2DI32FloatGrad: 398 return "NVPTXISD::Tex2DI32FloatGrad"; 399 case NVPTXISD::Tex2DArrayFloatI32: return "NVPTXISD::Tex2DArrayFloatI32"; 400 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 401 case NVPTXISD::Tex2DArrayFloatFloatLevel: 402 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 403 case NVPTXISD::Tex2DArrayFloatFloatGrad: 404 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 405 case NVPTXISD::Tex2DArrayI32I32: return "NVPTXISD::Tex2DArrayI32I32"; 406 case NVPTXISD::Tex2DArrayI32Float: return "NVPTXISD::Tex2DArrayI32Float"; 407 case NVPTXISD::Tex2DArrayI32FloatLevel: 408 return "NVPTXISD::Tex2DArrayI32FloatLevel"; 409 case NVPTXISD::Tex2DArrayI32FloatGrad: 410 return "NVPTXISD::Tex2DArrayI32FloatGrad"; 411 case NVPTXISD::Tex3DFloatI32: return "NVPTXISD::Tex3DFloatI32"; 412 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 413 case NVPTXISD::Tex3DFloatFloatLevel: 414 return "NVPTXISD::Tex3DFloatFloatLevel"; 415 case NVPTXISD::Tex3DFloatFloatGrad: 416 return "NVPTXISD::Tex3DFloatFloatGrad"; 417 case NVPTXISD::Tex3DI32I32: return "NVPTXISD::Tex3DI32I32"; 418 case NVPTXISD::Tex3DI32Float: return "NVPTXISD::Tex3DI32Float"; 419 case NVPTXISD::Tex3DI32FloatLevel: 420 return "NVPTXISD::Tex3DI32FloatLevel"; 421 case NVPTXISD::Tex3DI32FloatGrad: 422 return "NVPTXISD::Tex3DI32FloatGrad"; 423 424 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 425 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 426 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 427 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 428 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 429 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 430 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 431 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 432 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 433 434 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 435 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 436 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 437 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 438 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 439 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 440 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 441 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 442 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 443 444 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 445 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 446 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 447 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 448 case NVPTXISD::Suld2DV2I16Trap: 
return "NVPTXISD::Suld2DV2I16Trap"; 449 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 450 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 451 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 452 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 453 454 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 455 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 456 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 457 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 458 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 459 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 460 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 461 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 462 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 463 464 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 465 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 466 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 467 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 468 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 469 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; 470 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 471 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 472 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 473 } 474 } 475 476 TargetLoweringBase::LegalizeTypeAction 477 NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const { 478 if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) 479 return TypeSplitVector; 480 481 return TargetLoweringBase::getPreferredVectorAction(VT); 482 } 483 484 SDValue 485 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 486 SDLoc dl(Op); 487 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 488 Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); 489 return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op); 490 } 491 492 std::string 493 NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, 494 const SmallVectorImpl<ISD::OutputArg> &Outs, 495 unsigned retAlignment, 496 const ImmutableCallSite *CS) const { 497 498 bool isABI = (nvptxSubtarget.getSmVersion() >= 20); 499 assert(isABI && "Non-ABI compilation is not supported"); 500 if (!isABI) 501 return ""; 502 503 std::stringstream O; 504 O << "prototype_" << uniqueCallSite << " : .callprototype "; 505 506 if (retTy->getTypeID() == Type::VoidTyID) { 507 O << "()"; 508 } else { 509 O << "("; 510 if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) { 511 unsigned size = 0; 512 if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) { 513 size = ITy->getBitWidth(); 514 if (size < 32) 515 size = 32; 516 } else { 517 assert(retTy->isFloatingPointTy() && 518 "Floating point type expected here"); 519 size = retTy->getPrimitiveSizeInBits(); 520 } 521 522 O << ".param .b" << size << " _"; 523 } else if (isa<PointerType>(retTy)) { 524 O << ".param .b" << getPointerTy().getSizeInBits() << " _"; 525 } else { 526 if((retTy->getTypeID() == Type::StructTyID) || 527 isa<VectorType>(retTy)) { 528 O << ".param .align " 529 << retAlignment 530 << " .b8 _[" 531 << 
getDataLayout()->getTypeAllocSize(retTy) << "]"; 532 } else { 533 assert(false && "Unknown return type"); 534 } 535 } 536 O << ") "; 537 } 538 O << "_ ("; 539 540 bool first = true; 541 MVT thePointerTy = getPointerTy(); 542 543 unsigned OIdx = 0; 544 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 545 Type *Ty = Args[i].Ty; 546 if (!first) { 547 O << ", "; 548 } 549 first = false; 550 551 if (Outs[OIdx].Flags.isByVal() == false) { 552 if (Ty->isAggregateType() || Ty->isVectorTy()) { 553 unsigned align = 0; 554 const CallInst *CallI = cast<CallInst>(CS->getInstruction()); 555 const DataLayout *TD = getDataLayout(); 556 // +1 because index 0 is reserved for return type alignment 557 if (!llvm::getAlign(*CallI, i + 1, align)) 558 align = TD->getABITypeAlignment(Ty); 559 unsigned sz = TD->getTypeAllocSize(Ty); 560 O << ".param .align " << align << " .b8 "; 561 O << "_"; 562 O << "[" << sz << "]"; 563 // update the index for Outs 564 SmallVector<EVT, 16> vtparts; 565 ComputeValueVTs(*this, Ty, vtparts); 566 if (unsigned len = vtparts.size()) 567 OIdx += len - 1; 568 continue; 569 } 570 // i8 types in IR will be i16 types in SDAG 571 assert((getValueType(Ty) == Outs[OIdx].VT || 572 (getValueType(Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 573 "type mismatch between callee prototype and arguments"); 574 // scalar type 575 unsigned sz = 0; 576 if (isa<IntegerType>(Ty)) { 577 sz = cast<IntegerType>(Ty)->getBitWidth(); 578 if (sz < 32) 579 sz = 32; 580 } else if (isa<PointerType>(Ty)) 581 sz = thePointerTy.getSizeInBits(); 582 else 583 sz = Ty->getPrimitiveSizeInBits(); 584 O << ".param .b" << sz << " "; 585 O << "_"; 586 continue; 587 } 588 const PointerType *PTy = dyn_cast<PointerType>(Ty); 589 assert(PTy && "Param with byval attribute should be a pointer type"); 590 Type *ETy = PTy->getElementType(); 591 592 unsigned align = Outs[OIdx].Flags.getByValAlign(); 593 unsigned sz = getDataLayout()->getTypeAllocSize(ETy); 594 O << ".param .align " << align << " .b8 "; 595 O << "_"; 596 O << "[" << sz << "]"; 597 } 598 O << ");"; 599 return O.str(); 600 } 601 602 unsigned 603 NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 604 const ImmutableCallSite *CS, 605 Type *Ty, 606 unsigned Idx) const { 607 const DataLayout *TD = getDataLayout(); 608 unsigned Align = 0; 609 const Value *DirectCallee = CS->getCalledFunction(); 610 611 if (!DirectCallee) { 612 // We don't have a direct function symbol, but that may be because of 613 // constant cast instructions in the call. 614 const Instruction *CalleeI = CS->getInstruction(); 615 assert(CalleeI && "Call target is not a function or derived value?"); 616 617 // With bitcast'd call targets, the instruction will be the call 618 if (isa<CallInst>(CalleeI)) { 619 // Check if we have call alignment metadata 620 if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align)) 621 return Align; 622 623 const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue(); 624 // Ignore any bitcast instructions 625 while(isa<ConstantExpr>(CalleeV)) { 626 const ConstantExpr *CE = cast<ConstantExpr>(CalleeV); 627 if (!CE->isCast()) 628 break; 629 // Look through the bitcast 630 CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0); 631 } 632 633 // We have now looked past all of the bitcasts. Do we finally have a 634 // Function? 
      if (isa<Function>(CalleeV))
        DirectCallee = CalleeV;
    }
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function
  if (DirectCallee)
    if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
      return Align;

  // Call is indirect or alignment information is not available, fall back to
  // the ABI type alignment
  return TD->getABITypeAlignment(Ty);
}

SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  ArgListTy &Args = CLI.getArgs();
  Type *retTy = CLI.RetTy;
  ImmutableCallSite *CS = CLI.CS;

  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;
  const DataLayout *TD = getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const Function *F = MF.getFunction();

  SDValue tempChain = Chain;
  Chain =
      DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
                           dl);
  SDValue InFlag = Chain.getValue(1);

  unsigned paramCount = 0;
  // Args.size() and Outs.size() need not match.
  // Outs.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Outs)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Outs.
  // So a different index should be used for indexing into Outs/OutVals.
  // See similar issue in LowerFormalArguments.
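  // For illustration only (not part of the original source): for a
  // hypothetical call  foo({float, float} %s, <8 x i8> %v)  Args.size() is 2,
  // while Outs holds one entry per scalarized piece (two struct fields plus,
  // per the note above, one entry per vector lane), so OIdx below advances
  // independently of i.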
689 unsigned OIdx = 0; 690 // Declare the .params or .reg need to pass values 691 // to the function 692 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 693 EVT VT = Outs[OIdx].VT; 694 Type *Ty = Args[i].Ty; 695 696 if (Outs[OIdx].Flags.isByVal() == false) { 697 if (Ty->isAggregateType()) { 698 // aggregate 699 SmallVector<EVT, 16> vtparts; 700 SmallVector<uint64_t, 16> Offsets; 701 ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0); 702 703 unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); 704 // declare .param .align <align> .b8 .param<n>[<size>]; 705 unsigned sz = TD->getTypeAllocSize(Ty); 706 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 707 SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32), 708 DAG.getConstant(paramCount, MVT::i32), 709 DAG.getConstant(sz, MVT::i32), InFlag }; 710 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 711 DeclareParamOps); 712 InFlag = Chain.getValue(1); 713 for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { 714 EVT elemtype = vtparts[j]; 715 unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]); 716 if (elemtype.isInteger() && (sz < 8)) 717 sz = 8; 718 SDValue StVal = OutVals[OIdx]; 719 if (elemtype.getSizeInBits() < 16) { 720 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 721 } 722 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 723 SDValue CopyParamOps[] = { Chain, 724 DAG.getConstant(paramCount, MVT::i32), 725 DAG.getConstant(Offsets[j], MVT::i32), 726 StVal, InFlag }; 727 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, 728 CopyParamVTs, CopyParamOps, 729 elemtype, MachinePointerInfo(), 730 ArgAlign); 731 InFlag = Chain.getValue(1); 732 ++OIdx; 733 } 734 if (vtparts.size() > 0) 735 --OIdx; 736 ++paramCount; 737 continue; 738 } 739 if (Ty->isVectorTy()) { 740 EVT ObjectVT = getValueType(Ty); 741 unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); 742 // declare .param .align <align> .b8 .param<n>[<size>]; 743 unsigned sz = TD->getTypeAllocSize(Ty); 744 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 745 SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32), 746 DAG.getConstant(paramCount, MVT::i32), 747 DAG.getConstant(sz, MVT::i32), InFlag }; 748 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 749 DeclareParamOps); 750 InFlag = Chain.getValue(1); 751 unsigned NumElts = ObjectVT.getVectorNumElements(); 752 EVT EltVT = ObjectVT.getVectorElementType(); 753 EVT MemVT = EltVT; 754 bool NeedExtend = false; 755 if (EltVT.getSizeInBits() < 16) { 756 NeedExtend = true; 757 EltVT = MVT::i16; 758 } 759 760 // V1 store 761 if (NumElts == 1) { 762 SDValue Elt = OutVals[OIdx++]; 763 if (NeedExtend) 764 Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt); 765 766 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 767 SDValue CopyParamOps[] = { Chain, 768 DAG.getConstant(paramCount, MVT::i32), 769 DAG.getConstant(0, MVT::i32), Elt, 770 InFlag }; 771 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, 772 CopyParamVTs, CopyParamOps, 773 MemVT, MachinePointerInfo()); 774 InFlag = Chain.getValue(1); 775 } else if (NumElts == 2) { 776 SDValue Elt0 = OutVals[OIdx++]; 777 SDValue Elt1 = OutVals[OIdx++]; 778 if (NeedExtend) { 779 Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0); 780 Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1); 781 } 782 783 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 784 SDValue CopyParamOps[] 
= { Chain, 785 DAG.getConstant(paramCount, MVT::i32), 786 DAG.getConstant(0, MVT::i32), Elt0, Elt1, 787 InFlag }; 788 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl, 789 CopyParamVTs, CopyParamOps, 790 MemVT, MachinePointerInfo()); 791 InFlag = Chain.getValue(1); 792 } else { 793 unsigned curOffset = 0; 794 // V4 stores 795 // We have at least 4 elements (<3 x Ty> expands to 4 elements) and 796 // the 797 // vector will be expanded to a power of 2 elements, so we know we can 798 // always round up to the next multiple of 4 when creating the vector 799 // stores. 800 // e.g. 4 elem => 1 st.v4 801 // 6 elem => 2 st.v4 802 // 8 elem => 2 st.v4 803 // 11 elem => 3 st.v4 804 unsigned VecSize = 4; 805 if (EltVT.getSizeInBits() == 64) 806 VecSize = 2; 807 808 // This is potentially only part of a vector, so assume all elements 809 // are packed together. 810 unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize; 811 812 for (unsigned i = 0; i < NumElts; i += VecSize) { 813 // Get values 814 SDValue StoreVal; 815 SmallVector<SDValue, 8> Ops; 816 Ops.push_back(Chain); 817 Ops.push_back(DAG.getConstant(paramCount, MVT::i32)); 818 Ops.push_back(DAG.getConstant(curOffset, MVT::i32)); 819 820 unsigned Opc = NVPTXISD::StoreParamV2; 821 822 StoreVal = OutVals[OIdx++]; 823 if (NeedExtend) 824 StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); 825 Ops.push_back(StoreVal); 826 827 if (i + 1 < NumElts) { 828 StoreVal = OutVals[OIdx++]; 829 if (NeedExtend) 830 StoreVal = 831 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); 832 } else { 833 StoreVal = DAG.getUNDEF(EltVT); 834 } 835 Ops.push_back(StoreVal); 836 837 if (VecSize == 4) { 838 Opc = NVPTXISD::StoreParamV4; 839 if (i + 2 < NumElts) { 840 StoreVal = OutVals[OIdx++]; 841 if (NeedExtend) 842 StoreVal = 843 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); 844 } else { 845 StoreVal = DAG.getUNDEF(EltVT); 846 } 847 Ops.push_back(StoreVal); 848 849 if (i + 3 < NumElts) { 850 StoreVal = OutVals[OIdx++]; 851 if (NeedExtend) 852 StoreVal = 853 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); 854 } else { 855 StoreVal = DAG.getUNDEF(EltVT); 856 } 857 Ops.push_back(StoreVal); 858 } 859 860 Ops.push_back(InFlag); 861 862 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 863 Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops, 864 MemVT, MachinePointerInfo()); 865 InFlag = Chain.getValue(1); 866 curOffset += PerStoreOffset; 867 } 868 } 869 ++paramCount; 870 --OIdx; 871 continue; 872 } 873 // Plain scalar 874 // for ABI, declare .param .b<size> .param<n>; 875 unsigned sz = VT.getSizeInBits(); 876 bool needExtend = false; 877 if (VT.isInteger()) { 878 if (sz < 16) 879 needExtend = true; 880 if (sz < 32) 881 sz = 32; 882 } 883 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 884 SDValue DeclareParamOps[] = { Chain, 885 DAG.getConstant(paramCount, MVT::i32), 886 DAG.getConstant(sz, MVT::i32), 887 DAG.getConstant(0, MVT::i32), InFlag }; 888 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 889 DeclareParamOps); 890 InFlag = Chain.getValue(1); 891 SDValue OutV = OutVals[OIdx]; 892 if (needExtend) { 893 // zext/sext i1 to i16 894 unsigned opc = ISD::ZERO_EXTEND; 895 if (Outs[OIdx].Flags.isSExt()) 896 opc = ISD::SIGN_EXTEND; 897 OutV = DAG.getNode(opc, dl, MVT::i16, OutV); 898 } 899 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 900 SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), 901 DAG.getConstant(0, MVT::i32), OutV, 
InFlag }; 902 903 unsigned opcode = NVPTXISD::StoreParam; 904 if (Outs[OIdx].Flags.isZExt()) 905 opcode = NVPTXISD::StoreParamU32; 906 else if (Outs[OIdx].Flags.isSExt()) 907 opcode = NVPTXISD::StoreParamS32; 908 Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, 909 VT, MachinePointerInfo()); 910 911 InFlag = Chain.getValue(1); 912 ++paramCount; 913 continue; 914 } 915 // struct or vector 916 SmallVector<EVT, 16> vtparts; 917 SmallVector<uint64_t, 16> Offsets; 918 const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); 919 assert(PTy && "Type of a byval parameter should be pointer"); 920 ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0); 921 922 // declare .param .align <align> .b8 .param<n>[<size>]; 923 unsigned sz = Outs[OIdx].Flags.getByValSize(); 924 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 925 unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign(); 926 // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, 927 // so we don't need to worry about natural alignment or not. 928 // See TargetLowering::LowerCallTo(). 929 SDValue DeclareParamOps[] = { 930 Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), MVT::i32), 931 DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32), 932 InFlag 933 }; 934 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 935 DeclareParamOps); 936 InFlag = Chain.getValue(1); 937 for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { 938 EVT elemtype = vtparts[j]; 939 int curOffset = Offsets[j]; 940 unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); 941 SDValue srcAddr = 942 DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], 943 DAG.getConstant(curOffset, getPointerTy())); 944 SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, 945 MachinePointerInfo(), false, false, false, 946 PartAlign); 947 if (elemtype.getSizeInBits() < 16) { 948 theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); 949 } 950 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 951 SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), 952 DAG.getConstant(curOffset, MVT::i32), theVal, 953 InFlag }; 954 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, 955 CopyParamOps, elemtype, 956 MachinePointerInfo()); 957 958 InFlag = Chain.getValue(1); 959 } 960 ++paramCount; 961 } 962 963 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 964 unsigned retAlignment = 0; 965 966 // Handle Result 967 if (Ins.size() > 0) { 968 SmallVector<EVT, 16> resvtparts; 969 ComputeValueVTs(*this, retTy, resvtparts); 970 971 // Declare 972 // .param .align 16 .b8 retval0[<size-in-bytes>], or 973 // .param .b<size-in-bits> retval0 974 unsigned resultsz = TD->getTypeAllocSizeInBits(retTy); 975 if (retTy->isSingleValueType()) { 976 // Scalar needs to be at least 32bit wide 977 if (resultsz < 32) 978 resultsz = 32; 979 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 980 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32), 981 DAG.getConstant(resultsz, MVT::i32), 982 DAG.getConstant(0, MVT::i32), InFlag }; 983 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 984 DeclareRetOps); 985 InFlag = Chain.getValue(1); 986 } else { 987 retAlignment = getArgumentAlignment(Callee, CS, retTy, 0); 988 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 989 SDValue DeclareRetOps[] = { Chain, 990 DAG.getConstant(retAlignment, MVT::i32), 991 
DAG.getConstant(resultsz / 8, MVT::i32), 992 DAG.getConstant(0, MVT::i32), InFlag }; 993 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 994 DeclareRetOps); 995 InFlag = Chain.getValue(1); 996 } 997 } 998 999 if (!Func) { 1000 // This is indirect function call case : PTX requires a prototype of the 1001 // form 1002 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); 1003 // to be emitted, and the label has to used as the last arg of call 1004 // instruction. 1005 // The prototype is embedded in a string and put as the operand for a 1006 // CallPrototype SDNode which will print out to the value of the string. 1007 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1008 std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS); 1009 const char *ProtoStr = 1010 nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); 1011 SDValue ProtoOps[] = { 1012 Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag, 1013 }; 1014 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 1015 InFlag = Chain.getValue(1); 1016 } 1017 // Op to just print "call" 1018 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1019 SDValue PrintCallOps[] = { 1020 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag 1021 }; 1022 Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall), 1023 dl, PrintCallVTs, PrintCallOps); 1024 InFlag = Chain.getValue(1); 1025 1026 // Ops to print out the function name 1027 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1028 SDValue CallVoidOps[] = { Chain, Callee, InFlag }; 1029 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 1030 InFlag = Chain.getValue(1); 1031 1032 // Ops to print out the param list 1033 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1034 SDValue CallArgBeginOps[] = { Chain, InFlag }; 1035 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 1036 CallArgBeginOps); 1037 InFlag = Chain.getValue(1); 1038 1039 for (unsigned i = 0, e = paramCount; i != e; ++i) { 1040 unsigned opcode; 1041 if (i == (e - 1)) 1042 opcode = NVPTXISD::LastCallArg; 1043 else 1044 opcode = NVPTXISD::CallArg; 1045 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1046 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32), 1047 DAG.getConstant(i, MVT::i32), InFlag }; 1048 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 1049 InFlag = Chain.getValue(1); 1050 } 1051 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1052 SDValue CallArgEndOps[] = { Chain, DAG.getConstant(Func ? 
1 : 0, MVT::i32), 1053 InFlag }; 1054 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 1055 InFlag = Chain.getValue(1); 1056 1057 if (!Func) { 1058 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1059 SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, MVT::i32), 1060 InFlag }; 1061 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 1062 InFlag = Chain.getValue(1); 1063 } 1064 1065 // Generate loads from param memory/moves from registers for result 1066 if (Ins.size() > 0) { 1067 if (retTy && retTy->isVectorTy()) { 1068 EVT ObjectVT = getValueType(retTy); 1069 unsigned NumElts = ObjectVT.getVectorNumElements(); 1070 EVT EltVT = ObjectVT.getVectorElementType(); 1071 assert(nvTM->getTargetLowering()->getNumRegisters(F->getContext(), 1072 ObjectVT) == NumElts && 1073 "Vector was not scalarized"); 1074 unsigned sz = EltVT.getSizeInBits(); 1075 bool needTruncate = sz < 8 ? true : false; 1076 1077 if (NumElts == 1) { 1078 // Just a simple load 1079 SmallVector<EVT, 4> LoadRetVTs; 1080 if (EltVT == MVT::i1 || EltVT == MVT::i8) { 1081 // If loading i1/i8 result, generate 1082 // load.b8 i16 1083 // if i1 1084 // trunc i16 to i1 1085 LoadRetVTs.push_back(MVT::i16); 1086 } else 1087 LoadRetVTs.push_back(EltVT); 1088 LoadRetVTs.push_back(MVT::Other); 1089 LoadRetVTs.push_back(MVT::Glue); 1090 SmallVector<SDValue, 4> LoadRetOps; 1091 LoadRetOps.push_back(Chain); 1092 LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); 1093 LoadRetOps.push_back(DAG.getConstant(0, MVT::i32)); 1094 LoadRetOps.push_back(InFlag); 1095 SDValue retval = DAG.getMemIntrinsicNode( 1096 NVPTXISD::LoadParam, dl, 1097 DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); 1098 Chain = retval.getValue(1); 1099 InFlag = retval.getValue(2); 1100 SDValue Ret0 = retval; 1101 if (needTruncate) 1102 Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0); 1103 InVals.push_back(Ret0); 1104 } else if (NumElts == 2) { 1105 // LoadV2 1106 SmallVector<EVT, 4> LoadRetVTs; 1107 if (EltVT == MVT::i1 || EltVT == MVT::i8) { 1108 // If loading i1/i8 result, generate 1109 // load.b8 i16 1110 // if i1 1111 // trunc i16 to i1 1112 LoadRetVTs.push_back(MVT::i16); 1113 LoadRetVTs.push_back(MVT::i16); 1114 } else { 1115 LoadRetVTs.push_back(EltVT); 1116 LoadRetVTs.push_back(EltVT); 1117 } 1118 LoadRetVTs.push_back(MVT::Other); 1119 LoadRetVTs.push_back(MVT::Glue); 1120 SmallVector<SDValue, 4> LoadRetOps; 1121 LoadRetOps.push_back(Chain); 1122 LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); 1123 LoadRetOps.push_back(DAG.getConstant(0, MVT::i32)); 1124 LoadRetOps.push_back(InFlag); 1125 SDValue retval = DAG.getMemIntrinsicNode( 1126 NVPTXISD::LoadParamV2, dl, 1127 DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); 1128 Chain = retval.getValue(2); 1129 InFlag = retval.getValue(3); 1130 SDValue Ret0 = retval.getValue(0); 1131 SDValue Ret1 = retval.getValue(1); 1132 if (needTruncate) { 1133 Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0); 1134 InVals.push_back(Ret0); 1135 Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1); 1136 InVals.push_back(Ret1); 1137 } else { 1138 InVals.push_back(Ret0); 1139 InVals.push_back(Ret1); 1140 } 1141 } else { 1142 // Split into N LoadV4 1143 unsigned Ofst = 0; 1144 unsigned VecSize = 4; 1145 unsigned Opc = NVPTXISD::LoadParamV4; 1146 if (EltVT.getSizeInBits() == 64) { 1147 VecSize = 2; 1148 Opc = NVPTXISD::LoadParamV2; 1149 } 1150 EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); 1151 for (unsigned 
i = 0; i < NumElts; i += VecSize) { 1152 SmallVector<EVT, 8> LoadRetVTs; 1153 if (EltVT == MVT::i1 || EltVT == MVT::i8) { 1154 // If loading i1/i8 result, generate 1155 // load.b8 i16 1156 // if i1 1157 // trunc i16 to i1 1158 for (unsigned j = 0; j < VecSize; ++j) 1159 LoadRetVTs.push_back(MVT::i16); 1160 } else { 1161 for (unsigned j = 0; j < VecSize; ++j) 1162 LoadRetVTs.push_back(EltVT); 1163 } 1164 LoadRetVTs.push_back(MVT::Other); 1165 LoadRetVTs.push_back(MVT::Glue); 1166 SmallVector<SDValue, 4> LoadRetOps; 1167 LoadRetOps.push_back(Chain); 1168 LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); 1169 LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32)); 1170 LoadRetOps.push_back(InFlag); 1171 SDValue retval = DAG.getMemIntrinsicNode( 1172 Opc, dl, DAG.getVTList(LoadRetVTs), 1173 LoadRetOps, EltVT, MachinePointerInfo()); 1174 if (VecSize == 2) { 1175 Chain = retval.getValue(2); 1176 InFlag = retval.getValue(3); 1177 } else { 1178 Chain = retval.getValue(4); 1179 InFlag = retval.getValue(5); 1180 } 1181 1182 for (unsigned j = 0; j < VecSize; ++j) { 1183 if (i + j >= NumElts) 1184 break; 1185 SDValue Elt = retval.getValue(j); 1186 if (needTruncate) 1187 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 1188 InVals.push_back(Elt); 1189 } 1190 Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); 1191 } 1192 } 1193 } else { 1194 SmallVector<EVT, 16> VTs; 1195 SmallVector<uint64_t, 16> Offsets; 1196 ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0); 1197 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 1198 unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0); 1199 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 1200 unsigned sz = VTs[i].getSizeInBits(); 1201 unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]); 1202 bool needTruncate = sz < 8 ? true : false; 1203 if (VTs[i].isInteger() && (sz < 8)) 1204 sz = 8; 1205 1206 SmallVector<EVT, 4> LoadRetVTs; 1207 EVT TheLoadType = VTs[i]; 1208 if (retTy->isIntegerTy() && 1209 TD->getTypeAllocSizeInBits(retTy) < 32) { 1210 // This is for integer types only, and specifically not for 1211 // aggregates. 
1212 LoadRetVTs.push_back(MVT::i32); 1213 TheLoadType = MVT::i32; 1214 } else if (sz < 16) { 1215 // If loading i1/i8 result, generate 1216 // load i8 (-> i16) 1217 // trunc i16 to i1/i8 1218 LoadRetVTs.push_back(MVT::i16); 1219 } else 1220 LoadRetVTs.push_back(Ins[i].VT); 1221 LoadRetVTs.push_back(MVT::Other); 1222 LoadRetVTs.push_back(MVT::Glue); 1223 1224 SmallVector<SDValue, 4> LoadRetOps; 1225 LoadRetOps.push_back(Chain); 1226 LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); 1227 LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32)); 1228 LoadRetOps.push_back(InFlag); 1229 SDValue retval = DAG.getMemIntrinsicNode( 1230 NVPTXISD::LoadParam, dl, 1231 DAG.getVTList(LoadRetVTs), LoadRetOps, 1232 TheLoadType, MachinePointerInfo(), AlignI); 1233 Chain = retval.getValue(1); 1234 InFlag = retval.getValue(2); 1235 SDValue Ret0 = retval.getValue(0); 1236 if (needTruncate) 1237 Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0); 1238 InVals.push_back(Ret0); 1239 } 1240 } 1241 } 1242 1243 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true), 1244 DAG.getIntPtrConstant(uniqueCallSite + 1, true), 1245 InFlag, dl); 1246 uniqueCallSite++; 1247 1248 // set isTailCall to false for now, until we figure out how to express 1249 // tail call optimization in PTX 1250 isTailCall = false; 1251 return Chain; 1252 } 1253 1254 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 1255 // (see LegalizeDAG.cpp). This is slow and uses local memory. 1256 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 1257 SDValue 1258 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 1259 SDNode *Node = Op.getNode(); 1260 SDLoc dl(Node); 1261 SmallVector<SDValue, 8> Ops; 1262 unsigned NumOperands = Node->getNumOperands(); 1263 for (unsigned i = 0; i < NumOperands; ++i) { 1264 SDValue SubOp = Node->getOperand(i); 1265 EVT VVT = SubOp.getNode()->getValueType(0); 1266 EVT EltVT = VVT.getVectorElementType(); 1267 unsigned NumSubElem = VVT.getVectorNumElements(); 1268 for (unsigned j = 0; j < NumSubElem; ++j) { 1269 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 1270 DAG.getIntPtrConstant(j))); 1271 } 1272 } 1273 return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops); 1274 } 1275 1276 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 1277 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 1278 /// amount, or 1279 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 1280 /// amount. 1281 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 1282 SelectionDAG &DAG) const { 1283 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 1284 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 1285 1286 EVT VT = Op.getValueType(); 1287 unsigned VTBits = VT.getSizeInBits(); 1288 SDLoc dl(Op); 1289 SDValue ShOpLo = Op.getOperand(0); 1290 SDValue ShOpHi = Op.getOperand(1); 1291 SDValue ShAmt = Op.getOperand(2); 1292 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 1293 1294 if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) { 1295 1296 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
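    // For illustration only (not part of the original source): shf.r.clamp
    // treats {hi, lo} as a single 64-bit value, shifts it right, and clamps
    // the shift amount at 32, so the two results below become roughly
    //   shf.r.clamp.b32 %lo_out, %lo, %hi, %amt;
    //   shr.u32         %hi_out, %hi, %amt;   // shr.s32 for SRA_PARTS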
    // {dHi, dLo} = {aHi, aLo} >> Amt
    // dHi = aHi >> Amt
    // dLo = shf.r.clamp aLo, aHi, Amt

    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  else {

    // {dHi, dLo} = {aHi, aLo} >> Amt
    // - if (Amt>=size) then
    //      dLo = aHi >> (Amt-size)
    //      dHi = aHi >> Amt (this is either all 0 or all 1)
    //   else
    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
    //      dHi = aHi >> Amt

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, MVT::i32), ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}

/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
///    amount.
SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SHL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);

  if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {

    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} << Amt
    // dHi = shf.l.clamp aLo, aHi, Amt
    // dLo = aLo << Amt

    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  else {

    // {dHi, dLo} = {aHi, aLo} << Amt
    // - if (Amt>=size) then
    //      dLo = aLo << Amt (all 0)
    //      dHi = aLo << (Amt-size)
    //   else
    //      dLo = aLo << Amt
    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, MVT::i32), ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}

SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::RETURNADDR:
    return SDValue();
  case ISD::FRAMEADDR:
    return SDValue();
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return Op;
  case ISD::BUILD_VECTOR:
  case ISD::EXTRACT_SUBVECTOR:
    return Op;
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::LOAD:
    return LowerLOAD(Op, DAG);
  case ISD::SHL_PARTS:
    return LowerShiftLeftParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:
    return LowerShiftRightParts(Op, DAG);
  default:
    llvm_unreachable("Custom lowering not defined for operation");
  }
}

SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1)
    return LowerLOADi1(Op, DAG);
  else
    return SDValue();
}

// v = ld i1* addr
//   =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  LoadSDNode *LD = cast<LoadSDNode>(Node);
  SDLoc dl(Node);
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
  assert(Node->getValueType(0) == MVT::i1 &&
         "Custom lowering for i1 load only");
  SDValue newLD =
      DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
                  LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
                  LD->isInvariant(), LD->getAlignment());
  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
  // The legalizer (the caller) is expecting two values from the legalized
  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
  // in LegalizeDAG.cpp which also uses MergeValues.
1453 SDValue Ops[] = { result, LD->getChain() }; 1454 return DAG.getMergeValues(Ops, dl); 1455 } 1456 1457 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1458 EVT ValVT = Op.getOperand(1).getValueType(); 1459 if (ValVT == MVT::i1) 1460 return LowerSTOREi1(Op, DAG); 1461 else if (ValVT.isVector()) 1462 return LowerSTOREVector(Op, DAG); 1463 else 1464 return SDValue(); 1465 } 1466 1467 SDValue 1468 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 1469 SDNode *N = Op.getNode(); 1470 SDValue Val = N->getOperand(1); 1471 SDLoc DL(N); 1472 EVT ValVT = Val.getValueType(); 1473 1474 if (ValVT.isVector()) { 1475 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 1476 // legal. We can (and should) split that into 2 stores of <2 x double> here 1477 // but I'm leaving that as a TODO for now. 1478 if (!ValVT.isSimple()) 1479 return SDValue(); 1480 switch (ValVT.getSimpleVT().SimpleTy) { 1481 default: 1482 return SDValue(); 1483 case MVT::v2i8: 1484 case MVT::v2i16: 1485 case MVT::v2i32: 1486 case MVT::v2i64: 1487 case MVT::v2f32: 1488 case MVT::v2f64: 1489 case MVT::v4i8: 1490 case MVT::v4i16: 1491 case MVT::v4i32: 1492 case MVT::v4f32: 1493 // This is a "native" vector type 1494 break; 1495 } 1496 1497 unsigned Opcode = 0; 1498 EVT EltVT = ValVT.getVectorElementType(); 1499 unsigned NumElts = ValVT.getVectorNumElements(); 1500 1501 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 1502 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 1503 // stored type to i16 and propagate the "real" type as the memory type. 1504 bool NeedExt = false; 1505 if (EltVT.getSizeInBits() < 16) 1506 NeedExt = true; 1507 1508 switch (NumElts) { 1509 default: 1510 return SDValue(); 1511 case 2: 1512 Opcode = NVPTXISD::StoreV2; 1513 break; 1514 case 4: { 1515 Opcode = NVPTXISD::StoreV4; 1516 break; 1517 } 1518 } 1519 1520 SmallVector<SDValue, 8> Ops; 1521 1522 // First is the chain 1523 Ops.push_back(N->getOperand(0)); 1524 1525 // Then the split values 1526 for (unsigned i = 0; i < NumElts; ++i) { 1527 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 1528 DAG.getIntPtrConstant(i)); 1529 if (NeedExt) 1530 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 1531 Ops.push_back(ExtVal); 1532 } 1533 1534 // Then any remaining arguments 1535 for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) { 1536 Ops.push_back(N->getOperand(i)); 1537 } 1538 1539 MemSDNode *MemSD = cast<MemSDNode>(N); 1540 1541 SDValue NewSt = DAG.getMemIntrinsicNode( 1542 Opcode, DL, DAG.getVTList(MVT::Other), Ops, 1543 MemSD->getMemoryVT(), MemSD->getMemOperand()); 1544 1545 //return DCI.CombineTo(N, NewSt, true); 1546 return NewSt; 1547 } 1548 1549 return SDValue(); 1550 } 1551 1552 // st i1 v, addr 1553 // => 1554 // v1 = zxt v to i16 1555 // st.u8 i16, addr 1556 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 1557 SDNode *Node = Op.getNode(); 1558 SDLoc dl(Node); 1559 StoreSDNode *ST = cast<StoreSDNode>(Node); 1560 SDValue Tmp1 = ST->getChain(); 1561 SDValue Tmp2 = ST->getBasePtr(); 1562 SDValue Tmp3 = ST->getValue(); 1563 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 1564 unsigned Alignment = ST->getAlignment(); 1565 bool isVolatile = ST->isVolatile(); 1566 bool isNonTemporal = ST->isNonTemporal(); 1567 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 1568 SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, 
1569 ST->getPointerInfo(), MVT::i8, isNonTemporal, 1570 isVolatile, Alignment); 1571 return Result; 1572 } 1573 1574 SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, 1575 int idx, EVT v) const { 1576 std::string *name = nvTM->getManagedStrPool()->getManagedString(inname); 1577 std::stringstream suffix; 1578 suffix << idx; 1579 *name += suffix.str(); 1580 return DAG.getTargetExternalSymbol(name->c_str(), v); 1581 } 1582 1583 SDValue 1584 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { 1585 std::string ParamSym; 1586 raw_string_ostream ParamStr(ParamSym); 1587 1588 ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; 1589 ParamStr.flush(); 1590 1591 std::string *SavedStr = 1592 nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); 1593 return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); 1594 } 1595 1596 SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) { 1597 return getExtSymb(DAG, ".HLPPARAM", idx); 1598 } 1599 1600 // Check to see if the kernel argument is image*_t or sampler_t 1601 1602 bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { 1603 static const char *const specialTypes[] = { "struct._image2d_t", 1604 "struct._image3d_t", 1605 "struct._sampler_t" }; 1606 1607 const Type *Ty = arg->getType(); 1608 const PointerType *PTy = dyn_cast<PointerType>(Ty); 1609 1610 if (!PTy) 1611 return false; 1612 1613 if (!context) 1614 return false; 1615 1616 const StructType *STy = dyn_cast<StructType>(PTy->getElementType()); 1617 const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : ""; 1618 1619 for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i) 1620 if (TypeName == specialTypes[i]) 1621 return true; 1622 1623 return false; 1624 } 1625 1626 SDValue NVPTXTargetLowering::LowerFormalArguments( 1627 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 1628 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, 1629 SmallVectorImpl<SDValue> &InVals) const { 1630 MachineFunction &MF = DAG.getMachineFunction(); 1631 const DataLayout *TD = getDataLayout(); 1632 1633 const Function *F = MF.getFunction(); 1634 const AttributeSet &PAL = F->getAttributes(); 1635 const TargetLowering *TLI = DAG.getTarget().getTargetLowering(); 1636 1637 SDValue Root = DAG.getRoot(); 1638 std::vector<SDValue> OutChains; 1639 1640 bool isKernel = llvm::isKernelFunction(*F); 1641 bool isABI = (nvptxSubtarget.getSmVersion() >= 20); 1642 assert(isABI && "Non-ABI compilation is not supported"); 1643 if (!isABI) 1644 return Chain; 1645 1646 std::vector<Type *> argTypes; 1647 std::vector<const Argument *> theArgs; 1648 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); 1649 I != E; ++I) { 1650 theArgs.push_back(I); 1651 argTypes.push_back(I->getType()); 1652 } 1653 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 1654 // Ins.size() will be larger 1655 // * if there is an aggregate argument with multiple fields (each field 1656 // showing up separately in Ins) 1657 // * if there is a vector argument with more than typical vector-length 1658 // elements (generally if more than 4) where each vector element is 1659 // individually present in Ins. 1660 // So a different index should be used for indexing into Ins. 1661 // See similar issue in LowerCall. 
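  // For example (illustrative only): for an IR signature like
  // ({i32, float}, <8 x float>), theArgs has 2 entries while Ins has
  // 2 + 8 = 10, since the struct is split into its fields and the 8-element
  // vector into individual elements.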
  unsigned InsIdx = 0;

  int idx = 0;
  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
    Type *Ty = argTypes[i];

    // If the kernel argument is image*_t or sampler_t, convert it to
    // an i32 constant holding the parameter position. This can later be
    // matched in the AsmPrinter to output the correct mangled name.
    if (isImageOrSamplerVal(
            theArgs[i],
            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                     : nullptr))) {
      assert(isKernel && "Only kernels can have image/sampler params");
      InVals.push_back(DAG.getConstant(i + 1, MVT::i32));
      continue;
    }

    if (theArgs[i]->use_empty()) {
      // argument is dead
      if (Ty->isAggregateType()) {
        SmallVector<EVT, 16> vtparts;

        ComputePTXValueVTs(*this, Ty, vtparts);
        assert(vtparts.size() > 0 && "empty aggregate type not expected");
        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
             ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (vtparts.size() > 0)
          --InsIdx;
        continue;
      }
      if (Ty->isVectorTy()) {
        EVT ObjectVT = getValueType(Ty);
        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
        for (unsigned parti = 0; parti < NumRegs; ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (NumRegs > 0)
          --InsIdx;
        continue;
      }
      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
      continue;
    }

    // In the following cases, assign a node order of "idx+1"
    // to newly created nodes. The SDNodes for params have to
    // appear in the same order as their order of appearance
    // in the original function. "idx+1" holds that order.
    if (PAL.hasAttribute(i + 1, Attribute::ByVal) == false) {
      if (Ty->isAggregateType()) {
        SmallVector<EVT, 16> vtparts;
        SmallVector<uint64_t, 16> offsets;

        // NOTE: Here, we lose the ability to issue vector loads for vectors
        // that are a part of a struct. This should be investigated in the
        // future.
        ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0);
        assert(vtparts.size() > 0 && "empty aggregate type not expected");
        bool aggregateIsPacked = false;
        if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
          aggregateIsPacked = STy->isPacked();

        SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
             ++parti) {
          EVT partVT = vtparts[parti];
          Value *srcValue = Constant::getNullValue(
              PointerType::get(partVT.getTypeForEVT(F->getContext()),
                               llvm::ADDRESS_SPACE_PARAM));
          SDValue srcAddr =
              DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
                          DAG.getConstant(offsets[parti], getPointerTy()));
          unsigned partAlign =
              aggregateIsPacked ? 1
                                : TD->getABITypeAlignment(
                                      partVT.getTypeForEVT(F->getContext()));
          SDValue p;
          if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
            ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
1746 ISD::SEXTLOAD : ISD::ZEXTLOAD; 1747 p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr, 1748 MachinePointerInfo(srcValue), partVT, false, 1749 false, partAlign); 1750 } else { 1751 p = DAG.getLoad(partVT, dl, Root, srcAddr, 1752 MachinePointerInfo(srcValue), false, false, false, 1753 partAlign); 1754 } 1755 if (p.getNode()) 1756 p.getNode()->setIROrder(idx + 1); 1757 InVals.push_back(p); 1758 ++InsIdx; 1759 } 1760 if (vtparts.size() > 0) 1761 --InsIdx; 1762 continue; 1763 } 1764 if (Ty->isVectorTy()) { 1765 EVT ObjectVT = getValueType(Ty); 1766 SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); 1767 unsigned NumElts = ObjectVT.getVectorNumElements(); 1768 assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts && 1769 "Vector was not scalarized"); 1770 unsigned Ofst = 0; 1771 EVT EltVT = ObjectVT.getVectorElementType(); 1772 1773 // V1 load 1774 // f32 = load ... 1775 if (NumElts == 1) { 1776 // We only have one element, so just directly load it 1777 Value *SrcValue = Constant::getNullValue(PointerType::get( 1778 EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); 1779 SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, 1780 DAG.getConstant(Ofst, getPointerTy())); 1781 SDValue P = DAG.getLoad( 1782 EltVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false, 1783 false, true, 1784 TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext()))); 1785 if (P.getNode()) 1786 P.getNode()->setIROrder(idx + 1); 1787 1788 if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) 1789 P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P); 1790 InVals.push_back(P); 1791 Ofst += TD->getTypeAllocSize(EltVT.getTypeForEVT(F->getContext())); 1792 ++InsIdx; 1793 } else if (NumElts == 2) { 1794 // V2 load 1795 // f32,f32 = load ... 1796 EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2); 1797 Value *SrcValue = Constant::getNullValue(PointerType::get( 1798 VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); 1799 SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, 1800 DAG.getConstant(Ofst, getPointerTy())); 1801 SDValue P = DAG.getLoad( 1802 VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false, 1803 false, true, 1804 TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); 1805 if (P.getNode()) 1806 P.getNode()->setIROrder(idx + 1); 1807 1808 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, 1809 DAG.getIntPtrConstant(0)); 1810 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, 1811 DAG.getIntPtrConstant(1)); 1812 1813 if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) { 1814 Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0); 1815 Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1); 1816 } 1817 1818 InVals.push_back(Elt0); 1819 InVals.push_back(Elt1); 1820 Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); 1821 InsIdx += 2; 1822 } else { 1823 // V4 loads 1824 // We have at least 4 elements (<3 x Ty> expands to 4 elements) and 1825 // the 1826 // vector will be expanded to a power of 2 elements, so we know we can 1827 // always round up to the next multiple of 4 when creating the vector 1828 // loads. 1829 // e.g. 
4 elem => 1 ld.v4 1830 // 6 elem => 2 ld.v4 1831 // 8 elem => 2 ld.v4 1832 // 11 elem => 3 ld.v4 1833 unsigned VecSize = 4; 1834 if (EltVT.getSizeInBits() == 64) { 1835 VecSize = 2; 1836 } 1837 EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); 1838 for (unsigned i = 0; i < NumElts; i += VecSize) { 1839 Value *SrcValue = Constant::getNullValue( 1840 PointerType::get(VecVT.getTypeForEVT(F->getContext()), 1841 llvm::ADDRESS_SPACE_PARAM)); 1842 SDValue SrcAddr = 1843 DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, 1844 DAG.getConstant(Ofst, getPointerTy())); 1845 SDValue P = DAG.getLoad( 1846 VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false, 1847 false, true, 1848 TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); 1849 if (P.getNode()) 1850 P.getNode()->setIROrder(idx + 1); 1851 1852 for (unsigned j = 0; j < VecSize; ++j) { 1853 if (i + j >= NumElts) 1854 break; 1855 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, 1856 DAG.getIntPtrConstant(j)); 1857 if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) 1858 Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt); 1859 InVals.push_back(Elt); 1860 } 1861 Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); 1862 } 1863 InsIdx += NumElts; 1864 } 1865 1866 if (NumElts > 0) 1867 --InsIdx; 1868 continue; 1869 } 1870 // A plain scalar. 1871 EVT ObjectVT = getValueType(Ty); 1872 // If ABI, load from the param symbol 1873 SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); 1874 Value *srcValue = Constant::getNullValue(PointerType::get( 1875 ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); 1876 SDValue p; 1877 if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) { 1878 ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 1879 ISD::SEXTLOAD : ISD::ZEXTLOAD; 1880 p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, Arg, 1881 MachinePointerInfo(srcValue), ObjectVT, false, false, 1882 TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); 1883 } else { 1884 p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg, 1885 MachinePointerInfo(srcValue), false, false, false, 1886 TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); 1887 } 1888 if (p.getNode()) 1889 p.getNode()->setIROrder(idx + 1); 1890 InVals.push_back(p); 1891 continue; 1892 } 1893 1894 // Param has ByVal attribute 1895 // Return MoveParam(param symbol). 1896 // Ideally, the param symbol can be returned directly, 1897 // but when SDNode builder decides to use it in a CopyToReg(), 1898 // machine instruction fails because TargetExternalSymbol 1899 // (not lowered) is target dependent, and CopyToReg assumes 1900 // the source is lowered. 1901 EVT ObjectVT = getValueType(Ty); 1902 assert(ObjectVT == Ins[InsIdx].VT && 1903 "Ins type did not match function type"); 1904 SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); 1905 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 1906 if (p.getNode()) 1907 p.getNode()->setIROrder(idx + 1); 1908 if (isKernel) 1909 InVals.push_back(p); 1910 else { 1911 SDValue p2 = DAG.getNode( 1912 ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT, 1913 DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), p); 1914 InVals.push_back(p2); 1915 } 1916 } 1917 1918 // Clang will check explicit VarArg and issue error if any. However, Clang 1919 // will let code with 1920 // implicit var arg like f() pass. See bug 617733. 1921 // We treat this case as if the arg list is empty. 
1922 // if (F.isVarArg()) { 1923 // assert(0 && "VarArg not supported yet!"); 1924 //} 1925 1926 if (!OutChains.empty()) 1927 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 1928 1929 return Chain; 1930 } 1931 1932 1933 SDValue 1934 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 1935 bool isVarArg, 1936 const SmallVectorImpl<ISD::OutputArg> &Outs, 1937 const SmallVectorImpl<SDValue> &OutVals, 1938 SDLoc dl, SelectionDAG &DAG) const { 1939 MachineFunction &MF = DAG.getMachineFunction(); 1940 const Function *F = MF.getFunction(); 1941 Type *RetTy = F->getReturnType(); 1942 const DataLayout *TD = getDataLayout(); 1943 1944 bool isABI = (nvptxSubtarget.getSmVersion() >= 20); 1945 assert(isABI && "Non-ABI compilation is not supported"); 1946 if (!isABI) 1947 return Chain; 1948 1949 if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) { 1950 // If we have a vector type, the OutVals array will be the scalarized 1951 // components and we have combine them into 1 or more vector stores. 1952 unsigned NumElts = VTy->getNumElements(); 1953 assert(NumElts == Outs.size() && "Bad scalarization of return value"); 1954 1955 // const_cast can be removed in later LLVM versions 1956 EVT EltVT = getValueType(RetTy).getVectorElementType(); 1957 bool NeedExtend = false; 1958 if (EltVT.getSizeInBits() < 16) 1959 NeedExtend = true; 1960 1961 // V1 store 1962 if (NumElts == 1) { 1963 SDValue StoreVal = OutVals[0]; 1964 // We only have one element, so just directly store it 1965 if (NeedExtend) 1966 StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); 1967 SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal }; 1968 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, 1969 DAG.getVTList(MVT::Other), Ops, 1970 EltVT, MachinePointerInfo()); 1971 1972 } else if (NumElts == 2) { 1973 // V2 store 1974 SDValue StoreVal0 = OutVals[0]; 1975 SDValue StoreVal1 = OutVals[1]; 1976 1977 if (NeedExtend) { 1978 StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0); 1979 StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1); 1980 } 1981 1982 SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal0, 1983 StoreVal1 }; 1984 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl, 1985 DAG.getVTList(MVT::Other), Ops, 1986 EltVT, MachinePointerInfo()); 1987 } else { 1988 // V4 stores 1989 // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the 1990 // vector will be expanded to a power of 2 elements, so we know we can 1991 // always round up to the next multiple of 4 when creating the vector 1992 // stores. 1993 // e.g. 4 elem => 1 st.v4 1994 // 6 elem => 2 st.v4 1995 // 8 elem => 2 st.v4 1996 // 11 elem => 3 st.v4 1997 1998 unsigned VecSize = 4; 1999 if (OutVals[0].getValueType().getSizeInBits() == 64) 2000 VecSize = 2; 2001 2002 unsigned Offset = 0; 2003 2004 EVT VecVT = 2005 EVT::getVectorVT(F->getContext(), EltVT, VecSize); 2006 unsigned PerStoreOffset = 2007 TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); 2008 2009 for (unsigned i = 0; i < NumElts; i += VecSize) { 2010 // Get values 2011 SDValue StoreVal; 2012 SmallVector<SDValue, 8> Ops; 2013 Ops.push_back(Chain); 2014 Ops.push_back(DAG.getConstant(Offset, MVT::i32)); 2015 unsigned Opc = NVPTXISD::StoreRetvalV2; 2016 EVT ExtendedVT = (NeedExtend) ? 
            MVT::i16 : OutVals[0].getValueType();

        StoreVal = OutVals[i];
        if (NeedExtend)
          StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
        Ops.push_back(StoreVal);

        if (i + 1 < NumElts) {
          StoreVal = OutVals[i + 1];
          if (NeedExtend)
            StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
        } else {
          StoreVal = DAG.getUNDEF(ExtendedVT);
        }
        Ops.push_back(StoreVal);

        if (VecSize == 4) {
          Opc = NVPTXISD::StoreRetvalV4;
          if (i + 2 < NumElts) {
            StoreVal = OutVals[i + 2];
            if (NeedExtend)
              StoreVal =
                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
          } else {
            StoreVal = DAG.getUNDEF(ExtendedVT);
          }
          Ops.push_back(StoreVal);

          if (i + 3 < NumElts) {
            StoreVal = OutVals[i + 3];
            if (NeedExtend)
              StoreVal =
                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
          } else {
            StoreVal = DAG.getUNDEF(ExtendedVT);
          }
          Ops.push_back(StoreVal);
        }

        // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
        Chain =
            DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
                                    EltVT, MachinePointerInfo());
        Offset += PerStoreOffset;
      }
    }
  } else {
    SmallVector<EVT, 16> ValVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0);
    assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");

    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
      SDValue theVal = OutVals[i];
      EVT TheValType = theVal.getValueType();
      unsigned numElems = 1;
      if (TheValType.isVector())
        numElems = TheValType.getVectorNumElements();
      for (unsigned j = 0, je = numElems; j != je; ++j) {
        SDValue TmpVal = theVal;
        if (TheValType.isVector())
          TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                               TheValType.getVectorElementType(), TmpVal,
                               DAG.getIntPtrConstant(j));
        EVT TheStoreType = ValVTs[i];
        if (RetTy->isIntegerTy() &&
            TD->getTypeAllocSizeInBits(RetTy) < 32) {
          // The following zero-extension is for integer types only, and
          // specifically not for aggregates.
          TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
          TheStoreType = MVT::i32;
        }
        else if (TmpVal.getValueType().getSizeInBits() < 16)
          TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);

        SDValue Ops[] = {
          Chain,
          DAG.getConstant(Offsets[i], MVT::i32),
          TmpVal };
        Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
                                        DAG.getVTList(MVT::Other), Ops,
                                        TheStoreType,
                                        MachinePointerInfo());
      }
    }
  }

  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
}


void NVPTXTargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  if (Constraint.length() > 1)
    return;
  else
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

// NVPTX supports vectors of legal types of any length in intrinsics, because
// the NVPTX-specific type legalizer will legalize them to a PTX-supported
// length.
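// For example, an intrinsic result of type <8 x float> is accepted here even
// though v8f32 is not itself a legal PTX vector type, because its f32 element
// type is legal and the vector is later split to a supported width.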
2119 bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const { 2120 if (isTypeLegal(VT)) 2121 return true; 2122 if (VT.isVector()) { 2123 MVT eVT = VT.getVectorElementType(); 2124 if (isTypeLegal(eVT)) 2125 return true; 2126 } 2127 return false; 2128 } 2129 2130 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 2131 switch (Intrinsic) { 2132 default: 2133 return 0; 2134 2135 case Intrinsic::nvvm_tex_1d_v4f32_i32: 2136 return NVPTXISD::Tex1DFloatI32; 2137 case Intrinsic::nvvm_tex_1d_v4f32_f32: 2138 return NVPTXISD::Tex1DFloatFloat; 2139 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 2140 return NVPTXISD::Tex1DFloatFloatLevel; 2141 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 2142 return NVPTXISD::Tex1DFloatFloatGrad; 2143 case Intrinsic::nvvm_tex_1d_v4i32_i32: 2144 return NVPTXISD::Tex1DI32I32; 2145 case Intrinsic::nvvm_tex_1d_v4i32_f32: 2146 return NVPTXISD::Tex1DI32Float; 2147 case Intrinsic::nvvm_tex_1d_level_v4i32_f32: 2148 return NVPTXISD::Tex1DI32FloatLevel; 2149 case Intrinsic::nvvm_tex_1d_grad_v4i32_f32: 2150 return NVPTXISD::Tex1DI32FloatGrad; 2151 2152 case Intrinsic::nvvm_tex_1d_array_v4f32_i32: 2153 return NVPTXISD::Tex1DArrayFloatI32; 2154 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 2155 return NVPTXISD::Tex1DArrayFloatFloat; 2156 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 2157 return NVPTXISD::Tex1DArrayFloatFloatLevel; 2158 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 2159 return NVPTXISD::Tex1DArrayFloatFloatGrad; 2160 case Intrinsic::nvvm_tex_1d_array_v4i32_i32: 2161 return NVPTXISD::Tex1DArrayI32I32; 2162 case Intrinsic::nvvm_tex_1d_array_v4i32_f32: 2163 return NVPTXISD::Tex1DArrayI32Float; 2164 case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32: 2165 return NVPTXISD::Tex1DArrayI32FloatLevel; 2166 case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32: 2167 return NVPTXISD::Tex1DArrayI32FloatGrad; 2168 2169 case Intrinsic::nvvm_tex_2d_v4f32_i32: 2170 return NVPTXISD::Tex2DFloatI32; 2171 case Intrinsic::nvvm_tex_2d_v4f32_f32: 2172 return NVPTXISD::Tex2DFloatFloat; 2173 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 2174 return NVPTXISD::Tex2DFloatFloatLevel; 2175 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 2176 return NVPTXISD::Tex2DFloatFloatGrad; 2177 case Intrinsic::nvvm_tex_2d_v4i32_i32: 2178 return NVPTXISD::Tex2DI32I32; 2179 case Intrinsic::nvvm_tex_2d_v4i32_f32: 2180 return NVPTXISD::Tex2DI32Float; 2181 case Intrinsic::nvvm_tex_2d_level_v4i32_f32: 2182 return NVPTXISD::Tex2DI32FloatLevel; 2183 case Intrinsic::nvvm_tex_2d_grad_v4i32_f32: 2184 return NVPTXISD::Tex2DI32FloatGrad; 2185 2186 case Intrinsic::nvvm_tex_2d_array_v4f32_i32: 2187 return NVPTXISD::Tex2DArrayFloatI32; 2188 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 2189 return NVPTXISD::Tex2DArrayFloatFloat; 2190 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 2191 return NVPTXISD::Tex2DArrayFloatFloatLevel; 2192 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 2193 return NVPTXISD::Tex2DArrayFloatFloatGrad; 2194 case Intrinsic::nvvm_tex_2d_array_v4i32_i32: 2195 return NVPTXISD::Tex2DArrayI32I32; 2196 case Intrinsic::nvvm_tex_2d_array_v4i32_f32: 2197 return NVPTXISD::Tex2DArrayI32Float; 2198 case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32: 2199 return NVPTXISD::Tex2DArrayI32FloatLevel; 2200 case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32: 2201 return NVPTXISD::Tex2DArrayI32FloatGrad; 2202 2203 case Intrinsic::nvvm_tex_3d_v4f32_i32: 2204 return NVPTXISD::Tex3DFloatI32; 2205 case Intrinsic::nvvm_tex_3d_v4f32_f32: 2206 return NVPTXISD::Tex3DFloatFloat; 2207 case 
Intrinsic::nvvm_tex_3d_level_v4f32_f32: 2208 return NVPTXISD::Tex3DFloatFloatLevel; 2209 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 2210 return NVPTXISD::Tex3DFloatFloatGrad; 2211 case Intrinsic::nvvm_tex_3d_v4i32_i32: 2212 return NVPTXISD::Tex3DI32I32; 2213 case Intrinsic::nvvm_tex_3d_v4i32_f32: 2214 return NVPTXISD::Tex3DI32Float; 2215 case Intrinsic::nvvm_tex_3d_level_v4i32_f32: 2216 return NVPTXISD::Tex3DI32FloatLevel; 2217 case Intrinsic::nvvm_tex_3d_grad_v4i32_f32: 2218 return NVPTXISD::Tex3DI32FloatGrad; 2219 } 2220 } 2221 2222 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 2223 switch (Intrinsic) { 2224 default: 2225 return 0; 2226 case Intrinsic::nvvm_suld_1d_i8_trap: 2227 return NVPTXISD::Suld1DI8Trap; 2228 case Intrinsic::nvvm_suld_1d_i16_trap: 2229 return NVPTXISD::Suld1DI16Trap; 2230 case Intrinsic::nvvm_suld_1d_i32_trap: 2231 return NVPTXISD::Suld1DI32Trap; 2232 case Intrinsic::nvvm_suld_1d_v2i8_trap: 2233 return NVPTXISD::Suld1DV2I8Trap; 2234 case Intrinsic::nvvm_suld_1d_v2i16_trap: 2235 return NVPTXISD::Suld1DV2I16Trap; 2236 case Intrinsic::nvvm_suld_1d_v2i32_trap: 2237 return NVPTXISD::Suld1DV2I32Trap; 2238 case Intrinsic::nvvm_suld_1d_v4i8_trap: 2239 return NVPTXISD::Suld1DV4I8Trap; 2240 case Intrinsic::nvvm_suld_1d_v4i16_trap: 2241 return NVPTXISD::Suld1DV4I16Trap; 2242 case Intrinsic::nvvm_suld_1d_v4i32_trap: 2243 return NVPTXISD::Suld1DV4I32Trap; 2244 case Intrinsic::nvvm_suld_1d_array_i8_trap: 2245 return NVPTXISD::Suld1DArrayI8Trap; 2246 case Intrinsic::nvvm_suld_1d_array_i16_trap: 2247 return NVPTXISD::Suld1DArrayI16Trap; 2248 case Intrinsic::nvvm_suld_1d_array_i32_trap: 2249 return NVPTXISD::Suld1DArrayI32Trap; 2250 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 2251 return NVPTXISD::Suld1DArrayV2I8Trap; 2252 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 2253 return NVPTXISD::Suld1DArrayV2I16Trap; 2254 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 2255 return NVPTXISD::Suld1DArrayV2I32Trap; 2256 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 2257 return NVPTXISD::Suld1DArrayV4I8Trap; 2258 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 2259 return NVPTXISD::Suld1DArrayV4I16Trap; 2260 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 2261 return NVPTXISD::Suld1DArrayV4I32Trap; 2262 case Intrinsic::nvvm_suld_2d_i8_trap: 2263 return NVPTXISD::Suld2DI8Trap; 2264 case Intrinsic::nvvm_suld_2d_i16_trap: 2265 return NVPTXISD::Suld2DI16Trap; 2266 case Intrinsic::nvvm_suld_2d_i32_trap: 2267 return NVPTXISD::Suld2DI32Trap; 2268 case Intrinsic::nvvm_suld_2d_v2i8_trap: 2269 return NVPTXISD::Suld2DV2I8Trap; 2270 case Intrinsic::nvvm_suld_2d_v2i16_trap: 2271 return NVPTXISD::Suld2DV2I16Trap; 2272 case Intrinsic::nvvm_suld_2d_v2i32_trap: 2273 return NVPTXISD::Suld2DV2I32Trap; 2274 case Intrinsic::nvvm_suld_2d_v4i8_trap: 2275 return NVPTXISD::Suld2DV4I8Trap; 2276 case Intrinsic::nvvm_suld_2d_v4i16_trap: 2277 return NVPTXISD::Suld2DV4I16Trap; 2278 case Intrinsic::nvvm_suld_2d_v4i32_trap: 2279 return NVPTXISD::Suld2DV4I32Trap; 2280 case Intrinsic::nvvm_suld_2d_array_i8_trap: 2281 return NVPTXISD::Suld2DArrayI8Trap; 2282 case Intrinsic::nvvm_suld_2d_array_i16_trap: 2283 return NVPTXISD::Suld2DArrayI16Trap; 2284 case Intrinsic::nvvm_suld_2d_array_i32_trap: 2285 return NVPTXISD::Suld2DArrayI32Trap; 2286 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 2287 return NVPTXISD::Suld2DArrayV2I8Trap; 2288 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 2289 return NVPTXISD::Suld2DArrayV2I16Trap; 2290 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 2291 return 
NVPTXISD::Suld2DArrayV2I32Trap; 2292 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 2293 return NVPTXISD::Suld2DArrayV4I8Trap; 2294 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 2295 return NVPTXISD::Suld2DArrayV4I16Trap; 2296 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 2297 return NVPTXISD::Suld2DArrayV4I32Trap; 2298 case Intrinsic::nvvm_suld_3d_i8_trap: 2299 return NVPTXISD::Suld3DI8Trap; 2300 case Intrinsic::nvvm_suld_3d_i16_trap: 2301 return NVPTXISD::Suld3DI16Trap; 2302 case Intrinsic::nvvm_suld_3d_i32_trap: 2303 return NVPTXISD::Suld3DI32Trap; 2304 case Intrinsic::nvvm_suld_3d_v2i8_trap: 2305 return NVPTXISD::Suld3DV2I8Trap; 2306 case Intrinsic::nvvm_suld_3d_v2i16_trap: 2307 return NVPTXISD::Suld3DV2I16Trap; 2308 case Intrinsic::nvvm_suld_3d_v2i32_trap: 2309 return NVPTXISD::Suld3DV2I32Trap; 2310 case Intrinsic::nvvm_suld_3d_v4i8_trap: 2311 return NVPTXISD::Suld3DV4I8Trap; 2312 case Intrinsic::nvvm_suld_3d_v4i16_trap: 2313 return NVPTXISD::Suld3DV4I16Trap; 2314 case Intrinsic::nvvm_suld_3d_v4i32_trap: 2315 return NVPTXISD::Suld3DV4I32Trap; 2316 } 2317 } 2318 2319 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 2320 // TgtMemIntrinsic 2321 // because we need the information that is only available in the "Value" type 2322 // of destination 2323 // pointer. In particular, the address space information. 2324 bool NVPTXTargetLowering::getTgtMemIntrinsic( 2325 IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { 2326 switch (Intrinsic) { 2327 default: 2328 return false; 2329 2330 case Intrinsic::nvvm_atomic_load_add_f32: 2331 Info.opc = ISD::INTRINSIC_W_CHAIN; 2332 Info.memVT = MVT::f32; 2333 Info.ptrVal = I.getArgOperand(0); 2334 Info.offset = 0; 2335 Info.vol = 0; 2336 Info.readMem = true; 2337 Info.writeMem = true; 2338 Info.align = 0; 2339 return true; 2340 2341 case Intrinsic::nvvm_atomic_load_inc_32: 2342 case Intrinsic::nvvm_atomic_load_dec_32: 2343 Info.opc = ISD::INTRINSIC_W_CHAIN; 2344 Info.memVT = MVT::i32; 2345 Info.ptrVal = I.getArgOperand(0); 2346 Info.offset = 0; 2347 Info.vol = 0; 2348 Info.readMem = true; 2349 Info.writeMem = true; 2350 Info.align = 0; 2351 return true; 2352 2353 case Intrinsic::nvvm_ldu_global_i: 2354 case Intrinsic::nvvm_ldu_global_f: 2355 case Intrinsic::nvvm_ldu_global_p: { 2356 2357 Info.opc = ISD::INTRINSIC_W_CHAIN; 2358 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 2359 Info.memVT = getValueType(I.getType()); 2360 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 2361 Info.memVT = getPointerTy(); 2362 else 2363 Info.memVT = getValueType(I.getType()); 2364 Info.ptrVal = I.getArgOperand(0); 2365 Info.offset = 0; 2366 Info.vol = 0; 2367 Info.readMem = true; 2368 Info.writeMem = false; 2369 2370 // alignment is available as metadata. 2371 // Grab it and set the alignment. 
2372 assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); 2373 MDNode *AlignMD = I.getMetadata("align"); 2374 assert(AlignMD && "Must have a non-null MDNode"); 2375 assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); 2376 Value *Align = AlignMD->getOperand(0); 2377 int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); 2378 Info.align = Alignment; 2379 2380 return true; 2381 } 2382 case Intrinsic::nvvm_ldg_global_i: 2383 case Intrinsic::nvvm_ldg_global_f: 2384 case Intrinsic::nvvm_ldg_global_p: { 2385 2386 Info.opc = ISD::INTRINSIC_W_CHAIN; 2387 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 2388 Info.memVT = getValueType(I.getType()); 2389 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 2390 Info.memVT = getPointerTy(); 2391 else 2392 Info.memVT = getValueType(I.getType()); 2393 Info.ptrVal = I.getArgOperand(0); 2394 Info.offset = 0; 2395 Info.vol = 0; 2396 Info.readMem = true; 2397 Info.writeMem = false; 2398 2399 // alignment is available as metadata. 2400 // Grab it and set the alignment. 2401 assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); 2402 MDNode *AlignMD = I.getMetadata("align"); 2403 assert(AlignMD && "Must have a non-null MDNode"); 2404 assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); 2405 Value *Align = AlignMD->getOperand(0); 2406 int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); 2407 Info.align = Alignment; 2408 2409 return true; 2410 } 2411 2412 case Intrinsic::nvvm_tex_1d_v4f32_i32: 2413 case Intrinsic::nvvm_tex_1d_v4f32_f32: 2414 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 2415 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 2416 case Intrinsic::nvvm_tex_1d_array_v4f32_i32: 2417 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 2418 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 2419 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 2420 case Intrinsic::nvvm_tex_2d_v4f32_i32: 2421 case Intrinsic::nvvm_tex_2d_v4f32_f32: 2422 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 2423 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 2424 case Intrinsic::nvvm_tex_2d_array_v4f32_i32: 2425 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 2426 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 2427 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 2428 case Intrinsic::nvvm_tex_3d_v4f32_i32: 2429 case Intrinsic::nvvm_tex_3d_v4f32_f32: 2430 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 2431 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: { 2432 Info.opc = getOpcForTextureInstr(Intrinsic); 2433 Info.memVT = MVT::f32; 2434 Info.ptrVal = nullptr; 2435 Info.offset = 0; 2436 Info.vol = 0; 2437 Info.readMem = true; 2438 Info.writeMem = false; 2439 Info.align = 16; 2440 return true; 2441 } 2442 case Intrinsic::nvvm_tex_1d_v4i32_i32: 2443 case Intrinsic::nvvm_tex_1d_v4i32_f32: 2444 case Intrinsic::nvvm_tex_1d_level_v4i32_f32: 2445 case Intrinsic::nvvm_tex_1d_grad_v4i32_f32: 2446 case Intrinsic::nvvm_tex_1d_array_v4i32_i32: 2447 case Intrinsic::nvvm_tex_1d_array_v4i32_f32: 2448 case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32: 2449 case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32: 2450 case Intrinsic::nvvm_tex_2d_v4i32_i32: 2451 case Intrinsic::nvvm_tex_2d_v4i32_f32: 2452 case Intrinsic::nvvm_tex_2d_level_v4i32_f32: 2453 case Intrinsic::nvvm_tex_2d_grad_v4i32_f32: 2454 case Intrinsic::nvvm_tex_2d_array_v4i32_i32: 2455 case Intrinsic::nvvm_tex_2d_array_v4i32_f32: 2456 case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32: 2457 case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32: 2458 case 
Intrinsic::nvvm_tex_3d_v4i32_i32: 2459 case Intrinsic::nvvm_tex_3d_v4i32_f32: 2460 case Intrinsic::nvvm_tex_3d_level_v4i32_f32: 2461 case Intrinsic::nvvm_tex_3d_grad_v4i32_f32: { 2462 Info.opc = getOpcForTextureInstr(Intrinsic); 2463 Info.memVT = MVT::i32; 2464 Info.ptrVal = nullptr; 2465 Info.offset = 0; 2466 Info.vol = 0; 2467 Info.readMem = true; 2468 Info.writeMem = false; 2469 Info.align = 16; 2470 return true; 2471 } 2472 case Intrinsic::nvvm_suld_1d_i8_trap: 2473 case Intrinsic::nvvm_suld_1d_v2i8_trap: 2474 case Intrinsic::nvvm_suld_1d_v4i8_trap: 2475 case Intrinsic::nvvm_suld_1d_array_i8_trap: 2476 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 2477 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 2478 case Intrinsic::nvvm_suld_2d_i8_trap: 2479 case Intrinsic::nvvm_suld_2d_v2i8_trap: 2480 case Intrinsic::nvvm_suld_2d_v4i8_trap: 2481 case Intrinsic::nvvm_suld_2d_array_i8_trap: 2482 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 2483 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 2484 case Intrinsic::nvvm_suld_3d_i8_trap: 2485 case Intrinsic::nvvm_suld_3d_v2i8_trap: 2486 case Intrinsic::nvvm_suld_3d_v4i8_trap: { 2487 Info.opc = getOpcForSurfaceInstr(Intrinsic); 2488 Info.memVT = MVT::i8; 2489 Info.ptrVal = nullptr; 2490 Info.offset = 0; 2491 Info.vol = 0; 2492 Info.readMem = true; 2493 Info.writeMem = false; 2494 Info.align = 16; 2495 return true; 2496 } 2497 case Intrinsic::nvvm_suld_1d_i16_trap: 2498 case Intrinsic::nvvm_suld_1d_v2i16_trap: 2499 case Intrinsic::nvvm_suld_1d_v4i16_trap: 2500 case Intrinsic::nvvm_suld_1d_array_i16_trap: 2501 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 2502 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 2503 case Intrinsic::nvvm_suld_2d_i16_trap: 2504 case Intrinsic::nvvm_suld_2d_v2i16_trap: 2505 case Intrinsic::nvvm_suld_2d_v4i16_trap: 2506 case Intrinsic::nvvm_suld_2d_array_i16_trap: 2507 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 2508 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 2509 case Intrinsic::nvvm_suld_3d_i16_trap: 2510 case Intrinsic::nvvm_suld_3d_v2i16_trap: 2511 case Intrinsic::nvvm_suld_3d_v4i16_trap: { 2512 Info.opc = getOpcForSurfaceInstr(Intrinsic); 2513 Info.memVT = MVT::i16; 2514 Info.ptrVal = nullptr; 2515 Info.offset = 0; 2516 Info.vol = 0; 2517 Info.readMem = true; 2518 Info.writeMem = false; 2519 Info.align = 16; 2520 return true; 2521 } 2522 case Intrinsic::nvvm_suld_1d_i32_trap: 2523 case Intrinsic::nvvm_suld_1d_v2i32_trap: 2524 case Intrinsic::nvvm_suld_1d_v4i32_trap: 2525 case Intrinsic::nvvm_suld_1d_array_i32_trap: 2526 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 2527 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 2528 case Intrinsic::nvvm_suld_2d_i32_trap: 2529 case Intrinsic::nvvm_suld_2d_v2i32_trap: 2530 case Intrinsic::nvvm_suld_2d_v4i32_trap: 2531 case Intrinsic::nvvm_suld_2d_array_i32_trap: 2532 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 2533 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 2534 case Intrinsic::nvvm_suld_3d_i32_trap: 2535 case Intrinsic::nvvm_suld_3d_v2i32_trap: 2536 case Intrinsic::nvvm_suld_3d_v4i32_trap: { 2537 Info.opc = getOpcForSurfaceInstr(Intrinsic); 2538 Info.memVT = MVT::i32; 2539 Info.ptrVal = nullptr; 2540 Info.offset = 0; 2541 Info.vol = 0; 2542 Info.readMem = true; 2543 Info.writeMem = false; 2544 Info.align = 16; 2545 return true; 2546 } 2547 2548 } 2549 return false; 2550 } 2551 2552 /// isLegalAddressingMode - Return true if the addressing mode represented 2553 /// by AM is legal for this target, for a load/store of the specified type. 
2554 /// Used to guide target specific optimizations, like loop strength reduction 2555 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 2556 /// (CodeGenPrepare.cpp) 2557 bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM, 2558 Type *Ty) const { 2559 2560 // AddrMode - This represents an addressing mode of: 2561 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 2562 // 2563 // The legal address modes are 2564 // - [avar] 2565 // - [areg] 2566 // - [areg+immoff] 2567 // - [immAddr] 2568 2569 if (AM.BaseGV) { 2570 if (AM.BaseOffs || AM.HasBaseReg || AM.Scale) 2571 return false; 2572 return true; 2573 } 2574 2575 switch (AM.Scale) { 2576 case 0: // "r", "r+i" or "i" is allowed 2577 break; 2578 case 1: 2579 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 2580 return false; 2581 // Otherwise we have r+i. 2582 break; 2583 default: 2584 // No scale > 1 is allowed 2585 return false; 2586 } 2587 return true; 2588 } 2589 2590 //===----------------------------------------------------------------------===// 2591 // NVPTX Inline Assembly Support 2592 //===----------------------------------------------------------------------===// 2593 2594 /// getConstraintType - Given a constraint letter, return the type of 2595 /// constraint it is for this target. 2596 NVPTXTargetLowering::ConstraintType 2597 NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { 2598 if (Constraint.size() == 1) { 2599 switch (Constraint[0]) { 2600 default: 2601 break; 2602 case 'b': 2603 case 'r': 2604 case 'h': 2605 case 'c': 2606 case 'l': 2607 case 'f': 2608 case 'd': 2609 case '0': 2610 case 'N': 2611 return C_RegisterClass; 2612 } 2613 } 2614 return TargetLowering::getConstraintType(Constraint); 2615 } 2616 2617 std::pair<unsigned, const TargetRegisterClass *> 2618 NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 2619 MVT VT) const { 2620 if (Constraint.size() == 1) { 2621 switch (Constraint[0]) { 2622 case 'b': 2623 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 2624 case 'c': 2625 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 2626 case 'h': 2627 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 2628 case 'r': 2629 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 2630 case 'l': 2631 case 'N': 2632 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 2633 case 'f': 2634 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 2635 case 'd': 2636 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 2637 } 2638 } 2639 return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 2640 } 2641 2642 /// getFunctionAlignment - Return the Log2 alignment of this function. 2643 unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { 2644 return 4; 2645 } 2646 2647 //===----------------------------------------------------------------------===// 2648 // NVPTX DAG Combining 2649 //===----------------------------------------------------------------------===// 2650 2651 extern unsigned FMAContractLevel; 2652 2653 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 2654 /// operands N0 and N1. This is a helper for PerformADDCombine that is 2655 /// called with the default operands, and if that fails, with commuted 2656 /// operands. 
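/// For example, (add (mul a, b), c) on i32 can become NVPTXISD::IMAD, and
/// (fadd (fmul a, b), c) on f32/f64 can become ISD::FMA when FMA contraction
/// is enabled; both rewrites are guarded by the heuristics below.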
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const NVPTXSubtarget &Subtarget,
                                             CodeGenOpt::Level OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip vector types; only the scalar case is handled here.
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
    // For integer:
    // Since integer multiply-add costs the same as integer multiply
    // but is more costly than integer add, do the fusion only when
    // the mul is only used in the add.
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  }
  else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      if (FMAContractLevel == 0)
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has fewer than five uses and all
      // of them are adds.
      // The heuristic is that if a use is not an add, then that use cannot
      // be fused into an fma, so the mul is still needed anyway.
      // If there are more than four uses, even if they are all adds, fusing
      // them will increase register pressure.
      //
      int numUses = 0;
      int nonAddCount = 0;
      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
                                UE = N0.getNode()->use_end();
           UI != UE; ++UI) {
        numUses++;
        SDNode *User = *UI;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // A simple heuristic for potential register pressure: the difference
        // between the two IR orders approximates the distance between the
        // def and this use, and a longer distance is more likely to cause
        // register pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // the node N, which guarantees that the FMA will not increase
        // register pressure at node N.
        bool opIsLive = false;
        const SDNode *left = N0.getOperand(0).getNode();
        const SDNode *right = N0.getOperand(1).getNode();

        if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right))
          opIsLive = true;

        if (!opIsLive)
          for (SDNode::use_iterator UI = left->use_begin(),
                                    UE = left->use_end();
               UI != UE; ++UI) {
            SDNode *User = *UI;
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          for (SDNode::use_iterator UI = right->use_begin(),
                                    UE = right->use_end();
               UI != UE; ++UI) {
            SDNode *User = *UI;
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          return SDValue();
      }

      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                         N0.getOperand(0), N0.getOperand(1), N1);
    }
  }

  return SDValue();
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
2760 /// 2761 static SDValue PerformADDCombine(SDNode *N, 2762 TargetLowering::DAGCombinerInfo &DCI, 2763 const NVPTXSubtarget &Subtarget, 2764 CodeGenOpt::Level OptLevel) { 2765 SDValue N0 = N->getOperand(0); 2766 SDValue N1 = N->getOperand(1); 2767 2768 // First try with the default operand order. 2769 SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, 2770 OptLevel); 2771 if (Result.getNode()) 2772 return Result; 2773 2774 // If that didn't work, try again with the operands commuted. 2775 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); 2776 } 2777 2778 static SDValue PerformANDCombine(SDNode *N, 2779 TargetLowering::DAGCombinerInfo &DCI) { 2780 // The type legalizer turns a vector load of i8 values into a zextload to i16 2781 // registers, optionally ANY_EXTENDs it (if target type is integer), 2782 // and ANDs off the high 8 bits. Since we turn this load into a 2783 // target-specific DAG node, the DAG combiner fails to eliminate these AND 2784 // nodes. Do that here. 2785 SDValue Val = N->getOperand(0); 2786 SDValue Mask = N->getOperand(1); 2787 2788 if (isa<ConstantSDNode>(Val)) { 2789 std::swap(Val, Mask); 2790 } 2791 2792 SDValue AExt; 2793 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 2794 if (Val.getOpcode() == ISD::ANY_EXTEND) { 2795 AExt = Val; 2796 Val = Val->getOperand(0); 2797 } 2798 2799 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 2800 Val = Val->getOperand(0); 2801 } 2802 2803 if (Val->getOpcode() == NVPTXISD::LoadV2 || 2804 Val->getOpcode() == NVPTXISD::LoadV4) { 2805 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 2806 if (!MaskCnst) { 2807 // Not an AND with a constant 2808 return SDValue(); 2809 } 2810 2811 uint64_t MaskVal = MaskCnst->getZExtValue(); 2812 if (MaskVal != 0xff) { 2813 // Not an AND that chops off top 8 bits 2814 return SDValue(); 2815 } 2816 2817 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 2818 if (!Mem) { 2819 // Not a MemSDNode?!? 2820 return SDValue(); 2821 } 2822 2823 EVT MemVT = Mem->getMemoryVT(); 2824 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 2825 // We only handle the i8 case 2826 return SDValue(); 2827 } 2828 2829 unsigned ExtType = 2830 cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> 2831 getZExtValue(); 2832 if (ExtType == ISD::SEXTLOAD) { 2833 // If for some reason the load is a sextload, the and is needed to zero 2834 // out the high 8 bits 2835 return SDValue(); 2836 } 2837 2838 bool AddTo = false; 2839 if (AExt.getNode() != 0) { 2840 // Re-insert the ext as a zext. 2841 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 2842 AExt.getValueType(), Val); 2843 AddTo = true; 2844 } 2845 2846 // If we get here, the AND is unnecessary. Just replace it with the load 2847 DCI.CombineTo(N, Val, AddTo); 2848 } 2849 2850 return SDValue(); 2851 } 2852 2853 enum OperandSignedness { 2854 Signed = 0, 2855 Unsigned, 2856 Unknown 2857 }; 2858 2859 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 2860 /// that can be demoted to \p OptSize bits without loss of information. The 2861 /// signedness of the operand, if determinable, is placed in \p S. 
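/// For example, (sext i16 %x to i32) is demotable to 16 bits with \p S set to
/// Signed, and (zext i16 %x to i32) is demotable with \p S set to Unsigned.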
2862 static bool IsMulWideOperandDemotable(SDValue Op, 2863 unsigned OptSize, 2864 OperandSignedness &S) { 2865 S = Unknown; 2866 2867 if (Op.getOpcode() == ISD::SIGN_EXTEND || 2868 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 2869 EVT OrigVT = Op.getOperand(0).getValueType(); 2870 if (OrigVT.getSizeInBits() == OptSize) { 2871 S = Signed; 2872 return true; 2873 } 2874 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 2875 EVT OrigVT = Op.getOperand(0).getValueType(); 2876 if (OrigVT.getSizeInBits() == OptSize) { 2877 S = Unsigned; 2878 return true; 2879 } 2880 } 2881 2882 return false; 2883 } 2884 2885 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 2886 /// be demoted to \p OptSize bits without loss of information. If the operands 2887 /// contain a constant, it should appear as the RHS operand. The signedness of 2888 /// the operands is placed in \p IsSigned. 2889 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 2890 unsigned OptSize, 2891 bool &IsSigned) { 2892 2893 OperandSignedness LHSSign; 2894 2895 // The LHS operand must be a demotable op 2896 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 2897 return false; 2898 2899 // We should have been able to determine the signedness from the LHS 2900 if (LHSSign == Unknown) 2901 return false; 2902 2903 IsSigned = (LHSSign == Signed); 2904 2905 // The RHS can be a demotable op or a constant 2906 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 2907 APInt Val = CI->getAPIntValue(); 2908 if (LHSSign == Unsigned) { 2909 if (Val.isIntN(OptSize)) { 2910 return true; 2911 } 2912 return false; 2913 } else { 2914 if (Val.isSignedIntN(OptSize)) { 2915 return true; 2916 } 2917 return false; 2918 } 2919 } else { 2920 OperandSignedness RHSSign; 2921 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 2922 return false; 2923 2924 if (LHSSign != RHSSign) 2925 return false; 2926 2927 return true; 2928 } 2929 } 2930 2931 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 2932 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 2933 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 2934 /// amount. 
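/// For example, (mul (sext i16 %a), (sext i16 %b)) producing i32 can become
/// NVPTXISD::MUL_WIDE_SIGNED on the truncated operands, and a shl by a
/// constant is handled by treating it as a multiply by the corresponding
/// power of two.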
2935 static SDValue TryMULWIDECombine(SDNode *N, 2936 TargetLowering::DAGCombinerInfo &DCI) { 2937 EVT MulType = N->getValueType(0); 2938 if (MulType != MVT::i32 && MulType != MVT::i64) { 2939 return SDValue(); 2940 } 2941 2942 unsigned OptSize = MulType.getSizeInBits() >> 1; 2943 SDValue LHS = N->getOperand(0); 2944 SDValue RHS = N->getOperand(1); 2945 2946 // Canonicalize the multiply so the constant (if any) is on the right 2947 if (N->getOpcode() == ISD::MUL) { 2948 if (isa<ConstantSDNode>(LHS)) { 2949 std::swap(LHS, RHS); 2950 } 2951 } 2952 2953 // If we have a SHL, determine the actual multiply amount 2954 if (N->getOpcode() == ISD::SHL) { 2955 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 2956 if (!ShlRHS) { 2957 return SDValue(); 2958 } 2959 2960 APInt ShiftAmt = ShlRHS->getAPIntValue(); 2961 unsigned BitWidth = MulType.getSizeInBits(); 2962 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 2963 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 2964 RHS = DCI.DAG.getConstant(MulVal, MulType); 2965 } else { 2966 return SDValue(); 2967 } 2968 } 2969 2970 bool Signed; 2971 // Verify that our operands are demotable 2972 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 2973 return SDValue(); 2974 } 2975 2976 EVT DemotedVT; 2977 if (MulType == MVT::i32) { 2978 DemotedVT = MVT::i16; 2979 } else { 2980 DemotedVT = MVT::i32; 2981 } 2982 2983 // Truncate the operands to the correct size. Note that these are just for 2984 // type consistency and will (likely) be eliminated in later phases. 2985 SDValue TruncLHS = 2986 DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS); 2987 SDValue TruncRHS = 2988 DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS); 2989 2990 unsigned Opc; 2991 if (Signed) { 2992 Opc = NVPTXISD::MUL_WIDE_SIGNED; 2993 } else { 2994 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 2995 } 2996 2997 return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS); 2998 } 2999 3000 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. 3001 static SDValue PerformMULCombine(SDNode *N, 3002 TargetLowering::DAGCombinerInfo &DCI, 3003 CodeGenOpt::Level OptLevel) { 3004 if (OptLevel > 0) { 3005 // Try mul.wide combining at OptLevel > 0 3006 SDValue Ret = TryMULWIDECombine(N, DCI); 3007 if (Ret.getNode()) 3008 return Ret; 3009 } 3010 3011 return SDValue(); 3012 } 3013 3014 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. 3015 static SDValue PerformSHLCombine(SDNode *N, 3016 TargetLowering::DAGCombinerInfo &DCI, 3017 CodeGenOpt::Level OptLevel) { 3018 if (OptLevel > 0) { 3019 // Try mul.wide combining at OptLevel > 0 3020 SDValue Ret = TryMULWIDECombine(N, DCI); 3021 if (Ret.getNode()) 3022 return Ret; 3023 } 3024 3025 return SDValue(); 3026 } 3027 3028 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, 3029 DAGCombinerInfo &DCI) const { 3030 // FIXME: Get this from the DAG somehow 3031 CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive; 3032 switch (N->getOpcode()) { 3033 default: break; 3034 case ISD::ADD: 3035 case ISD::FADD: 3036 return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel); 3037 case ISD::MUL: 3038 return PerformMULCombine(N, DCI, OptLevel); 3039 case ISD::SHL: 3040 return PerformSHLCombine(N, DCI, OptLevel); 3041 case ISD::AND: 3042 return PerformANDCombine(N, DCI); 3043 } 3044 return SDValue(); 3045 } 3046 3047 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. 
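/// For example, a load of <4 x float> is rebuilt here as an NVPTXISD::LoadV4
/// node with four f32 results plus a chain, and the scalar results are then
/// reassembled into a BUILD_VECTOR for the original vector uses.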
3048 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, 3049 SmallVectorImpl<SDValue> &Results) { 3050 EVT ResVT = N->getValueType(0); 3051 SDLoc DL(N); 3052 3053 assert(ResVT.isVector() && "Vector load must have vector type"); 3054 3055 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 3056 // legal. We can (and should) split that into 2 loads of <2 x double> here 3057 // but I'm leaving that as a TODO for now. 3058 assert(ResVT.isSimple() && "Can only handle simple types"); 3059 switch (ResVT.getSimpleVT().SimpleTy) { 3060 default: 3061 return; 3062 case MVT::v2i8: 3063 case MVT::v2i16: 3064 case MVT::v2i32: 3065 case MVT::v2i64: 3066 case MVT::v2f32: 3067 case MVT::v2f64: 3068 case MVT::v4i8: 3069 case MVT::v4i16: 3070 case MVT::v4i32: 3071 case MVT::v4f32: 3072 // This is a "native" vector type 3073 break; 3074 } 3075 3076 EVT EltVT = ResVT.getVectorElementType(); 3077 unsigned NumElts = ResVT.getVectorNumElements(); 3078 3079 // Since LoadV2 is a target node, we cannot rely on DAG type legalization. 3080 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 3081 // loaded type to i16 and propagate the "real" type as the memory type. 3082 bool NeedTrunc = false; 3083 if (EltVT.getSizeInBits() < 16) { 3084 EltVT = MVT::i16; 3085 NeedTrunc = true; 3086 } 3087 3088 unsigned Opcode = 0; 3089 SDVTList LdResVTs; 3090 3091 switch (NumElts) { 3092 default: 3093 return; 3094 case 2: 3095 Opcode = NVPTXISD::LoadV2; 3096 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 3097 break; 3098 case 4: { 3099 Opcode = NVPTXISD::LoadV4; 3100 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 3101 LdResVTs = DAG.getVTList(ListVTs); 3102 break; 3103 } 3104 } 3105 3106 SmallVector<SDValue, 8> OtherOps; 3107 3108 // Copy regular operands 3109 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 3110 OtherOps.push_back(N->getOperand(i)); 3111 3112 LoadSDNode *LD = cast<LoadSDNode>(N); 3113 3114 // The select routine does not have access to the LoadSDNode instance, so 3115 // pass along the extension information 3116 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType())); 3117 3118 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 3119 LD->getMemoryVT(), 3120 LD->getMemOperand()); 3121 3122 SmallVector<SDValue, 4> ScalarRes; 3123 3124 for (unsigned i = 0; i < NumElts; ++i) { 3125 SDValue Res = NewLD.getValue(i); 3126 if (NeedTrunc) 3127 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 3128 ScalarRes.push_back(Res); 3129 } 3130 3131 SDValue LoadChain = NewLD.getValue(NumElts); 3132 3133 SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); 3134 3135 Results.push_back(BuildVec); 3136 Results.push_back(LoadChain); 3137 } 3138 3139 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, 3140 SmallVectorImpl<SDValue> &Results) { 3141 SDValue Chain = N->getOperand(0); 3142 SDValue Intrin = N->getOperand(1); 3143 SDLoc DL(N); 3144 3145 // Get the intrinsic ID 3146 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); 3147 switch (IntrinNo) { 3148 default: 3149 return; 3150 case Intrinsic::nvvm_ldg_global_i: 3151 case Intrinsic::nvvm_ldg_global_f: 3152 case Intrinsic::nvvm_ldg_global_p: 3153 case Intrinsic::nvvm_ldu_global_i: 3154 case Intrinsic::nvvm_ldu_global_f: 3155 case Intrinsic::nvvm_ldu_global_p: { 3156 EVT ResVT = N->getValueType(0); 3157 3158 if (ResVT.isVector()) { 3159 // Vector LDG/LDU 3160 3161 unsigned NumElts = 
ResVT.getVectorNumElements(); 3162 EVT EltVT = ResVT.getVectorElementType(); 3163 3164 // Since LDU/LDG are target nodes, we cannot rely on DAG type 3165 // legalization. 3166 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 3167 // loaded type to i16 and propagate the "real" type as the memory type. 3168 bool NeedTrunc = false; 3169 if (EltVT.getSizeInBits() < 16) { 3170 EltVT = MVT::i16; 3171 NeedTrunc = true; 3172 } 3173 3174 unsigned Opcode = 0; 3175 SDVTList LdResVTs; 3176 3177 switch (NumElts) { 3178 default: 3179 return; 3180 case 2: 3181 switch (IntrinNo) { 3182 default: 3183 return; 3184 case Intrinsic::nvvm_ldg_global_i: 3185 case Intrinsic::nvvm_ldg_global_f: 3186 case Intrinsic::nvvm_ldg_global_p: 3187 Opcode = NVPTXISD::LDGV2; 3188 break; 3189 case Intrinsic::nvvm_ldu_global_i: 3190 case Intrinsic::nvvm_ldu_global_f: 3191 case Intrinsic::nvvm_ldu_global_p: 3192 Opcode = NVPTXISD::LDUV2; 3193 break; 3194 } 3195 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 3196 break; 3197 case 4: { 3198 switch (IntrinNo) { 3199 default: 3200 return; 3201 case Intrinsic::nvvm_ldg_global_i: 3202 case Intrinsic::nvvm_ldg_global_f: 3203 case Intrinsic::nvvm_ldg_global_p: 3204 Opcode = NVPTXISD::LDGV4; 3205 break; 3206 case Intrinsic::nvvm_ldu_global_i: 3207 case Intrinsic::nvvm_ldu_global_f: 3208 case Intrinsic::nvvm_ldu_global_p: 3209 Opcode = NVPTXISD::LDUV4; 3210 break; 3211 } 3212 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 3213 LdResVTs = DAG.getVTList(ListVTs); 3214 break; 3215 } 3216 } 3217 3218 SmallVector<SDValue, 8> OtherOps; 3219 3220 // Copy regular operands 3221 3222 OtherOps.push_back(Chain); // Chain 3223 // Skip operand 1 (intrinsic ID) 3224 // Others 3225 for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) 3226 OtherOps.push_back(N->getOperand(i)); 3227 3228 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 3229 3230 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 3231 MemSD->getMemoryVT(), 3232 MemSD->getMemOperand()); 3233 3234 SmallVector<SDValue, 4> ScalarRes; 3235 3236 for (unsigned i = 0; i < NumElts; ++i) { 3237 SDValue Res = NewLD.getValue(i); 3238 if (NeedTrunc) 3239 Res = 3240 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 3241 ScalarRes.push_back(Res); 3242 } 3243 3244 SDValue LoadChain = NewLD.getValue(NumElts); 3245 3246 SDValue BuildVec = 3247 DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); 3248 3249 Results.push_back(BuildVec); 3250 Results.push_back(LoadChain); 3251 } else { 3252 // i8 LDG/LDU 3253 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && 3254 "Custom handling of non-i8 ldu/ldg?"); 3255 3256 // Just copy all operands as-is 3257 SmallVector<SDValue, 4> Ops; 3258 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 3259 Ops.push_back(N->getOperand(i)); 3260 3261 // Force output to i16 3262 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); 3263 3264 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 3265 3266 // We make sure the memory type is i8, which will be used during isel 3267 // to select the proper instruction. 
3268 SDValue NewLD = 3269 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, 3270 MVT::i8, MemSD->getMemOperand()); 3271 3272 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, 3273 NewLD.getValue(0))); 3274 Results.push_back(NewLD.getValue(1)); 3275 } 3276 } 3277 } 3278 } 3279 3280 void NVPTXTargetLowering::ReplaceNodeResults( 3281 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 3282 switch (N->getOpcode()) { 3283 default: 3284 report_fatal_error("Unhandled custom legalization"); 3285 case ISD::LOAD: 3286 ReplaceLoadVector(N, DAG, Results); 3287 return; 3288 case ISD::INTRINSIC_W_CHAIN: 3289 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); 3290 return; 3291 } 3292 } 3293 3294 // Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file. 3295 void NVPTXSection::anchor() {} 3296 3297 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { 3298 delete TextSection; 3299 delete DataSection; 3300 delete BSSSection; 3301 delete ReadOnlySection; 3302 3303 delete StaticCtorSection; 3304 delete StaticDtorSection; 3305 delete LSDASection; 3306 delete EHFrameSection; 3307 delete DwarfAbbrevSection; 3308 delete DwarfInfoSection; 3309 delete DwarfLineSection; 3310 delete DwarfFrameSection; 3311 delete DwarfPubTypesSection; 3312 delete DwarfDebugInlineSection; 3313 delete DwarfStrSection; 3314 delete DwarfLocSection; 3315 delete DwarfARangesSection; 3316 delete DwarfRangesSection; 3317 delete DwarfMacroInfoSection; 3318 } 3319