//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM,
                                       const AMDGPUSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  // Set condition code actions
  setCondCodeAction(ISD::SETO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
  }

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

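  // Note: judging from the actions below, nothing i64 is handled natively on
  // these 32-bit ALUs; 64-bit integer arithmetic is either expanded into
  // 32-bit pieces or custom-lowered (division, remainder, *_PARTS shifts).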
  setOperationAction(ISD::SUB, MVT::i64, Expand);

  // These should be replaced by UDIVREM, but it does not happen automatically
  // during Type Legalization
  setOperationAction(ISD::UDIV, MVT::i64, Custom);
  setOperationAction(ISD::UREM, MVT::i64, Custom);
  setOperationAction(ISD::SDIV, MVT::i64, Custom);
  setOperationAction(ISD::SREM, MVT::i64, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      // LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
          MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
          SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::AMDGPU_read_workdim:
      return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    case Intrinsic::AMDGPU_rsq:
      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_fract:
    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
      return;
    }
    // Fall-through. Since we don't care about out of bounds values
    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    // considers some extra cases which are not necessary here.
  case ISD::FP_TO_SINT: {
    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::UDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
        N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM);
    break;
  }
  case ISD::UREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
        N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM.getValue(1));
    break;
  }
  case ISD::SDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
        N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM);
    break;
  }
  case ISD::SREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
        N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM.getValue(1));
    break;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    LowerUDIVREM64(Op, DAG, Results);
    break;
  }
  }
}

SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {

  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements();
       i != e; ++i) {
    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                               Vector, DAG.getConstant(i, getVectorIdxTy())));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}

SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {

  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}

SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
          DAG.getConstantFP(0.15915494309, MVT::f32)),
        DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
        DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}

SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One = DAG.getConstant(1, VT);

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case.
  // Without it the CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative is to
  // add a conditional to filter the special case.
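  //
  // A quick sanity check of the two-step form for Shift == 0 (32-bit parts):
  // CompShift is 31, so Overflow == (Lo >> 31) >> 1 == 0, exactly the
  // "no bits spill into Hi" result we want. The single-step Lo >> (32 - Shift)
  // would instead be a shift by 32, which is the incorrect case noted above.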

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One = DAG.getConstant(1, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case.
  // Without it the CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative is to
  // add a conditional to filter the special case.

  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
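  // (The implicit parameters used by the intrinsic handlers above, ngroups,
  // global size and local size, each with x/y/z components, sit at dword
  // offsets 0-8, so ByteOffset is at most 32 for those.)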
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1, 0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
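    // For instance, "select_cc f32 %a, %b, -1.0, 0.0, setogt" should collapse
    // into a single SET*-style ALU op rather than a compare plus a select.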
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                             LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
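/// For example, given the shift amounts chosen below: at \p StackWidth 1 each
/// slot is one dword, so byte address 24 becomes register index 24 >> 2 == 6;
/// at \p StackWidth 2 a row is 8 bytes (24 >> 3 == 3); at \p StackWidth 4 a
/// row is a full 16-byte register (24 >> 4 == 1).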
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
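      // (That is, shift right by 2: e.g. byte address 8 becomes dword
      // address 2.)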
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                        Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// return (512 + (kc_bank << 12))
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
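  // Rough order of the cases handled below: generic AMDGPU lowering first,
  // then constant-address-space globals, scalarized local vector loads,
  // kc-bank constant-buffer fetches, manually expanded SEXT loads, and
  // finally indirect loads from the private stack.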
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
  if (Ret.getNode()) {
    SDValue Ops[2] = {
      Ret,
      Chain
    };
    return DAG.getMergeValues(Ops, DL);
  }

  // Lower loads of constant-address-space global variables
  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(GetUnderlyingObject(
          LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {

    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
        DAG.getConstant(2, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
                       LoadNode->getChain(), Ptr,
                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
  }

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      ScalarizeVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want Const position encoded with the following formula :
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan) * 4) here and
        // then div by 4 at the ISel step
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
                           makeArrayRef(Slots, NumElements));
    } else {
      // non-constant ptr can't be folded, keep it as a v4f32 load
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However, SEXT loads from other address spaces are not supported,
  // so we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->isInvariant(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2] = {
    LoweredLoad,
    Chain
  };

  return DAG.getMergeValues(Ops, DL);
}

SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Jump = Op.getOperand(2);

  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
                     Chain, Jump, Cond);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
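/// Note: for compute kernels the arguments below are fetched from
/// CONSTANT_BUFFER_0 starting at byte offset 36, because the first nine
/// dwords (9 * 4 == 36 bytes) hold the implicit ngroups / global-size /
/// local-size values that LowerImplicitParameter reads at dword offsets 0-8.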
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain,
    CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins,
    SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    const ISD::InputArg &In = Ins[i];
    EVT VT = In.VT;
    EVT MemVT = VA.getLocVT();
    if (!VT.isVector() && MemVT.isVector()) {
      // Get the load source type if scalarized.
      MemVT = MemVT.getVectorElementType();
    }

    if (MFI->getShaderType() != ShaderType::COMPUTE) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // i64 isn't a legal type, so the register type used ends up as i32,
    // which isn't expected here. It attempts to create this sextload, but
    // it ends up being invalid. Somehow this seems to work with i64
    // arguments, but breaks for <1 x i64>.

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
      // FIXME: This should really check the extload type, but the handling
      // of extload vector parameters seems to be broken.

      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      Ext = ISD::SEXTLOAD;
    }

    // Compute the offset from the value.
    // XXX - I think PartOffset should give you this, but it seems to give
    // the size of the register which isn't useful.

    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    unsigned PartOffset = VA.getLocMemOffset();
    unsigned Offset = 36 + PartOffset;

    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    // 4 is the preferred alignment for the CONSTANT memory space.
    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
                              DAG.getConstant(Offset, MVT::i32),
                              DAG.getUNDEF(MVT::i32),
                              PtrInfo,
                              MemVT, false, true, true, 4);

    InVals.push_back(Arg);
    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
  }
  return Chain;
}
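// Worked example for the offset computation above (hypothetical kernel,
// and assuming the calling convention assigns consecutive 4-byte slots):
// for a compute kernel f(i32 %a, i32 %b), %a would get LocMemOffset 0 and
// %b LocMemOffset 4, so they are loaded from CONSTANT_BUFFER_0 at byte
// offsets 36 and 40 respectively, just past the 36-byte header that holds
// the thread group and global size information.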
EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

static SDValue CompactSwizzlableVector(
    SelectionDAG &DAG, SDValue VectorEntry,
    DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // We mask-write here to teach later passes that the ith element of
      // this vector is undef. Thus we can use it to reduce 128-bit register
      // usage, break false dependencies and additionally make the assembly
      // easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}
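// Example (hypothetical input, for illustration only): a BUILD_VECTOR
// (a, 1.0f, 0.0f, a) compacts to (a, undef, undef, undef) with RemapSwizzle
// = {1 -> 5 (SEL_1), 2 -> 4 (SEL_0), 3 -> 0 (reuse of the first lane)},
// so only one register lane remains live.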
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}
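// Example (hypothetical input, for illustration only): for a BUILD_VECTOR
// (extract_elt(v,1), extract_elt(v,0), c, d), neither extract sits in its
// source lane, so the first pass marks nothing unmovable and the second
// pass swaps lanes 0 and 1, yielding (extract_elt(v,0), extract_elt(v,1),
// c, d) with RemapSwizzle[0] = 1 and RemapSwizzle[1] = 0.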
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32), // False
                       SelectCC.getOperand(4)); // CC
  }

  // insert_vector_elt (build_vector elt0, ..., eltN), NewEltIdx, idx
  // => build_vector elt0, ..., NewEltIdx, ..., eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can
    // essentially be converted to a BUILD_VECTOR). Fill in the Ops vector
    // with the vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  }
  // extract_vector_elt (build_vector) nodes generated by custom lowering
  // also need to be custom combined.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }

  case ISD::SELECT_CC: {
    // Try common optimizations
    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    if (Ret.getNode())
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    // selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    // selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18)
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs);
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values.
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
              dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst
              = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
      assert(C);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}
/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
                                      AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
                              Node->getVTList(), Ops);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}
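// Illustration of the folding above (hypothetical selected nodes, based on
// FoldOperand's cases rather than a specific test): for a node such as
// (MUL_IEEE (FNEG_R600 x), y), the fold rewrites the first source to x and
// sets its src0_neg modifier to 1. Likewise, a (MOV_IMM_F32 1.0) source is
// replaced by the inline constant register AMDGPU::ONE, while a value with
// no inline encoding such as 2.5f goes through AMDGPU::ALU_LITERAL_X with
// its bit pattern stored in the literal operand, provided that slot is
// still free.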