1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for R600 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "R600ISelLowering.h" 16 #include "AMDGPUFrameLowering.h" 17 #include "AMDGPUIntrinsicInfo.h" 18 #include "AMDGPUSubtarget.h" 19 #include "R600Defines.h" 20 #include "R600InstrInfo.h" 21 #include "R600MachineFunctionInfo.h" 22 #include "llvm/Analysis/ValueTracking.h" 23 #include "llvm/CodeGen/CallingConvLower.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineRegisterInfo.h" 27 #include "llvm/CodeGen/SelectionDAG.h" 28 #include "llvm/IR/Argument.h" 29 #include "llvm/IR/Function.h" 30 31 using namespace llvm; 32 33 R600TargetLowering::R600TargetLowering(TargetMachine &TM, 34 const AMDGPUSubtarget &STI) 35 : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { 36 addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); 37 addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); 38 addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); 39 addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); 40 addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); 41 addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); 42 43 computeRegisterProperties(STI.getRegisterInfo()); 44 45 // Set condition code actions 46 setCondCodeAction(ISD::SETO, MVT::f32, Expand); 47 setCondCodeAction(ISD::SETUO, MVT::f32, Expand); 48 setCondCodeAction(ISD::SETLT, MVT::f32, Expand); 49 setCondCodeAction(ISD::SETLE, MVT::f32, Expand); 50 setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); 51 setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); 52 setCondCodeAction(ISD::SETONE, MVT::f32, Expand); 53 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); 54 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); 55 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); 56 setCondCodeAction(ISD::SETULT, MVT::f32, Expand); 57 setCondCodeAction(ISD::SETULE, MVT::f32, Expand); 58 59 setCondCodeAction(ISD::SETLE, MVT::i32, Expand); 60 setCondCodeAction(ISD::SETLT, MVT::i32, Expand); 61 setCondCodeAction(ISD::SETULE, MVT::i32, Expand); 62 setCondCodeAction(ISD::SETULT, MVT::i32, Expand); 63 64 setOperationAction(ISD::FCOS, MVT::f32, Custom); 65 setOperationAction(ISD::FSIN, MVT::f32, Custom); 66 67 setOperationAction(ISD::SETCC, MVT::v4i32, Expand); 68 setOperationAction(ISD::SETCC, MVT::v2i32, Expand); 69 70 setOperationAction(ISD::BR_CC, MVT::i32, Expand); 71 setOperationAction(ISD::BR_CC, MVT::f32, Expand); 72 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 73 74 setOperationAction(ISD::FSUB, MVT::f32, Expand); 75 76 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 77 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 78 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); 79 80 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 81 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 82 83 setOperationAction(ISD::SETCC, MVT::i32, Expand); 84 setOperationAction(ISD::SETCC, MVT::f32, Expand); 85 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); 86 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 87 
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 88 89 setOperationAction(ISD::SELECT, MVT::i32, Expand); 90 setOperationAction(ISD::SELECT, MVT::f32, Expand); 91 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 92 setOperationAction(ISD::SELECT, MVT::v4i32, Expand); 93 94 // ADD, SUB overflow. 95 // TODO: turn these into Legal? 96 if (Subtarget->hasCARRY()) 97 setOperationAction(ISD::UADDO, MVT::i32, Custom); 98 99 if (Subtarget->hasBORROW()) 100 setOperationAction(ISD::USUBO, MVT::i32, Custom); 101 102 // Expand sign extension of vectors 103 if (!Subtarget->hasBFE()) 104 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 105 106 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); 107 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); 108 109 if (!Subtarget->hasBFE()) 110 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 111 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); 112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); 113 114 if (!Subtarget->hasBFE()) 115 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 116 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); 117 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); 118 119 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 120 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); 121 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); 122 123 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); 124 125 126 // Legalize loads and stores to the private address space. 127 setOperationAction(ISD::LOAD, MVT::i32, Custom); 128 setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 129 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 130 131 // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address 132 // spaces, so it is custom lowered to handle those where it isn't. 
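  // In practice this means, for example, that an i32 extending load whose
  // memory type is i8 or i16 stays Custom so the custom load lowering can
  // expand it for address spaces without native support, while i1 sources
  // are promoted first.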
133 for (MVT VT : MVT::integer_valuetypes()) { 134 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 135 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); 136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); 137 138 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 139 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); 140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); 141 142 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 143 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); 144 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); 145 } 146 147 setOperationAction(ISD::STORE, MVT::i8, Custom); 148 setOperationAction(ISD::STORE, MVT::i32, Custom); 149 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 150 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 151 setTruncStoreAction(MVT::i32, MVT::i8, Custom); 152 setTruncStoreAction(MVT::i32, MVT::i16, Custom); 153 154 setOperationAction(ISD::LOAD, MVT::i32, Custom); 155 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 156 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 157 158 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); 159 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); 160 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 161 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 162 163 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); 164 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); 165 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 166 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 167 168 setTargetDAGCombine(ISD::FP_ROUND); 169 setTargetDAGCombine(ISD::FP_TO_SINT); 170 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 171 setTargetDAGCombine(ISD::SELECT_CC); 172 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 173 174 // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 175 // to be Legal/Custom in order to avoid library calls. 176 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 177 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 178 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 179 180 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 181 182 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 183 for (MVT VT : ScalarIntVTs) { 184 setOperationAction(ISD::ADDC, VT, Expand); 185 setOperationAction(ISD::SUBC, VT, Expand); 186 setOperationAction(ISD::ADDE, VT, Expand); 187 setOperationAction(ISD::SUBE, VT, Expand); 188 } 189 190 setSchedulingPreference(Sched::Source); 191 } 192 193 static inline bool isEOP(MachineBasicBlock::iterator I) { 194 return std::next(I)->getOpcode() == AMDGPU::RETURN; 195 } 196 197 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( 198 MachineInstr * MI, MachineBasicBlock * BB) const { 199 MachineFunction * MF = BB->getParent(); 200 MachineRegisterInfo &MRI = MF->getRegInfo(); 201 MachineBasicBlock::iterator I = *MI; 202 const R600InstrInfo *TII = 203 static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); 204 205 switch (MI->getOpcode()) { 206 default: 207 // Replace LDS_*_RET instruction that don't have any uses with the 208 // equivalent LDS_*_NORET instruction. 209 if (TII->isLDSRetInstr(MI->getOpcode())) { 210 int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); 211 assert(DstIdx != -1); 212 MachineInstrBuilder NewMI; 213 // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add 214 // LDS_1A2D support and remove this special case. 
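    // The _RET form is kept as-is when its result register is still used,
    // and LDS_CMPST_RET is always kept, presumably because getLDSNoRetOp
    // only understands LDS_1A1D ops (see the FIXME above).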
215 if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || 216 MI->getOpcode() == AMDGPU::LDS_CMPST_RET) 217 return BB; 218 219 NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), 220 TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); 221 for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { 222 NewMI.addOperand(MI->getOperand(i)); 223 } 224 } else { 225 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 226 } 227 break; 228 case AMDGPU::CLAMP_R600: { 229 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 230 AMDGPU::MOV, 231 MI->getOperand(0).getReg(), 232 MI->getOperand(1).getReg()); 233 TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); 234 break; 235 } 236 237 case AMDGPU::FABS_R600: { 238 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 239 AMDGPU::MOV, 240 MI->getOperand(0).getReg(), 241 MI->getOperand(1).getReg()); 242 TII->addFlag(NewMI, 0, MO_FLAG_ABS); 243 break; 244 } 245 246 case AMDGPU::FNEG_R600: { 247 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 248 AMDGPU::MOV, 249 MI->getOperand(0).getReg(), 250 MI->getOperand(1).getReg()); 251 TII->addFlag(NewMI, 0, MO_FLAG_NEG); 252 break; 253 } 254 255 case AMDGPU::MASK_WRITE: { 256 unsigned maskedRegister = MI->getOperand(0).getReg(); 257 assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); 258 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); 259 TII->addFlag(defInstr, 0, MO_FLAG_MASK); 260 break; 261 } 262 263 case AMDGPU::MOV_IMM_F32: 264 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), 265 MI->getOperand(1).getFPImm()->getValueAPF() 266 .bitcastToAPInt().getZExtValue()); 267 break; 268 case AMDGPU::MOV_IMM_I32: 269 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), 270 MI->getOperand(1).getImm()); 271 break; 272 case AMDGPU::CONST_COPY: { 273 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, 274 MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); 275 TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, 276 MI->getOperand(1).getImm()); 277 break; 278 } 279 280 case AMDGPU::RAT_WRITE_CACHELESS_32_eg: 281 case AMDGPU::RAT_WRITE_CACHELESS_64_eg: 282 case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { 283 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 284 .addOperand(MI->getOperand(0)) 285 .addOperand(MI->getOperand(1)) 286 .addImm(isEOP(I)); // Set End of program bit 287 break; 288 } 289 case AMDGPU::RAT_STORE_TYPED_eg: { 290 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 291 .addOperand(MI->getOperand(0)) 292 .addOperand(MI->getOperand(1)) 293 .addOperand(MI->getOperand(2)) 294 .addImm(isEOP(I)); // Set End of program bit 295 break; 296 } 297 298 case AMDGPU::TXD: { 299 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 300 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 301 MachineOperand &RID = MI->getOperand(4); 302 MachineOperand &SID = MI->getOperand(5); 303 unsigned TextureId = MI->getOperand(6).getImm(); 304 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; 305 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; 306 307 switch (TextureId) { 308 case 5: // Rect 309 CTX = CTY = 0; 310 break; 311 case 6: // Shadow1D 312 SrcW = SrcZ; 313 break; 314 case 7: // Shadow2D 315 SrcW = SrcZ; 316 break; 317 case 8: // ShadowRect 318 CTX = CTY = 0; 319 SrcW = SrcZ; 320 break; 321 case 9: // 1DArray 322 SrcZ = SrcY; 323 CTZ = 0; 324 break; 325 case 10: // 2DArray 326 CTZ = 0; 327 break; 328 case 11: // Shadow1DArray 329 SrcZ = SrcY; 330 CTZ = 0; 331 break; 332 case 12: // Shadow2DArray 333 CTZ = 0; 334 
break; 335 } 336 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) 337 .addOperand(MI->getOperand(3)) 338 .addImm(SrcX) 339 .addImm(SrcY) 340 .addImm(SrcZ) 341 .addImm(SrcW) 342 .addImm(0) 343 .addImm(0) 344 .addImm(0) 345 .addImm(0) 346 .addImm(1) 347 .addImm(2) 348 .addImm(3) 349 .addOperand(RID) 350 .addOperand(SID) 351 .addImm(CTX) 352 .addImm(CTY) 353 .addImm(CTZ) 354 .addImm(CTW); 355 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 356 .addOperand(MI->getOperand(2)) 357 .addImm(SrcX) 358 .addImm(SrcY) 359 .addImm(SrcZ) 360 .addImm(SrcW) 361 .addImm(0) 362 .addImm(0) 363 .addImm(0) 364 .addImm(0) 365 .addImm(1) 366 .addImm(2) 367 .addImm(3) 368 .addOperand(RID) 369 .addOperand(SID) 370 .addImm(CTX) 371 .addImm(CTY) 372 .addImm(CTZ) 373 .addImm(CTW); 374 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) 375 .addOperand(MI->getOperand(0)) 376 .addOperand(MI->getOperand(1)) 377 .addImm(SrcX) 378 .addImm(SrcY) 379 .addImm(SrcZ) 380 .addImm(SrcW) 381 .addImm(0) 382 .addImm(0) 383 .addImm(0) 384 .addImm(0) 385 .addImm(1) 386 .addImm(2) 387 .addImm(3) 388 .addOperand(RID) 389 .addOperand(SID) 390 .addImm(CTX) 391 .addImm(CTY) 392 .addImm(CTZ) 393 .addImm(CTW) 394 .addReg(T0, RegState::Implicit) 395 .addReg(T1, RegState::Implicit); 396 break; 397 } 398 399 case AMDGPU::TXD_SHADOW: { 400 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 401 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 402 MachineOperand &RID = MI->getOperand(4); 403 MachineOperand &SID = MI->getOperand(5); 404 unsigned TextureId = MI->getOperand(6).getImm(); 405 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; 406 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; 407 408 switch (TextureId) { 409 case 5: // Rect 410 CTX = CTY = 0; 411 break; 412 case 6: // Shadow1D 413 SrcW = SrcZ; 414 break; 415 case 7: // Shadow2D 416 SrcW = SrcZ; 417 break; 418 case 8: // ShadowRect 419 CTX = CTY = 0; 420 SrcW = SrcZ; 421 break; 422 case 9: // 1DArray 423 SrcZ = SrcY; 424 CTZ = 0; 425 break; 426 case 10: // 2DArray 427 CTZ = 0; 428 break; 429 case 11: // Shadow1DArray 430 SrcZ = SrcY; 431 CTZ = 0; 432 break; 433 case 12: // Shadow2DArray 434 CTZ = 0; 435 break; 436 } 437 438 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) 439 .addOperand(MI->getOperand(3)) 440 .addImm(SrcX) 441 .addImm(SrcY) 442 .addImm(SrcZ) 443 .addImm(SrcW) 444 .addImm(0) 445 .addImm(0) 446 .addImm(0) 447 .addImm(0) 448 .addImm(1) 449 .addImm(2) 450 .addImm(3) 451 .addOperand(RID) 452 .addOperand(SID) 453 .addImm(CTX) 454 .addImm(CTY) 455 .addImm(CTZ) 456 .addImm(CTW); 457 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 458 .addOperand(MI->getOperand(2)) 459 .addImm(SrcX) 460 .addImm(SrcY) 461 .addImm(SrcZ) 462 .addImm(SrcW) 463 .addImm(0) 464 .addImm(0) 465 .addImm(0) 466 .addImm(0) 467 .addImm(1) 468 .addImm(2) 469 .addImm(3) 470 .addOperand(RID) 471 .addOperand(SID) 472 .addImm(CTX) 473 .addImm(CTY) 474 .addImm(CTZ) 475 .addImm(CTW); 476 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) 477 .addOperand(MI->getOperand(0)) 478 .addOperand(MI->getOperand(1)) 479 .addImm(SrcX) 480 .addImm(SrcY) 481 .addImm(SrcZ) 482 .addImm(SrcW) 483 .addImm(0) 484 .addImm(0) 485 .addImm(0) 486 .addImm(0) 487 .addImm(1) 488 .addImm(2) 489 .addImm(3) 490 .addOperand(RID) 491 .addOperand(SID) 492 .addImm(CTX) 493 .addImm(CTY) 494 .addImm(CTZ) 495 .addImm(CTW) 496 .addReg(T0, 
RegState::Implicit) 497 .addReg(T1, RegState::Implicit); 498 break; 499 } 500 501 case AMDGPU::BRANCH: 502 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) 503 .addOperand(MI->getOperand(0)); 504 break; 505 506 case AMDGPU::BRANCH_COND_f32: { 507 MachineInstr *NewMI = 508 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), 509 AMDGPU::PREDICATE_BIT) 510 .addOperand(MI->getOperand(1)) 511 .addImm(OPCODE_IS_NOT_ZERO) 512 .addImm(0); // Flags 513 TII->addFlag(NewMI, 0, MO_FLAG_PUSH); 514 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) 515 .addOperand(MI->getOperand(0)) 516 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); 517 break; 518 } 519 520 case AMDGPU::BRANCH_COND_i32: { 521 MachineInstr *NewMI = 522 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), 523 AMDGPU::PREDICATE_BIT) 524 .addOperand(MI->getOperand(1)) 525 .addImm(OPCODE_IS_NOT_ZERO_INT) 526 .addImm(0); // Flags 527 TII->addFlag(NewMI, 0, MO_FLAG_PUSH); 528 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) 529 .addOperand(MI->getOperand(0)) 530 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); 531 break; 532 } 533 534 case AMDGPU::EG_ExportSwz: 535 case AMDGPU::R600_ExportSwz: { 536 // Instruction is left unmodified if its not the last one of its type 537 bool isLastInstructionOfItsType = true; 538 unsigned InstExportType = MI->getOperand(1).getImm(); 539 for (MachineBasicBlock::iterator NextExportInst = std::next(I), 540 EndBlock = BB->end(); NextExportInst != EndBlock; 541 NextExportInst = std::next(NextExportInst)) { 542 if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || 543 NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { 544 unsigned CurrentInstExportType = NextExportInst->getOperand(1) 545 .getImm(); 546 if (CurrentInstExportType == InstExportType) { 547 isLastInstructionOfItsType = false; 548 break; 549 } 550 } 551 } 552 bool EOP = isEOP(I); 553 if (!EOP && !isLastInstructionOfItsType) 554 return BB; 555 unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; 556 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 557 .addOperand(MI->getOperand(0)) 558 .addOperand(MI->getOperand(1)) 559 .addOperand(MI->getOperand(2)) 560 .addOperand(MI->getOperand(3)) 561 .addOperand(MI->getOperand(4)) 562 .addOperand(MI->getOperand(5)) 563 .addOperand(MI->getOperand(6)) 564 .addImm(CfInst) 565 .addImm(EOP); 566 break; 567 } 568 case AMDGPU::RETURN: { 569 // RETURN instructions must have the live-out registers as implicit uses, 570 // otherwise they appear dead. 
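    // These are the registers recorded in R600MachineFunctionInfo::LiveOuts,
    // which is populated when the AMDGPU_store_output intrinsic is lowered in
    // LowerOperation.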
571 R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); 572 MachineInstrBuilder MIB(*MF, MI); 573 for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) 574 MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); 575 return BB; 576 } 577 } 578 579 MI->eraseFromParent(); 580 return BB; 581 } 582 583 //===----------------------------------------------------------------------===// 584 // Custom DAG Lowering Operations 585 //===----------------------------------------------------------------------===// 586 587 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 588 MachineFunction &MF = DAG.getMachineFunction(); 589 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 590 switch (Op.getOpcode()) { 591 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 592 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 593 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 594 case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); 595 case ISD::SRA_PARTS: 596 case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); 597 case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); 598 case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); 599 case ISD::FCOS: 600 case ISD::FSIN: return LowerTrig(Op, DAG); 601 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 602 case ISD::STORE: return LowerSTORE(Op, DAG); 603 case ISD::LOAD: { 604 SDValue Result = LowerLOAD(Op, DAG); 605 assert((!Result.getNode() || 606 Result.getNode()->getNumValues() == 2) && 607 "Load should return a value and a chain"); 608 return Result; 609 } 610 611 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 612 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); 613 case ISD::INTRINSIC_VOID: { 614 SDValue Chain = Op.getOperand(0); 615 unsigned IntrinsicID = 616 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 617 switch (IntrinsicID) { 618 case AMDGPUIntrinsic::AMDGPU_store_output: { 619 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 620 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); 621 MFI->LiveOuts.push_back(Reg); 622 return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); 623 } 624 case AMDGPUIntrinsic::R600_store_swizzle: { 625 SDLoc DL(Op); 626 const SDValue Args[8] = { 627 Chain, 628 Op.getOperand(2), // Export Value 629 Op.getOperand(3), // ArrayBase 630 Op.getOperand(4), // Type 631 DAG.getConstant(0, DL, MVT::i32), // SWZ_X 632 DAG.getConstant(1, DL, MVT::i32), // SWZ_Y 633 DAG.getConstant(2, DL, MVT::i32), // SWZ_Z 634 DAG.getConstant(3, DL, MVT::i32) // SWZ_W 635 }; 636 return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); 637 } 638 639 // default for switch(IntrinsicID) 640 default: break; 641 } 642 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) 643 break; 644 } 645 case ISD::INTRINSIC_WO_CHAIN: { 646 unsigned IntrinsicID = 647 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 648 EVT VT = Op.getValueType(); 649 SDLoc DL(Op); 650 switch(IntrinsicID) { 651 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 652 case AMDGPUIntrinsic::R600_load_input: { 653 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 654 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); 655 MachineFunction &MF = DAG.getMachineFunction(); 656 MachineRegisterInfo &MRI = MF.getRegInfo(); 657 MRI.addLiveIn(Reg); 658 return 
DAG.getCopyFromReg(DAG.getEntryNode(), 659 SDLoc(DAG.getEntryNode()), Reg, VT); 660 } 661 662 case AMDGPUIntrinsic::R600_interp_input: { 663 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 664 int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); 665 MachineSDNode *interp; 666 if (ijb < 0) { 667 const R600InstrInfo *TII = 668 static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); 669 interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, 670 MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); 671 return DAG.getTargetExtractSubreg( 672 TII->getRegisterInfo().getSubRegFromChannel(slot % 4), 673 DL, MVT::f32, SDValue(interp, 0)); 674 } 675 MachineFunction &MF = DAG.getMachineFunction(); 676 MachineRegisterInfo &MRI = MF.getRegInfo(); 677 unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); 678 unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); 679 MRI.addLiveIn(RegisterI); 680 MRI.addLiveIn(RegisterJ); 681 SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), 682 SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); 683 SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), 684 SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); 685 686 if (slot % 4 < 2) 687 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, 688 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), 689 RegisterJNode, RegisterINode); 690 else 691 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, 692 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), 693 RegisterJNode, RegisterINode); 694 return SDValue(interp, slot % 2); 695 } 696 case AMDGPUIntrinsic::R600_interp_xy: 697 case AMDGPUIntrinsic::R600_interp_zw: { 698 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 699 MachineSDNode *interp; 700 SDValue RegisterINode = Op.getOperand(2); 701 SDValue RegisterJNode = Op.getOperand(3); 702 703 if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) 704 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, 705 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), 706 RegisterJNode, RegisterINode); 707 else 708 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, 709 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), 710 RegisterJNode, RegisterINode); 711 return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, 712 SDValue(interp, 0), SDValue(interp, 1)); 713 } 714 case AMDGPUIntrinsic::R600_tex: 715 case AMDGPUIntrinsic::R600_texc: 716 case AMDGPUIntrinsic::R600_txl: 717 case AMDGPUIntrinsic::R600_txlc: 718 case AMDGPUIntrinsic::R600_txb: 719 case AMDGPUIntrinsic::R600_txbc: 720 case AMDGPUIntrinsic::R600_txf: 721 case AMDGPUIntrinsic::R600_txq: 722 case AMDGPUIntrinsic::R600_ddx: 723 case AMDGPUIntrinsic::R600_ddy: 724 case AMDGPUIntrinsic::R600_ldptr: { 725 unsigned TextureOp; 726 switch (IntrinsicID) { 727 case AMDGPUIntrinsic::R600_tex: 728 TextureOp = 0; 729 break; 730 case AMDGPUIntrinsic::R600_texc: 731 TextureOp = 1; 732 break; 733 case AMDGPUIntrinsic::R600_txl: 734 TextureOp = 2; 735 break; 736 case AMDGPUIntrinsic::R600_txlc: 737 TextureOp = 3; 738 break; 739 case AMDGPUIntrinsic::R600_txb: 740 TextureOp = 4; 741 break; 742 case AMDGPUIntrinsic::R600_txbc: 743 TextureOp = 5; 744 break; 745 case AMDGPUIntrinsic::R600_txf: 746 TextureOp = 6; 747 break; 748 case AMDGPUIntrinsic::R600_txq: 749 TextureOp = 7; 750 break; 751 case AMDGPUIntrinsic::R600_ddx: 752 TextureOp = 8; 753 break; 754 case AMDGPUIntrinsic::R600_ddy: 755 TextureOp = 9; 756 break; 757 
case AMDGPUIntrinsic::R600_ldptr: 758 TextureOp = 10; 759 break; 760 default: 761 llvm_unreachable("Unknow Texture Operation"); 762 } 763 764 SDValue TexArgs[19] = { 765 DAG.getConstant(TextureOp, DL, MVT::i32), 766 Op.getOperand(1), 767 DAG.getConstant(0, DL, MVT::i32), 768 DAG.getConstant(1, DL, MVT::i32), 769 DAG.getConstant(2, DL, MVT::i32), 770 DAG.getConstant(3, DL, MVT::i32), 771 Op.getOperand(2), 772 Op.getOperand(3), 773 Op.getOperand(4), 774 DAG.getConstant(0, DL, MVT::i32), 775 DAG.getConstant(1, DL, MVT::i32), 776 DAG.getConstant(2, DL, MVT::i32), 777 DAG.getConstant(3, DL, MVT::i32), 778 Op.getOperand(5), 779 Op.getOperand(6), 780 Op.getOperand(7), 781 Op.getOperand(8), 782 Op.getOperand(9), 783 Op.getOperand(10) 784 }; 785 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); 786 } 787 case AMDGPUIntrinsic::AMDGPU_dp4: { 788 SDValue Args[8] = { 789 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 790 DAG.getConstant(0, DL, MVT::i32)), 791 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 792 DAG.getConstant(0, DL, MVT::i32)), 793 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 794 DAG.getConstant(1, DL, MVT::i32)), 795 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 796 DAG.getConstant(1, DL, MVT::i32)), 797 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 798 DAG.getConstant(2, DL, MVT::i32)), 799 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 800 DAG.getConstant(2, DL, MVT::i32)), 801 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 802 DAG.getConstant(3, DL, MVT::i32)), 803 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 804 DAG.getConstant(3, DL, MVT::i32)) 805 }; 806 return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); 807 } 808 809 case Intrinsic::r600_read_ngroups_x: 810 return LowerImplicitParameter(DAG, VT, DL, 0); 811 case Intrinsic::r600_read_ngroups_y: 812 return LowerImplicitParameter(DAG, VT, DL, 1); 813 case Intrinsic::r600_read_ngroups_z: 814 return LowerImplicitParameter(DAG, VT, DL, 2); 815 case Intrinsic::r600_read_global_size_x: 816 return LowerImplicitParameter(DAG, VT, DL, 3); 817 case Intrinsic::r600_read_global_size_y: 818 return LowerImplicitParameter(DAG, VT, DL, 4); 819 case Intrinsic::r600_read_global_size_z: 820 return LowerImplicitParameter(DAG, VT, DL, 5); 821 case Intrinsic::r600_read_local_size_x: 822 return LowerImplicitParameter(DAG, VT, DL, 6); 823 case Intrinsic::r600_read_local_size_y: 824 return LowerImplicitParameter(DAG, VT, DL, 7); 825 case Intrinsic::r600_read_local_size_z: 826 return LowerImplicitParameter(DAG, VT, DL, 8); 827 828 case Intrinsic::AMDGPU_read_workdim: { 829 uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); 830 return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); 831 } 832 833 case Intrinsic::r600_read_tgid_x: 834 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 835 AMDGPU::T1_X, VT); 836 case Intrinsic::r600_read_tgid_y: 837 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 838 AMDGPU::T1_Y, VT); 839 case Intrinsic::r600_read_tgid_z: 840 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 841 AMDGPU::T1_Z, VT); 842 case Intrinsic::r600_read_tidig_x: 843 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 844 AMDGPU::T0_X, VT); 845 case Intrinsic::r600_read_tidig_y: 846 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 847 AMDGPU::T0_Y, VT); 848 case 
Intrinsic::r600_read_tidig_z: 849 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 850 AMDGPU::T0_Z, VT); 851 case Intrinsic::AMDGPU_rsq: 852 // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. 853 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 854 855 case AMDGPUIntrinsic::AMDGPU_fract: 856 case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 857 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 858 } 859 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) 860 break; 861 } 862 } // end switch(Op.getOpcode()) 863 return SDValue(); 864 } 865 866 void R600TargetLowering::ReplaceNodeResults(SDNode *N, 867 SmallVectorImpl<SDValue> &Results, 868 SelectionDAG &DAG) const { 869 switch (N->getOpcode()) { 870 default: 871 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); 872 return; 873 case ISD::FP_TO_UINT: 874 if (N->getValueType(0) == MVT::i1) { 875 Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); 876 return; 877 } 878 // Fall-through. Since we don't care about out of bounds values 879 // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint 880 // considers some extra cases which are not necessary here. 881 case ISD::FP_TO_SINT: { 882 SDValue Result; 883 if (expandFP_TO_SINT(N, Result, DAG)) 884 Results.push_back(Result); 885 return; 886 } 887 case ISD::SDIVREM: { 888 SDValue Op = SDValue(N, 1); 889 SDValue RES = LowerSDIVREM(Op, DAG); 890 Results.push_back(RES); 891 Results.push_back(RES.getValue(1)); 892 break; 893 } 894 case ISD::UDIVREM: { 895 SDValue Op = SDValue(N, 0); 896 LowerUDIVREM64(Op, DAG, Results); 897 break; 898 } 899 } 900 } 901 902 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, 903 SDValue Vector) const { 904 905 SDLoc DL(Vector); 906 EVT VecVT = Vector.getValueType(); 907 EVT EltVT = VecVT.getVectorElementType(); 908 SmallVector<SDValue, 8> Args; 909 910 for (unsigned i = 0, e = VecVT.getVectorNumElements(); 911 i != e; ++i) { 912 Args.push_back(DAG.getNode( 913 ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, 914 DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout())))); 915 } 916 917 return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); 918 } 919 920 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 921 SelectionDAG &DAG) const { 922 923 SDLoc DL(Op); 924 SDValue Vector = Op.getOperand(0); 925 SDValue Index = Op.getOperand(1); 926 927 if (isa<ConstantSDNode>(Index) || 928 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 929 return Op; 930 931 Vector = vectorToVerticalVector(DAG, Vector); 932 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), 933 Vector, Index); 934 } 935 936 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 937 SelectionDAG &DAG) const { 938 SDLoc DL(Op); 939 SDValue Vector = Op.getOperand(0); 940 SDValue Value = Op.getOperand(1); 941 SDValue Index = Op.getOperand(2); 942 943 if (isa<ConstantSDNode>(Index) || 944 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 945 return Op; 946 947 Vector = vectorToVerticalVector(DAG, Vector); 948 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), 949 Vector, Value, Index); 950 return vectorToVerticalVector(DAG, Insert); 951 } 952 953 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 954 // On hw >= R700, COS/SIN input must be between -1. and 1. 
955 // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) 956 EVT VT = Op.getValueType(); 957 SDValue Arg = Op.getOperand(0); 958 SDLoc DL(Op); 959 960 // TODO: Should this propagate fast-math-flags? 961 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 962 DAG.getNode(ISD::FADD, DL, VT, 963 DAG.getNode(ISD::FMUL, DL, VT, Arg, 964 DAG.getConstantFP(0.15915494309, DL, MVT::f32)), 965 DAG.getConstantFP(0.5, DL, MVT::f32))); 966 unsigned TrigNode; 967 switch (Op.getOpcode()) { 968 case ISD::FCOS: 969 TrigNode = AMDGPUISD::COS_HW; 970 break; 971 case ISD::FSIN: 972 TrigNode = AMDGPUISD::SIN_HW; 973 break; 974 default: 975 llvm_unreachable("Wrong trig opcode"); 976 } 977 SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, 978 DAG.getNode(ISD::FADD, DL, VT, FractPart, 979 DAG.getConstantFP(-0.5, DL, MVT::f32))); 980 if (Gen >= AMDGPUSubtarget::R700) 981 return TrigVal; 982 // On R600 hw, COS/SIN input must be between -Pi and Pi. 983 return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, 984 DAG.getConstantFP(3.14159265359, DL, MVT::f32)); 985 } 986 987 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { 988 SDLoc DL(Op); 989 EVT VT = Op.getValueType(); 990 991 SDValue Lo = Op.getOperand(0); 992 SDValue Hi = Op.getOperand(1); 993 SDValue Shift = Op.getOperand(2); 994 SDValue Zero = DAG.getConstant(0, DL, VT); 995 SDValue One = DAG.getConstant(1, DL, VT); 996 997 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); 998 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); 999 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); 1000 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); 1001 1002 // The dance around Width1 is necessary for 0 special case. 1003 // Without it the CompShift might be 32, producing incorrect results in 1004 // Overflow. So we do the shift in two steps, the alternative is to 1005 // add a conditional to filter the special case. 1006 1007 SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); 1008 Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); 1009 1010 SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); 1011 HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); 1012 SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); 1013 1014 SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); 1015 SDValue LoBig = Zero; 1016 1017 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); 1018 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); 1019 1020 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); 1021 } 1022 1023 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { 1024 SDLoc DL(Op); 1025 EVT VT = Op.getValueType(); 1026 1027 SDValue Lo = Op.getOperand(0); 1028 SDValue Hi = Op.getOperand(1); 1029 SDValue Shift = Op.getOperand(2); 1030 SDValue Zero = DAG.getConstant(0, DL, VT); 1031 SDValue One = DAG.getConstant(1, DL, VT); 1032 1033 const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; 1034 1035 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); 1036 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); 1037 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); 1038 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); 1039 1040 // The dance around Width1 is necessary for 0 special case. 1041 // Without it the CompShift might be 32, producing incorrect results in 1042 // Overflow. 
So we do the shift in two steps, the alternative is to 1043 // add a conditional to filter the special case. 1044 1045 SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); 1046 Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); 1047 1048 SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); 1049 SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); 1050 LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); 1051 1052 SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); 1053 SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; 1054 1055 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); 1056 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); 1057 1058 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); 1059 } 1060 1061 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, 1062 unsigned mainop, unsigned ovf) const { 1063 SDLoc DL(Op); 1064 EVT VT = Op.getValueType(); 1065 1066 SDValue Lo = Op.getOperand(0); 1067 SDValue Hi = Op.getOperand(1); 1068 1069 SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); 1070 // Extend sign. 1071 OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, 1072 DAG.getValueType(MVT::i1)); 1073 1074 SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); 1075 1076 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); 1077 } 1078 1079 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { 1080 SDLoc DL(Op); 1081 return DAG.getNode( 1082 ISD::SETCC, 1083 DL, 1084 MVT::i1, 1085 Op, DAG.getConstantFP(0.0f, DL, MVT::f32), 1086 DAG.getCondCode(ISD::SETNE) 1087 ); 1088 } 1089 1090 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, 1091 SDLoc DL, 1092 unsigned DwordOffset) const { 1093 unsigned ByteOffset = DwordOffset * 4; 1094 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), 1095 AMDGPUAS::CONSTANT_BUFFER_0); 1096 1097 // We shouldn't be using an offset wider than 16-bits for implicit parameters. 1098 assert(isInt<16>(ByteOffset)); 1099 1100 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 1101 DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR 1102 MachinePointerInfo(ConstantPointerNull::get(PtrType)), 1103 false, false, false, 0); 1104 } 1105 1106 bool R600TargetLowering::isZero(SDValue Op) const { 1107 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { 1108 return Cst->isNullValue(); 1109 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){ 1110 return CstFP->isZero(); 1111 } else { 1112 return false; 1113 } 1114 } 1115 1116 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 1117 SDLoc DL(Op); 1118 EVT VT = Op.getValueType(); 1119 1120 SDValue LHS = Op.getOperand(0); 1121 SDValue RHS = Op.getOperand(1); 1122 SDValue True = Op.getOperand(2); 1123 SDValue False = Op.getOperand(3); 1124 SDValue CC = Op.getOperand(4); 1125 SDValue Temp; 1126 1127 if (VT == MVT::f32) { 1128 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); 1129 SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); 1130 if (MinMax) 1131 return MinMax; 1132 } 1133 1134 // LHS and RHS are guaranteed to be the same value type 1135 EVT CompareVT = LHS.getValueType(); 1136 1137 // Check if we can lower this to a native operation. 

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1, 0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
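  //
  // The fallback below rewrites
  //   select_cc lhs, rhs, true, false, cc
  // as
  //   cond   = select_cc lhs, rhs, hw_true, hw_false, cc
  //   result = select_cc cond, hw_false, true, false, setne
  // so that each resulting SELECT_CC only uses operand patterns the SET*/CND*
  // instructions can match (hw_true/hw_false are the -1/0 or 1.0f/0.0f values
  // chosen below).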
1235 SDValue HWTrue, HWFalse; 1236 1237 if (CompareVT == MVT::f32) { 1238 HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); 1239 HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); 1240 } else if (CompareVT == MVT::i32) { 1241 HWTrue = DAG.getConstant(-1, DL, CompareVT); 1242 HWFalse = DAG.getConstant(0, DL, CompareVT); 1243 } 1244 else { 1245 llvm_unreachable("Unhandled value type in LowerSELECT_CC"); 1246 } 1247 1248 // Lower this unsupported SELECT_CC into a combination of two supported 1249 // SELECT_CC operations. 1250 SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); 1251 1252 return DAG.getNode(ISD::SELECT_CC, DL, VT, 1253 Cond, HWFalse, 1254 True, False, 1255 DAG.getCondCode(ISD::SETNE)); 1256 } 1257 1258 /// LLVM generates byte-addressed pointers. For indirect addressing, we need to 1259 /// convert these pointers to a register index. Each register holds 1260 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the 1261 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used 1262 /// for indirect addressing. 1263 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, 1264 unsigned StackWidth, 1265 SelectionDAG &DAG) const { 1266 unsigned SRLPad; 1267 switch(StackWidth) { 1268 case 1: 1269 SRLPad = 2; 1270 break; 1271 case 2: 1272 SRLPad = 3; 1273 break; 1274 case 4: 1275 SRLPad = 4; 1276 break; 1277 default: llvm_unreachable("Invalid stack width"); 1278 } 1279 1280 SDLoc DL(Ptr); 1281 return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, 1282 DAG.getConstant(SRLPad, DL, MVT::i32)); 1283 } 1284 1285 void R600TargetLowering::getStackAddress(unsigned StackWidth, 1286 unsigned ElemIdx, 1287 unsigned &Channel, 1288 unsigned &PtrIncr) const { 1289 switch (StackWidth) { 1290 default: 1291 case 1: 1292 Channel = 0; 1293 if (ElemIdx > 0) { 1294 PtrIncr = 1; 1295 } else { 1296 PtrIncr = 0; 1297 } 1298 break; 1299 case 2: 1300 Channel = ElemIdx % 2; 1301 if (ElemIdx == 2) { 1302 PtrIncr = 1; 1303 } else { 1304 PtrIncr = 0; 1305 } 1306 break; 1307 case 4: 1308 Channel = ElemIdx; 1309 PtrIncr = 0; 1310 break; 1311 } 1312 } 1313 1314 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1315 SDLoc DL(Op); 1316 StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 1317 SDValue Chain = Op.getOperand(0); 1318 SDValue Value = Op.getOperand(1); 1319 SDValue Ptr = Op.getOperand(2); 1320 1321 SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 1322 if (Result.getNode()) { 1323 return Result; 1324 } 1325 1326 if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { 1327 if (StoreNode->isTruncatingStore()) { 1328 EVT VT = Value.getValueType(); 1329 assert(VT.bitsLE(MVT::i32)); 1330 EVT MemVT = StoreNode->getMemoryVT(); 1331 SDValue MaskConstant; 1332 if (MemVT == MVT::i8) { 1333 MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); 1334 } else { 1335 assert(MemVT == MVT::i16); 1336 MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); 1337 } 1338 SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, 1339 DAG.getConstant(2, DL, MVT::i32)); 1340 SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, 1341 DAG.getConstant(0x00000003, DL, VT)); 1342 SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); 1343 SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, 1344 DAG.getConstant(3, DL, VT)); 1345 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); 1346 SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); 1347 // 
XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 1348 // vector instead. 1349 SDValue Src[4] = { 1350 ShiftedValue, 1351 DAG.getConstant(0, DL, MVT::i32), 1352 DAG.getConstant(0, DL, MVT::i32), 1353 Mask 1354 }; 1355 SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); 1356 SDValue Args[3] = { Chain, Input, DWordAddr }; 1357 return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, 1358 Op->getVTList(), Args, MemVT, 1359 StoreNode->getMemOperand()); 1360 } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && 1361 Value.getValueType().bitsGE(MVT::i32)) { 1362 // Convert pointer from byte address to dword address. 1363 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), 1364 DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), 1365 Ptr, DAG.getConstant(2, DL, MVT::i32))); 1366 1367 if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { 1368 llvm_unreachable("Truncated and indexed stores not supported yet"); 1369 } else { 1370 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); 1371 } 1372 return Chain; 1373 } 1374 } 1375 1376 EVT ValueVT = Value.getValueType(); 1377 1378 if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { 1379 return SDValue(); 1380 } 1381 1382 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 1383 if (Ret.getNode()) { 1384 return Ret; 1385 } 1386 // Lowering for indirect addressing 1387 1388 const MachineFunction &MF = DAG.getMachineFunction(); 1389 const AMDGPUFrameLowering *TFL = 1390 static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); 1391 unsigned StackWidth = TFL->getStackWidth(MF); 1392 1393 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); 1394 1395 if (ValueVT.isVector()) { 1396 unsigned NumElemVT = ValueVT.getVectorNumElements(); 1397 EVT ElemVT = ValueVT.getVectorElementType(); 1398 SmallVector<SDValue, 4> Stores(NumElemVT); 1399 1400 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " 1401 "vector width in load"); 1402 1403 for (unsigned i = 0; i < NumElemVT; ++i) { 1404 unsigned Channel, PtrIncr; 1405 getStackAddress(StackWidth, i, Channel, PtrIncr); 1406 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, 1407 DAG.getConstant(PtrIncr, DL, MVT::i32)); 1408 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, 1409 Value, DAG.getConstant(i, DL, MVT::i32)); 1410 1411 Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 1412 Chain, Elem, Ptr, 1413 DAG.getTargetConstant(Channel, DL, MVT::i32)); 1414 } 1415 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 1416 } else { 1417 if (ValueVT == MVT::i8) { 1418 Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); 1419 } 1420 Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, 1421 DAG.getTargetConstant(0, DL, MVT::i32)); // Channel 1422 } 1423 1424 return Chain; 1425 } 1426 1427 // return (512 + (kc_bank << 12) 1428 static int 1429 ConstantAddressBlock(unsigned AddressSpace) { 1430 switch (AddressSpace) { 1431 case AMDGPUAS::CONSTANT_BUFFER_0: 1432 return 512; 1433 case AMDGPUAS::CONSTANT_BUFFER_1: 1434 return 512 + 4096; 1435 case AMDGPUAS::CONSTANT_BUFFER_2: 1436 return 512 + 4096 * 2; 1437 case AMDGPUAS::CONSTANT_BUFFER_3: 1438 return 512 + 4096 * 3; 1439 case AMDGPUAS::CONSTANT_BUFFER_4: 1440 return 512 + 4096 * 4; 1441 case AMDGPUAS::CONSTANT_BUFFER_5: 1442 return 512 + 4096 * 5; 1443 case AMDGPUAS::CONSTANT_BUFFER_6: 1444 return 512 + 4096 * 6; 1445 case AMDGPUAS::CONSTANT_BUFFER_7: 1446 return 512 + 4096 * 7; 1447 
case AMDGPUAS::CONSTANT_BUFFER_8: 1448 return 512 + 4096 * 8; 1449 case AMDGPUAS::CONSTANT_BUFFER_9: 1450 return 512 + 4096 * 9; 1451 case AMDGPUAS::CONSTANT_BUFFER_10: 1452 return 512 + 4096 * 10; 1453 case AMDGPUAS::CONSTANT_BUFFER_11: 1454 return 512 + 4096 * 11; 1455 case AMDGPUAS::CONSTANT_BUFFER_12: 1456 return 512 + 4096 * 12; 1457 case AMDGPUAS::CONSTANT_BUFFER_13: 1458 return 512 + 4096 * 13; 1459 case AMDGPUAS::CONSTANT_BUFFER_14: 1460 return 512 + 4096 * 14; 1461 case AMDGPUAS::CONSTANT_BUFFER_15: 1462 return 512 + 4096 * 15; 1463 default: 1464 return -1; 1465 } 1466 } 1467 1468 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const 1469 { 1470 EVT VT = Op.getValueType(); 1471 SDLoc DL(Op); 1472 LoadSDNode *LoadNode = cast<LoadSDNode>(Op); 1473 SDValue Chain = Op.getOperand(0); 1474 SDValue Ptr = Op.getOperand(1); 1475 SDValue LoweredLoad; 1476 1477 if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG)) 1478 return Ret; 1479 1480 // Lower loads constant address space global variable loads 1481 if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && 1482 isa<GlobalVariable>(GetUnderlyingObject( 1483 LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) { 1484 1485 SDValue Ptr = DAG.getZExtOrTrunc( 1486 LoadNode->getBasePtr(), DL, 1487 getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS)); 1488 Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, 1489 DAG.getConstant(2, DL, MVT::i32)); 1490 return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), 1491 LoadNode->getChain(), Ptr, 1492 DAG.getTargetConstant(0, DL, MVT::i32), 1493 Op.getOperand(2)); 1494 } 1495 1496 if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { 1497 SDValue MergedValues[2] = { 1498 ScalarizeVectorLoad(Op, DAG), 1499 Chain 1500 }; 1501 return DAG.getMergeValues(MergedValues, DL); 1502 } 1503 1504 int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); 1505 if (ConstantBlock > -1 && 1506 ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || 1507 (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { 1508 SDValue Result; 1509 if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) || 1510 isa<Constant>(LoadNode->getMemOperand()->getValue()) || 1511 isa<ConstantSDNode>(Ptr)) { 1512 SDValue Slots[4]; 1513 for (unsigned i = 0; i < 4; i++) { 1514 // We want Const position encoded with the following formula : 1515 // (((512 + (kc_bank << 12) + const_index) << 2) + chan) 1516 // const_index is Ptr computed by llvm using an alignment of 16. 
1517 // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and 1518 // then div by 4 at the ISel step 1519 SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 1520 DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); 1521 Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); 1522 } 1523 EVT NewVT = MVT::v4i32; 1524 unsigned NumElements = 4; 1525 if (VT.isVector()) { 1526 NewVT = VT; 1527 NumElements = VT.getVectorNumElements(); 1528 } 1529 Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, 1530 makeArrayRef(Slots, NumElements)); 1531 } else { 1532 // non-constant ptr can't be folded, keeps it as a v4f32 load 1533 Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, 1534 DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, 1535 DAG.getConstant(4, DL, MVT::i32)), 1536 DAG.getConstant(LoadNode->getAddressSpace() - 1537 AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) 1538 ); 1539 } 1540 1541 if (!VT.isVector()) { 1542 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, 1543 DAG.getConstant(0, DL, MVT::i32)); 1544 } 1545 1546 SDValue MergedValues[2] = { 1547 Result, 1548 Chain 1549 }; 1550 return DAG.getMergeValues(MergedValues, DL); 1551 } 1552 1553 // For most operations returning SDValue() will result in the node being 1554 // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we 1555 // need to manually expand loads that may be legal in some address spaces and 1556 // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for 1557 // compute shaders, since the data is sign extended when it is uploaded to the 1558 // buffer. However SEXT loads from other address spaces are not supported, so 1559 // we need to expand them here. 1560 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { 1561 EVT MemVT = LoadNode->getMemoryVT(); 1562 assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); 1563 SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, 1564 LoadNode->getPointerInfo(), MemVT, 1565 LoadNode->isVolatile(), 1566 LoadNode->isNonTemporal(), 1567 LoadNode->isInvariant(), 1568 LoadNode->getAlignment()); 1569 SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, 1570 DAG.getValueType(MemVT)); 1571 1572 SDValue MergedValues[2] = { Res, Chain }; 1573 return DAG.getMergeValues(MergedValues, DL); 1574 } 1575 1576 if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { 1577 return SDValue(); 1578 } 1579 1580 // Lowering for indirect addressing 1581 const MachineFunction &MF = DAG.getMachineFunction(); 1582 const AMDGPUFrameLowering *TFL = 1583 static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); 1584 unsigned StackWidth = TFL->getStackWidth(MF); 1585 1586 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); 1587 1588 if (VT.isVector()) { 1589 unsigned NumElemVT = VT.getVectorNumElements(); 1590 EVT ElemVT = VT.getVectorElementType(); 1591 SDValue Loads[4]; 1592 1593 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " 1594 "vector width in load"); 1595 1596 for (unsigned i = 0; i < NumElemVT; ++i) { 1597 unsigned Channel, PtrIncr; 1598 getStackAddress(StackWidth, i, Channel, PtrIncr); 1599 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, 1600 DAG.getConstant(PtrIncr, DL, MVT::i32)); 1601 Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, 1602 Chain, Ptr, 1603 DAG.getTargetConstant(Channel, DL, MVT::i32), 1604 Op.getOperand(2)); 1605 } 1606 for (unsigned i = NumElemVT; i < 4; ++i) { 1607 Loads[i] = DAG.getUNDEF(ElemVT); 1608 } 
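    // Reassemble the loaded channels (plus any undef padding) into a full
    // 4-element vector; unread channels simply stay undef.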
1609 EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); 1610 LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); 1611 } else { 1612 LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, 1613 Chain, Ptr, 1614 DAG.getTargetConstant(0, DL, MVT::i32), // Channel 1615 Op.getOperand(2)); 1616 } 1617 1618 SDValue Ops[2] = { 1619 LoweredLoad, 1620 Chain 1621 }; 1622 1623 return DAG.getMergeValues(Ops, DL); 1624 } 1625 1626 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 1627 SDValue Chain = Op.getOperand(0); 1628 SDValue Cond = Op.getOperand(1); 1629 SDValue Jump = Op.getOperand(2); 1630 1631 return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), 1632 Chain, Jump, Cond); 1633 } 1634 1635 /// XXX Only kernel functions are supported, so we can assume for now that 1636 /// every function is a kernel function, but in the future we should use 1637 /// separate calling conventions for kernel and non-kernel functions. 1638 SDValue R600TargetLowering::LowerFormalArguments( 1639 SDValue Chain, 1640 CallingConv::ID CallConv, 1641 bool isVarArg, 1642 const SmallVectorImpl<ISD::InputArg> &Ins, 1643 SDLoc DL, SelectionDAG &DAG, 1644 SmallVectorImpl<SDValue> &InVals) const { 1645 SmallVector<CCValAssign, 16> ArgLocs; 1646 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 1647 *DAG.getContext()); 1648 MachineFunction &MF = DAG.getMachineFunction(); 1649 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 1650 1651 SmallVector<ISD::InputArg, 8> LocalIns; 1652 1653 getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); 1654 1655 AnalyzeFormalArguments(CCInfo, LocalIns); 1656 1657 for (unsigned i = 0, e = Ins.size(); i < e; ++i) { 1658 CCValAssign &VA = ArgLocs[i]; 1659 const ISD::InputArg &In = Ins[i]; 1660 EVT VT = In.VT; 1661 EVT MemVT = VA.getLocVT(); 1662 if (!VT.isVector() && MemVT.isVector()) { 1663 // Get load source type if scalarized. 1664 MemVT = MemVT.getVectorElementType(); 1665 } 1666 1667 if (MFI->getShaderType() != ShaderType::COMPUTE) { 1668 unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); 1669 SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); 1670 InVals.push_back(Register); 1671 continue; 1672 } 1673 1674 PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), 1675 AMDGPUAS::CONSTANT_BUFFER_0); 1676 1677 // i64 isn't a legal type, so the register type used ends up as i32, which 1678 // isn't expected here. It attempts to create this sextload, but it ends up 1679 // being invalid. Somehow this seems to work with i64 arguments, but breaks 1680 // for <1 x i64>. 1681 1682 // The first 36 bytes of the input buffer contains information about 1683 // thread group and global sizes. 1684 ISD::LoadExtType Ext = ISD::NON_EXTLOAD; 1685 if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { 1686 // FIXME: This should really check the extload type, but the handling of 1687 // extload vector parameters seems to be broken. 1688 1689 // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 1690 Ext = ISD::SEXTLOAD; 1691 } 1692 1693 // Compute the offset from the value. 1694 // XXX - I think PartOffset should give you this, but it seems to give the 1695 // size of the register which isn't useful. 
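    // Concretely: with the 36-byte dispatch info header described above, the
    // first kernel argument is loaded from constant-buffer offset 36, and an
    // argument at LocMemOffset N is loaded from offset 36 + N.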
1696
1697     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1698     unsigned PartOffset = VA.getLocMemOffset();
1699     unsigned Offset = 36 + VA.getLocMemOffset();
1700
1701     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1702     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1703                               DAG.getConstant(Offset, DL, MVT::i32),
1704                               DAG.getUNDEF(MVT::i32),
1705                               PtrInfo,
1706                               MemVT, false, true, true, 4);
1707
1708     // 4 is the preferred alignment for the CONSTANT memory space.
1709     InVals.push_back(Arg);
1710     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1711   }
1712   return Chain;
1713 }
1714
1715 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1716                                            EVT VT) const {
1717   if (!VT.isVector())
1718     return MVT::i32;
1719   return VT.changeVectorElementTypeToInteger();
1720 }
1721
1722 static SDValue CompactSwizzlableVector(
1723   SelectionDAG &DAG, SDValue VectorEntry,
1724   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1725   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1726   assert(RemapSwizzle.empty());
1727   SDValue NewBldVec[4] = {
1728     VectorEntry.getOperand(0),
1729     VectorEntry.getOperand(1),
1730     VectorEntry.getOperand(2),
1731     VectorEntry.getOperand(3)
1732   };
1733
1734   for (unsigned i = 0; i < 4; i++) {
1735     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1736       // We mask the write here to teach later passes that the ith element of
1737       // this vector is undef. Thus we can use it to reduce 128-bit register
1738       // usage, break false dependencies, and make the assembly easier to read.
1739       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1740     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1741       if (C->isZero()) {
1742         RemapSwizzle[i] = 4; // SEL_0
1743         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1744       } else if (C->isExactlyValue(1.0)) {
1745         RemapSwizzle[i] = 5; // SEL_1
1746         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1747       }
1748     }
1749
1750     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1751       continue;
1752     for (unsigned j = 0; j < i; j++) {
1753       if (NewBldVec[i] == NewBldVec[j]) {
1754         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1755         RemapSwizzle[i] = j;
1756         break;
1757       }
1758     }
1759   }
1760
1761   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1762                      VectorEntry.getValueType(), NewBldVec);
1763 }
1764
1765 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1766                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1767   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1768   assert(RemapSwizzle.empty());
1769   SDValue NewBldVec[4] = {
1770     VectorEntry.getOperand(0),
1771     VectorEntry.getOperand(1),
1772     VectorEntry.getOperand(2),
1773     VectorEntry.getOperand(3)
1774   };
1775   bool isUnmovable[4] = { false, false, false, false };
1776   for (unsigned i = 0; i < 4; i++) {
1777     RemapSwizzle[i] = i;
1778     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1779       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1780           ->getZExtValue();
1781       if (i == Idx)
1782         isUnmovable[Idx] = true;
1783     }
1784   }
1785
1786   for (unsigned i = 0; i < 4; i++) {
1787     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1788       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1789           ->getZExtValue();
1790       if (isUnmovable[Idx])
1791         continue;
1792       // Swap i and Idx
1793       std::swap(NewBldVec[Idx], NewBldVec[i]);
1794       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1795       break;
1796     }
1797   }
1798
1799   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1800
VectorEntry.getValueType(), NewBldVec); 1801 } 1802 1803 1804 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, 1805 SDValue Swz[4], SelectionDAG &DAG, 1806 SDLoc DL) const { 1807 assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); 1808 // Old -> New swizzle values 1809 DenseMap<unsigned, unsigned> SwizzleRemap; 1810 1811 BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); 1812 for (unsigned i = 0; i < 4; i++) { 1813 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); 1814 if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) 1815 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); 1816 } 1817 1818 SwizzleRemap.clear(); 1819 BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); 1820 for (unsigned i = 0; i < 4; i++) { 1821 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); 1822 if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) 1823 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); 1824 } 1825 1826 return BuildVector; 1827 } 1828 1829 1830 //===----------------------------------------------------------------------===// 1831 // Custom DAG Optimizations 1832 //===----------------------------------------------------------------------===// 1833 1834 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, 1835 DAGCombinerInfo &DCI) const { 1836 SelectionDAG &DAG = DCI.DAG; 1837 1838 switch (N->getOpcode()) { 1839 default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 1840 // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) 1841 case ISD::FP_ROUND: { 1842 SDValue Arg = N->getOperand(0); 1843 if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { 1844 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), 1845 Arg.getOperand(0)); 1846 } 1847 break; 1848 } 1849 1850 // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> 1851 // (i32 select_cc f32, f32, -1, 0 cc) 1852 // 1853 // Mesa's GLSL frontend generates the above pattern a lot and we can lower 1854 // this to one of the SET*_DX10 instructions. 1855 case ISD::FP_TO_SINT: { 1856 SDValue FNeg = N->getOperand(0); 1857 if (FNeg.getOpcode() != ISD::FNEG) { 1858 return SDValue(); 1859 } 1860 SDValue SelectCC = FNeg.getOperand(0); 1861 if (SelectCC.getOpcode() != ISD::SELECT_CC || 1862 SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS 1863 SelectCC.getOperand(2).getValueType() != MVT::f32 || // True 1864 !isHWTrueValue(SelectCC.getOperand(2)) || 1865 !isHWFalseValue(SelectCC.getOperand(3))) { 1866 return SDValue(); 1867 } 1868 1869 SDLoc dl(N); 1870 return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), 1871 SelectCC.getOperand(0), // LHS 1872 SelectCC.getOperand(1), // RHS 1873 DAG.getConstant(-1, dl, MVT::i32), // True 1874 DAG.getConstant(0, dl, MVT::i32), // False 1875 SelectCC.getOperand(4)); // CC 1876 1877 break; 1878 } 1879 1880 // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx 1881 // => build_vector elt0, ... , NewEltIdx, ... , eltN 1882 case ISD::INSERT_VECTOR_ELT: { 1883 SDValue InVec = N->getOperand(0); 1884 SDValue InVal = N->getOperand(1); 1885 SDValue EltNo = N->getOperand(2); 1886 SDLoc dl(N); 1887 1888 // If the inserted element is an UNDEF, just use the input vector. 
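    // (For example, insert_vector_elt (build_vector a, b, c, d), undef, 2 can
    //  be folded to the original build_vector, since keeping the old element
    //  is a valid choice for the undef lane.)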
1889     if (InVal.getOpcode() == ISD::UNDEF)
1890       return InVec;
1891
1892     EVT VT = InVec.getValueType();
1893
1894     // If we can't generate a legal BUILD_VECTOR, exit
1895     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1896       return SDValue();
1897
1898     // Check that we know which element is being inserted
1899     if (!isa<ConstantSDNode>(EltNo))
1900       return SDValue();
1901     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1902
1903     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1904     // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
1905     // vector elements.
1906     SmallVector<SDValue, 8> Ops;
1907     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1908       Ops.append(InVec.getNode()->op_begin(),
1909                  InVec.getNode()->op_end());
1910     } else if (InVec.getOpcode() == ISD::UNDEF) {
1911       unsigned NElts = VT.getVectorNumElements();
1912       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1913     } else {
1914       return SDValue();
1915     }
1916
1917     // Insert the element
1918     if (Elt < Ops.size()) {
1919       // All the operands of BUILD_VECTOR must have the same type;
1920       // we enforce that here.
1921       EVT OpVT = Ops[0].getValueType();
1922       if (InVal.getValueType() != OpVT)
1923         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1924           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1925           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1926       Ops[Elt] = InVal;
1927     }
1928
1929     // Return the new vector
1930     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1931   }
1932
1933   // An extract_vector_elt of a build_vector generated by custom lowering
1934   // also needs to be combined here.
1935   case ISD::EXTRACT_VECTOR_ELT: {
1936     SDValue Arg = N->getOperand(0);
1937     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1938       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1939         unsigned Element = Const->getZExtValue();
1940         return Arg->getOperand(Element);
1941       }
1942     }
1943     if (Arg.getOpcode() == ISD::BITCAST &&
1944         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1945       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1946         unsigned Element = Const->getZExtValue();
1947         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1948                            Arg->getOperand(0).getOperand(Element));
1949       }
1950     }
1951     break;
1952   }
1953
1954   case ISD::SELECT_CC: {
1955     // Try common optimizations
1956     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1957     if (Ret.getNode())
1958       return Ret;
1959
1960     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1961     // selectcc x, y, a, b, inv(cc)
1962     //
1963     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1964     // selectcc x, y, a, b, cc
1965     SDValue LHS = N->getOperand(0);
1966     if (LHS.getOpcode() != ISD::SELECT_CC) {
1967       return SDValue();
1968     }
1969
1970     SDValue RHS = N->getOperand(1);
1971     SDValue True = N->getOperand(2);
1972     SDValue False = N->getOperand(3);
1973     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1974
1975     if (LHS.getOperand(2).getNode() != True.getNode() ||
1976         LHS.getOperand(3).getNode() != False.getNode() ||
1977         RHS.getNode() != False.getNode()) {
1978       return SDValue();
1979     }
1980
1981     switch (NCC) {
1982     default: return SDValue();
1983     case ISD::SETNE: return LHS;
1984     case ISD::SETEQ: {
1985       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1986       LHSCC = ISD::getSetCCInverse(LHSCC,
1987                                    LHS.getOperand(0).getValueType().isInteger());
1988       if (DCI.isBeforeLegalizeOps() ||
1989           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1990         return DAG.getSelectCC(SDLoc(N),
1991                                LHS.getOperand(0),
1992                                LHS.getOperand(1),
1993                                LHS.getOperand(2),
1994                                LHS.getOperand(3),
1995                                LHSCC);
1996       break;
1997     }
1998     }
1999     return SDValue();
2000   }
2001
2002   case AMDGPUISD::EXPORT: {
2003     SDValue Arg = N->getOperand(1);
2004     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2005       break;
2006
2007     SDValue NewArgs[8] = {
2008       N->getOperand(0), // Chain
2009       SDValue(),
2010       N->getOperand(2), // ArrayBase
2011       N->getOperand(3), // Type
2012       N->getOperand(4), // SWZ_X
2013       N->getOperand(5), // SWZ_Y
2014       N->getOperand(6), // SWZ_Z
2015       N->getOperand(7)  // SWZ_W
2016     };
2017     SDLoc DL(N);
2018     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2019     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2020   }
2021   case AMDGPUISD::TEXTURE_FETCH: {
2022     SDValue Arg = N->getOperand(1);
2023     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2024       break;
2025
2026     SDValue NewArgs[19] = {
2027       N->getOperand(0),
2028       N->getOperand(1),
2029       N->getOperand(2),
2030       N->getOperand(3),
2031       N->getOperand(4),
2032       N->getOperand(5),
2033       N->getOperand(6),
2034       N->getOperand(7),
2035       N->getOperand(8),
2036       N->getOperand(9),
2037       N->getOperand(10),
2038       N->getOperand(11),
2039       N->getOperand(12),
2040       N->getOperand(13),
2041       N->getOperand(14),
2042       N->getOperand(15),
2043       N->getOperand(16),
2044       N->getOperand(17),
2045       N->getOperand(18),
2046     };
2047     SDLoc DL(N);
2048     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2049     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2050   }
2051   }
2052
2053   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2054 }
2055
2056 static bool
2057 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2058             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2059   const R600InstrInfo *TII =
2060       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2061   if (!Src.isMachineOpcode())
2062     return false;
2063   switch (Src.getMachineOpcode()) {
2064   case AMDGPU::FNEG_R600:
2065     if (!Neg.getNode())
2066       return false;
2067     Src = Src.getOperand(0);
2068     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2069     return true;
2070   case AMDGPU::FABS_R600:
2071     if (!Abs.getNode())
2072       return false;
2073     Src = Src.getOperand(0);
2074     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2075     return true;
2076   case AMDGPU::CONST_COPY: {
2077     unsigned Opcode = ParentNode->getMachineOpcode();
2078     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2079
2080     if (!Sel.getNode())
2081       return false;
2082
2083     SDValue CstOffset = Src.getOperand(0);
2084     if (ParentNode->getValueType(0).isVector())
2085       return false;
2086
2087     // Gather constant values
2088     int SrcIndices[] = {
2089       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2090       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2091       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2092       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2093       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2094       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2095       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2096       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2097       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2098       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2099       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2100     };
2101     std::vector<unsigned> Consts;
2102     for (int OtherSrcIdx : SrcIndices) {
2103       int OtherSelIdx =
TII->getSelIdx(Opcode, OtherSrcIdx); 2104 if (OtherSrcIdx < 0 || OtherSelIdx < 0) 2105 continue; 2106 if (HasDst) { 2107 OtherSrcIdx--; 2108 OtherSelIdx--; 2109 } 2110 if (RegisterSDNode *Reg = 2111 dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { 2112 if (Reg->getReg() == AMDGPU::ALU_CONST) { 2113 ConstantSDNode *Cst 2114 = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); 2115 Consts.push_back(Cst->getZExtValue()); 2116 } 2117 } 2118 } 2119 2120 ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset); 2121 Consts.push_back(Cst->getZExtValue()); 2122 if (!TII->fitsConstReadLimitations(Consts)) { 2123 return false; 2124 } 2125 2126 Sel = CstOffset; 2127 Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); 2128 return true; 2129 } 2130 case AMDGPU::MOV_IMM_I32: 2131 case AMDGPU::MOV_IMM_F32: { 2132 unsigned ImmReg = AMDGPU::ALU_LITERAL_X; 2133 uint64_t ImmValue = 0; 2134 2135 2136 if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { 2137 ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0)); 2138 float FloatValue = FPC->getValueAPF().convertToFloat(); 2139 if (FloatValue == 0.0) { 2140 ImmReg = AMDGPU::ZERO; 2141 } else if (FloatValue == 0.5) { 2142 ImmReg = AMDGPU::HALF; 2143 } else if (FloatValue == 1.0) { 2144 ImmReg = AMDGPU::ONE; 2145 } else { 2146 ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); 2147 } 2148 } else { 2149 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0)); 2150 uint64_t Value = C->getZExtValue(); 2151 if (Value == 0) { 2152 ImmReg = AMDGPU::ZERO; 2153 } else if (Value == 1) { 2154 ImmReg = AMDGPU::ONE_INT; 2155 } else { 2156 ImmValue = Value; 2157 } 2158 } 2159 2160 // Check that we aren't already using an immediate. 2161 // XXX: It's possible for an instruction to have more than one 2162 // immediate operand, but this is not supported yet. 
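    // Roughly: an instruction has a single ALU_LITERAL_X slot, so the fold
    // below is only performed while that slot does not already hold a
    // non-zero literal.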
2163 if (ImmReg == AMDGPU::ALU_LITERAL_X) { 2164 if (!Imm.getNode()) 2165 return false; 2166 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm); 2167 assert(C); 2168 if (C->getZExtValue()) 2169 return false; 2170 Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); 2171 } 2172 Src = DAG.getRegister(ImmReg, MVT::i32); 2173 return true; 2174 } 2175 default: 2176 return false; 2177 } 2178 } 2179 2180 2181 /// \brief Fold the instructions after selecting them 2182 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, 2183 SelectionDAG &DAG) const { 2184 const R600InstrInfo *TII = 2185 static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); 2186 if (!Node->isMachineOpcode()) 2187 return Node; 2188 unsigned Opcode = Node->getMachineOpcode(); 2189 SDValue FakeOp; 2190 2191 std::vector<SDValue> Ops(Node->op_begin(), Node->op_end()); 2192 2193 if (Opcode == AMDGPU::DOT_4) { 2194 int OperandIdx[] = { 2195 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), 2196 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), 2197 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), 2198 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), 2199 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), 2200 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), 2201 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), 2202 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) 2203 }; 2204 int NegIdx[] = { 2205 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), 2206 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), 2207 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), 2208 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), 2209 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), 2210 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), 2211 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), 2212 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) 2213 }; 2214 int AbsIdx[] = { 2215 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), 2216 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), 2217 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), 2218 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), 2219 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), 2220 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), 2221 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), 2222 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) 2223 }; 2224 for (unsigned i = 0; i < 8; i++) { 2225 if (OperandIdx[i] < 0) 2226 return Node; 2227 SDValue &Src = Ops[OperandIdx[i] - 1]; 2228 SDValue &Neg = Ops[NegIdx[i] - 1]; 2229 SDValue &Abs = Ops[AbsIdx[i] - 1]; 2230 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; 2231 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); 2232 if (HasDst) 2233 SelIdx--; 2234 SDValue &Sel = (SelIdx > -1) ? 
Ops[SelIdx] : FakeOp; 2235 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) 2236 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 2237 } 2238 } else if (Opcode == AMDGPU::REG_SEQUENCE) { 2239 for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { 2240 SDValue &Src = Ops[i]; 2241 if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) 2242 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 2243 } 2244 } else if (Opcode == AMDGPU::CLAMP_R600) { 2245 SDValue Src = Node->getOperand(0); 2246 if (!Src.isMachineOpcode() || 2247 !TII->hasInstrModifiers(Src.getMachineOpcode())) 2248 return Node; 2249 int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), 2250 AMDGPU::OpName::clamp); 2251 if (ClampIdx < 0) 2252 return Node; 2253 SDLoc DL(Node); 2254 std::vector<SDValue> Ops(Src->op_begin(), Src->op_end()); 2255 Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); 2256 return DAG.getMachineNode(Src.getMachineOpcode(), DL, 2257 Node->getVTList(), Ops); 2258 } else { 2259 if (!TII->hasInstrModifiers(Opcode)) 2260 return Node; 2261 int OperandIdx[] = { 2262 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), 2263 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), 2264 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) 2265 }; 2266 int NegIdx[] = { 2267 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), 2268 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), 2269 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) 2270 }; 2271 int AbsIdx[] = { 2272 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), 2273 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), 2274 -1 2275 }; 2276 for (unsigned i = 0; i < 3; i++) { 2277 if (OperandIdx[i] < 0) 2278 return Node; 2279 SDValue &Src = Ops[OperandIdx[i] - 1]; 2280 SDValue &Neg = Ops[NegIdx[i] - 1]; 2281 SDValue FakeAbs; 2282 SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; 2283 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; 2284 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); 2285 int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); 2286 if (HasDst) { 2287 SelIdx--; 2288 ImmIdx--; 2289 } 2290 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; 2291 SDValue &Imm = Ops[ImmIdx]; 2292 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) 2293 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 2294 } 2295 } 2296 2297 return Node; 2298 } 2299
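// Illustrative example of the folding above (not tied to a particular opcode):
//   t1 = FNEG_R600 t0
//   t2 = <ALU op with modifiers> ..., t1, ...
// FoldOperand() rewrites the use of t1 in t2 to t0 and sets the corresponding
// srcN_neg modifier to 1, so no separate FNEG instruction needs to be emitted.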