1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for R600 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "R600ISelLowering.h" 16 #include "AMDGPUFrameLowering.h" 17 #include "AMDGPUIntrinsicInfo.h" 18 #include "AMDGPUSubtarget.h" 19 #include "R600Defines.h" 20 #include "R600InstrInfo.h" 21 #include "R600MachineFunctionInfo.h" 22 #include "llvm/CodeGen/CallingConvLower.h" 23 #include "llvm/CodeGen/MachineFrameInfo.h" 24 #include "llvm/CodeGen/MachineInstrBuilder.h" 25 #include "llvm/CodeGen/MachineRegisterInfo.h" 26 #include "llvm/CodeGen/SelectionDAG.h" 27 #include "llvm/IR/Argument.h" 28 #include "llvm/IR/Function.h" 29 30 using namespace llvm; 31 32 R600TargetLowering::R600TargetLowering(TargetMachine &TM) : 33 AMDGPUTargetLowering(TM), 34 Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) { 35 addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); 36 addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); 37 addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); 38 addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); 39 addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); 40 addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); 41 42 computeRegisterProperties(); 43 44 // Set condition code actions 45 setCondCodeAction(ISD::SETO, MVT::f32, Expand); 46 setCondCodeAction(ISD::SETUO, MVT::f32, Expand); 47 setCondCodeAction(ISD::SETLT, MVT::f32, Expand); 48 setCondCodeAction(ISD::SETLE, MVT::f32, Expand); 49 setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); 50 setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); 51 setCondCodeAction(ISD::SETONE, MVT::f32, Expand); 52 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); 53 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); 54 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); 55 setCondCodeAction(ISD::SETULT, MVT::f32, Expand); 56 setCondCodeAction(ISD::SETULE, MVT::f32, Expand); 57 58 setCondCodeAction(ISD::SETLE, MVT::i32, Expand); 59 setCondCodeAction(ISD::SETLT, MVT::i32, Expand); 60 setCondCodeAction(ISD::SETULE, MVT::i32, Expand); 61 setCondCodeAction(ISD::SETULT, MVT::i32, Expand); 62 63 setOperationAction(ISD::FCOS, MVT::f32, Custom); 64 setOperationAction(ISD::FSIN, MVT::f32, Custom); 65 66 setOperationAction(ISD::SETCC, MVT::v4i32, Expand); 67 setOperationAction(ISD::SETCC, MVT::v2i32, Expand); 68 69 setOperationAction(ISD::BR_CC, MVT::i32, Expand); 70 setOperationAction(ISD::BR_CC, MVT::f32, Expand); 71 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 72 73 setOperationAction(ISD::FSUB, MVT::f32, Expand); 74 75 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 76 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 77 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); 78 79 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 80 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 81 82 setOperationAction(ISD::SETCC, MVT::i32, Expand); 83 setOperationAction(ISD::SETCC, MVT::f32, Expand); 84 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); 85 86 setOperationAction(ISD::SELECT, MVT::i32, Expand); 87 setOperationAction(ISD::SELECT, MVT::f32, Expand); 88 
setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 89 setOperationAction(ISD::SELECT, MVT::v4i32, Expand); 90 91 // Expand sign extension of vectors 92 if (!Subtarget->hasBFE()) 93 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 94 95 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); 96 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); 97 98 if (!Subtarget->hasBFE()) 99 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 100 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); 101 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); 102 103 if (!Subtarget->hasBFE()) 104 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 105 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); 106 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); 107 108 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 109 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); 110 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); 111 112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); 113 114 115 // Legalize loads and stores to the private address space. 116 setOperationAction(ISD::LOAD, MVT::i32, Custom); 117 setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 118 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 119 120 // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address 121 // spaces, so it is custom lowered to handle those where it isn't. 122 setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); 123 setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); 124 setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); 125 setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); 126 setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); 127 setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); 128 129 setOperationAction(ISD::STORE, MVT::i8, Custom); 130 setOperationAction(ISD::STORE, MVT::i32, Custom); 131 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 132 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 133 setTruncStoreAction(MVT::i32, MVT::i8, Custom); 134 setTruncStoreAction(MVT::i32, MVT::i16, Custom); 135 136 setOperationAction(ISD::LOAD, MVT::i32, Custom); 137 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 138 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 139 140 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); 141 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); 142 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 143 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 144 145 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); 146 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); 147 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 148 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 149 150 setTargetDAGCombine(ISD::FP_ROUND); 151 setTargetDAGCombine(ISD::FP_TO_SINT); 152 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 153 setTargetDAGCombine(ISD::SELECT_CC); 154 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 155 156 setOperationAction(ISD::SUB, MVT::i64, Expand); 157 158 // These should be replaced by UDVIREM, but it does not happen automatically 159 // during Type Legalization 160 setOperationAction(ISD::UDIV, MVT::i64, Custom); 161 setOperationAction(ISD::UREM, MVT::i64, Custom); 162 setOperationAction(ISD::SDIV, MVT::i64, Custom); 163 setOperationAction(ISD::SREM, MVT::i64, Custom); 164 165 // We don't have 64-bit shifts. 
Thus we need either SHX i64 or SHX_PARTS i32 166 // to be Legal/Custom in order to avoid library calls. 167 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 168 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 169 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 170 171 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 172 173 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 174 for (MVT VT : ScalarIntVTs) { 175 setOperationAction(ISD::ADDC, VT, Expand); 176 setOperationAction(ISD::SUBC, VT, Expand); 177 setOperationAction(ISD::ADDE, VT, Expand); 178 setOperationAction(ISD::SUBE, VT, Expand); 179 } 180 181 setBooleanContents(ZeroOrNegativeOneBooleanContent); 182 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 183 setSchedulingPreference(Sched::Source); 184 } 185 186 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( 187 MachineInstr * MI, MachineBasicBlock * BB) const { 188 MachineFunction * MF = BB->getParent(); 189 MachineRegisterInfo &MRI = MF->getRegInfo(); 190 MachineBasicBlock::iterator I = *MI; 191 const R600InstrInfo *TII = 192 static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo()); 193 194 switch (MI->getOpcode()) { 195 default: 196 // Replace LDS_*_RET instruction that don't have any uses with the 197 // equivalent LDS_*_NORET instruction. 198 if (TII->isLDSRetInstr(MI->getOpcode())) { 199 int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); 200 assert(DstIdx != -1); 201 MachineInstrBuilder NewMI; 202 if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) 203 return BB; 204 205 NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), 206 TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); 207 for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { 208 NewMI.addOperand(MI->getOperand(i)); 209 } 210 } else { 211 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 212 } 213 break; 214 case AMDGPU::CLAMP_R600: { 215 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 216 AMDGPU::MOV, 217 MI->getOperand(0).getReg(), 218 MI->getOperand(1).getReg()); 219 TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); 220 break; 221 } 222 223 case AMDGPU::FABS_R600: { 224 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 225 AMDGPU::MOV, 226 MI->getOperand(0).getReg(), 227 MI->getOperand(1).getReg()); 228 TII->addFlag(NewMI, 0, MO_FLAG_ABS); 229 break; 230 } 231 232 case AMDGPU::FNEG_R600: { 233 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 234 AMDGPU::MOV, 235 MI->getOperand(0).getReg(), 236 MI->getOperand(1).getReg()); 237 TII->addFlag(NewMI, 0, MO_FLAG_NEG); 238 break; 239 } 240 241 case AMDGPU::MASK_WRITE: { 242 unsigned maskedRegister = MI->getOperand(0).getReg(); 243 assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); 244 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); 245 TII->addFlag(defInstr, 0, MO_FLAG_MASK); 246 break; 247 } 248 249 case AMDGPU::MOV_IMM_F32: 250 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), 251 MI->getOperand(1).getFPImm()->getValueAPF() 252 .bitcastToAPInt().getZExtValue()); 253 break; 254 case AMDGPU::MOV_IMM_I32: 255 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), 256 MI->getOperand(1).getImm()); 257 break; 258 case AMDGPU::CONST_COPY: { 259 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, 260 MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); 261 TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, 262 MI->getOperand(1).getImm()); 263 break; 264 } 265 266 case AMDGPU::RAT_WRITE_CACHELESS_32_eg: 267 case 
AMDGPU::RAT_WRITE_CACHELESS_64_eg: 268 case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { 269 unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; 270 271 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 272 .addOperand(MI->getOperand(0)) 273 .addOperand(MI->getOperand(1)) 274 .addImm(EOP); // Set End of program bit 275 break; 276 } 277 278 case AMDGPU::TXD: { 279 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 280 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 281 MachineOperand &RID = MI->getOperand(4); 282 MachineOperand &SID = MI->getOperand(5); 283 unsigned TextureId = MI->getOperand(6).getImm(); 284 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; 285 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; 286 287 switch (TextureId) { 288 case 5: // Rect 289 CTX = CTY = 0; 290 break; 291 case 6: // Shadow1D 292 SrcW = SrcZ; 293 break; 294 case 7: // Shadow2D 295 SrcW = SrcZ; 296 break; 297 case 8: // ShadowRect 298 CTX = CTY = 0; 299 SrcW = SrcZ; 300 break; 301 case 9: // 1DArray 302 SrcZ = SrcY; 303 CTZ = 0; 304 break; 305 case 10: // 2DArray 306 CTZ = 0; 307 break; 308 case 11: // Shadow1DArray 309 SrcZ = SrcY; 310 CTZ = 0; 311 break; 312 case 12: // Shadow2DArray 313 CTZ = 0; 314 break; 315 } 316 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) 317 .addOperand(MI->getOperand(3)) 318 .addImm(SrcX) 319 .addImm(SrcY) 320 .addImm(SrcZ) 321 .addImm(SrcW) 322 .addImm(0) 323 .addImm(0) 324 .addImm(0) 325 .addImm(0) 326 .addImm(1) 327 .addImm(2) 328 .addImm(3) 329 .addOperand(RID) 330 .addOperand(SID) 331 .addImm(CTX) 332 .addImm(CTY) 333 .addImm(CTZ) 334 .addImm(CTW); 335 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 336 .addOperand(MI->getOperand(2)) 337 .addImm(SrcX) 338 .addImm(SrcY) 339 .addImm(SrcZ) 340 .addImm(SrcW) 341 .addImm(0) 342 .addImm(0) 343 .addImm(0) 344 .addImm(0) 345 .addImm(1) 346 .addImm(2) 347 .addImm(3) 348 .addOperand(RID) 349 .addOperand(SID) 350 .addImm(CTX) 351 .addImm(CTY) 352 .addImm(CTZ) 353 .addImm(CTW); 354 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) 355 .addOperand(MI->getOperand(0)) 356 .addOperand(MI->getOperand(1)) 357 .addImm(SrcX) 358 .addImm(SrcY) 359 .addImm(SrcZ) 360 .addImm(SrcW) 361 .addImm(0) 362 .addImm(0) 363 .addImm(0) 364 .addImm(0) 365 .addImm(1) 366 .addImm(2) 367 .addImm(3) 368 .addOperand(RID) 369 .addOperand(SID) 370 .addImm(CTX) 371 .addImm(CTY) 372 .addImm(CTZ) 373 .addImm(CTW) 374 .addReg(T0, RegState::Implicit) 375 .addReg(T1, RegState::Implicit); 376 break; 377 } 378 379 case AMDGPU::TXD_SHADOW: { 380 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 381 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 382 MachineOperand &RID = MI->getOperand(4); 383 MachineOperand &SID = MI->getOperand(5); 384 unsigned TextureId = MI->getOperand(6).getImm(); 385 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; 386 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; 387 388 switch (TextureId) { 389 case 5: // Rect 390 CTX = CTY = 0; 391 break; 392 case 6: // Shadow1D 393 SrcW = SrcZ; 394 break; 395 case 7: // Shadow2D 396 SrcW = SrcZ; 397 break; 398 case 8: // ShadowRect 399 CTX = CTY = 0; 400 SrcW = SrcZ; 401 break; 402 case 9: // 1DArray 403 SrcZ = SrcY; 404 CTZ = 0; 405 break; 406 case 10: // 2DArray 407 CTZ = 0; 408 break; 409 case 11: // Shadow1DArray 410 SrcZ = SrcY; 411 CTZ = 0; 412 break; 413 case 12: // Shadow2DArray 414 CTZ = 0; 415 break; 416 } 417 418 
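    // As with TXD above, the instruction is expanded into three TEX
    // operations: TEX_SET_GRADIENTS_H takes operand 3 and defines T0,
    // TEX_SET_GRADIENTS_V takes operand 2 and defines T1, and TEX_SAMPLE_C_G
    // then performs the shadow-compare sample.  T0 and T1 are attached to the
    // sample as implicit uses so the gradient writes are not treated as dead
    // and stay ordered before it.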
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) 419 .addOperand(MI->getOperand(3)) 420 .addImm(SrcX) 421 .addImm(SrcY) 422 .addImm(SrcZ) 423 .addImm(SrcW) 424 .addImm(0) 425 .addImm(0) 426 .addImm(0) 427 .addImm(0) 428 .addImm(1) 429 .addImm(2) 430 .addImm(3) 431 .addOperand(RID) 432 .addOperand(SID) 433 .addImm(CTX) 434 .addImm(CTY) 435 .addImm(CTZ) 436 .addImm(CTW); 437 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 438 .addOperand(MI->getOperand(2)) 439 .addImm(SrcX) 440 .addImm(SrcY) 441 .addImm(SrcZ) 442 .addImm(SrcW) 443 .addImm(0) 444 .addImm(0) 445 .addImm(0) 446 .addImm(0) 447 .addImm(1) 448 .addImm(2) 449 .addImm(3) 450 .addOperand(RID) 451 .addOperand(SID) 452 .addImm(CTX) 453 .addImm(CTY) 454 .addImm(CTZ) 455 .addImm(CTW); 456 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) 457 .addOperand(MI->getOperand(0)) 458 .addOperand(MI->getOperand(1)) 459 .addImm(SrcX) 460 .addImm(SrcY) 461 .addImm(SrcZ) 462 .addImm(SrcW) 463 .addImm(0) 464 .addImm(0) 465 .addImm(0) 466 .addImm(0) 467 .addImm(1) 468 .addImm(2) 469 .addImm(3) 470 .addOperand(RID) 471 .addOperand(SID) 472 .addImm(CTX) 473 .addImm(CTY) 474 .addImm(CTZ) 475 .addImm(CTW) 476 .addReg(T0, RegState::Implicit) 477 .addReg(T1, RegState::Implicit); 478 break; 479 } 480 481 case AMDGPU::BRANCH: 482 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) 483 .addOperand(MI->getOperand(0)); 484 break; 485 486 case AMDGPU::BRANCH_COND_f32: { 487 MachineInstr *NewMI = 488 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), 489 AMDGPU::PREDICATE_BIT) 490 .addOperand(MI->getOperand(1)) 491 .addImm(OPCODE_IS_NOT_ZERO) 492 .addImm(0); // Flags 493 TII->addFlag(NewMI, 0, MO_FLAG_PUSH); 494 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) 495 .addOperand(MI->getOperand(0)) 496 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); 497 break; 498 } 499 500 case AMDGPU::BRANCH_COND_i32: { 501 MachineInstr *NewMI = 502 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), 503 AMDGPU::PREDICATE_BIT) 504 .addOperand(MI->getOperand(1)) 505 .addImm(OPCODE_IS_NOT_ZERO_INT) 506 .addImm(0); // Flags 507 TII->addFlag(NewMI, 0, MO_FLAG_PUSH); 508 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) 509 .addOperand(MI->getOperand(0)) 510 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); 511 break; 512 } 513 514 case AMDGPU::EG_ExportSwz: 515 case AMDGPU::R600_ExportSwz: { 516 // Instruction is left unmodified if its not the last one of its type 517 bool isLastInstructionOfItsType = true; 518 unsigned InstExportType = MI->getOperand(1).getImm(); 519 for (MachineBasicBlock::iterator NextExportInst = std::next(I), 520 EndBlock = BB->end(); NextExportInst != EndBlock; 521 NextExportInst = std::next(NextExportInst)) { 522 if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || 523 NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { 524 unsigned CurrentInstExportType = NextExportInst->getOperand(1) 525 .getImm(); 526 if (CurrentInstExportType == InstExportType) { 527 isLastInstructionOfItsType = false; 528 break; 529 } 530 } 531 } 532 bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; 533 if (!EOP && !isLastInstructionOfItsType) 534 return BB; 535 unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 
84 : 40; 536 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 537 .addOperand(MI->getOperand(0)) 538 .addOperand(MI->getOperand(1)) 539 .addOperand(MI->getOperand(2)) 540 .addOperand(MI->getOperand(3)) 541 .addOperand(MI->getOperand(4)) 542 .addOperand(MI->getOperand(5)) 543 .addOperand(MI->getOperand(6)) 544 .addImm(CfInst) 545 .addImm(EOP); 546 break; 547 } 548 case AMDGPU::RETURN: { 549 // RETURN instructions must have the live-out registers as implicit uses, 550 // otherwise they appear dead. 551 R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); 552 MachineInstrBuilder MIB(*MF, MI); 553 for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) 554 MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); 555 return BB; 556 } 557 } 558 559 MI->eraseFromParent(); 560 return BB; 561 } 562 563 //===----------------------------------------------------------------------===// 564 // Custom DAG Lowering Operations 565 //===----------------------------------------------------------------------===// 566 567 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 568 MachineFunction &MF = DAG.getMachineFunction(); 569 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 570 switch (Op.getOpcode()) { 571 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 572 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 573 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 574 case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); 575 case ISD::SRA_PARTS: 576 case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); 577 case ISD::FCOS: 578 case ISD::FSIN: return LowerTrig(Op, DAG); 579 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 580 case ISD::STORE: return LowerSTORE(Op, DAG); 581 case ISD::LOAD: { 582 SDValue Result = LowerLOAD(Op, DAG); 583 assert((!Result.getNode() || 584 Result.getNode()->getNumValues() == 2) && 585 "Load should return a value and a chain"); 586 return Result; 587 } 588 589 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 590 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); 591 case ISD::INTRINSIC_VOID: { 592 SDValue Chain = Op.getOperand(0); 593 unsigned IntrinsicID = 594 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 595 switch (IntrinsicID) { 596 case AMDGPUIntrinsic::AMDGPU_store_output: { 597 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 598 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); 599 MFI->LiveOuts.push_back(Reg); 600 return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); 601 } 602 case AMDGPUIntrinsic::R600_store_swizzle: { 603 const SDValue Args[8] = { 604 Chain, 605 Op.getOperand(2), // Export Value 606 Op.getOperand(3), // ArrayBase 607 Op.getOperand(4), // Type 608 DAG.getConstant(0, MVT::i32), // SWZ_X 609 DAG.getConstant(1, MVT::i32), // SWZ_Y 610 DAG.getConstant(2, MVT::i32), // SWZ_Z 611 DAG.getConstant(3, MVT::i32) // SWZ_W 612 }; 613 return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args); 614 } 615 616 // default for switch(IntrinsicID) 617 default: break; 618 } 619 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) 620 break; 621 } 622 case ISD::INTRINSIC_WO_CHAIN: { 623 unsigned IntrinsicID = 624 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 625 EVT VT = Op.getValueType(); 626 SDLoc DL(Op); 627 switch(IntrinsicID) { 628 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 629 case 
AMDGPUIntrinsic::R600_load_input: { 630 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 631 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); 632 MachineFunction &MF = DAG.getMachineFunction(); 633 MachineRegisterInfo &MRI = MF.getRegInfo(); 634 MRI.addLiveIn(Reg); 635 return DAG.getCopyFromReg(DAG.getEntryNode(), 636 SDLoc(DAG.getEntryNode()), Reg, VT); 637 } 638 639 case AMDGPUIntrinsic::R600_interp_input: { 640 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 641 int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); 642 MachineSDNode *interp; 643 if (ijb < 0) { 644 const MachineFunction &MF = DAG.getMachineFunction(); 645 const R600InstrInfo *TII = 646 static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo()); 647 interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, 648 MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); 649 return DAG.getTargetExtractSubreg( 650 TII->getRegisterInfo().getSubRegFromChannel(slot % 4), 651 DL, MVT::f32, SDValue(interp, 0)); 652 } 653 MachineFunction &MF = DAG.getMachineFunction(); 654 MachineRegisterInfo &MRI = MF.getRegInfo(); 655 unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); 656 unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); 657 MRI.addLiveIn(RegisterI); 658 MRI.addLiveIn(RegisterJ); 659 SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), 660 SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); 661 SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), 662 SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); 663 664 if (slot % 4 < 2) 665 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, 666 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), 667 RegisterJNode, RegisterINode); 668 else 669 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, 670 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), 671 RegisterJNode, RegisterINode); 672 return SDValue(interp, slot % 2); 673 } 674 case AMDGPUIntrinsic::R600_interp_xy: 675 case AMDGPUIntrinsic::R600_interp_zw: { 676 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 677 MachineSDNode *interp; 678 SDValue RegisterINode = Op.getOperand(2); 679 SDValue RegisterJNode = Op.getOperand(3); 680 681 if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) 682 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, 683 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32), 684 RegisterJNode, RegisterINode); 685 else 686 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, 687 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32), 688 RegisterJNode, RegisterINode); 689 return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, 690 SDValue(interp, 0), SDValue(interp, 1)); 691 } 692 case AMDGPUIntrinsic::R600_tex: 693 case AMDGPUIntrinsic::R600_texc: 694 case AMDGPUIntrinsic::R600_txl: 695 case AMDGPUIntrinsic::R600_txlc: 696 case AMDGPUIntrinsic::R600_txb: 697 case AMDGPUIntrinsic::R600_txbc: 698 case AMDGPUIntrinsic::R600_txf: 699 case AMDGPUIntrinsic::R600_txq: 700 case AMDGPUIntrinsic::R600_ddx: 701 case AMDGPUIntrinsic::R600_ddy: 702 case AMDGPUIntrinsic::R600_ldptr: { 703 unsigned TextureOp; 704 switch (IntrinsicID) { 705 case AMDGPUIntrinsic::R600_tex: 706 TextureOp = 0; 707 break; 708 case AMDGPUIntrinsic::R600_texc: 709 TextureOp = 1; 710 break; 711 case AMDGPUIntrinsic::R600_txl: 712 TextureOp = 2; 713 break; 714 case AMDGPUIntrinsic::R600_txlc: 715 TextureOp = 3; 716 break; 717 case AMDGPUIntrinsic::R600_txb: 
718 TextureOp = 4; 719 break; 720 case AMDGPUIntrinsic::R600_txbc: 721 TextureOp = 5; 722 break; 723 case AMDGPUIntrinsic::R600_txf: 724 TextureOp = 6; 725 break; 726 case AMDGPUIntrinsic::R600_txq: 727 TextureOp = 7; 728 break; 729 case AMDGPUIntrinsic::R600_ddx: 730 TextureOp = 8; 731 break; 732 case AMDGPUIntrinsic::R600_ddy: 733 TextureOp = 9; 734 break; 735 case AMDGPUIntrinsic::R600_ldptr: 736 TextureOp = 10; 737 break; 738 default: 739 llvm_unreachable("Unknow Texture Operation"); 740 } 741 742 SDValue TexArgs[19] = { 743 DAG.getConstant(TextureOp, MVT::i32), 744 Op.getOperand(1), 745 DAG.getConstant(0, MVT::i32), 746 DAG.getConstant(1, MVT::i32), 747 DAG.getConstant(2, MVT::i32), 748 DAG.getConstant(3, MVT::i32), 749 Op.getOperand(2), 750 Op.getOperand(3), 751 Op.getOperand(4), 752 DAG.getConstant(0, MVT::i32), 753 DAG.getConstant(1, MVT::i32), 754 DAG.getConstant(2, MVT::i32), 755 DAG.getConstant(3, MVT::i32), 756 Op.getOperand(5), 757 Op.getOperand(6), 758 Op.getOperand(7), 759 Op.getOperand(8), 760 Op.getOperand(9), 761 Op.getOperand(10) 762 }; 763 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); 764 } 765 case AMDGPUIntrinsic::AMDGPU_dp4: { 766 SDValue Args[8] = { 767 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 768 DAG.getConstant(0, MVT::i32)), 769 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 770 DAG.getConstant(0, MVT::i32)), 771 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 772 DAG.getConstant(1, MVT::i32)), 773 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 774 DAG.getConstant(1, MVT::i32)), 775 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 776 DAG.getConstant(2, MVT::i32)), 777 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 778 DAG.getConstant(2, MVT::i32)), 779 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 780 DAG.getConstant(3, MVT::i32)), 781 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 782 DAG.getConstant(3, MVT::i32)) 783 }; 784 return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); 785 } 786 787 case Intrinsic::r600_read_ngroups_x: 788 return LowerImplicitParameter(DAG, VT, DL, 0); 789 case Intrinsic::r600_read_ngroups_y: 790 return LowerImplicitParameter(DAG, VT, DL, 1); 791 case Intrinsic::r600_read_ngroups_z: 792 return LowerImplicitParameter(DAG, VT, DL, 2); 793 case Intrinsic::r600_read_global_size_x: 794 return LowerImplicitParameter(DAG, VT, DL, 3); 795 case Intrinsic::r600_read_global_size_y: 796 return LowerImplicitParameter(DAG, VT, DL, 4); 797 case Intrinsic::r600_read_global_size_z: 798 return LowerImplicitParameter(DAG, VT, DL, 5); 799 case Intrinsic::r600_read_local_size_x: 800 return LowerImplicitParameter(DAG, VT, DL, 6); 801 case Intrinsic::r600_read_local_size_y: 802 return LowerImplicitParameter(DAG, VT, DL, 7); 803 case Intrinsic::r600_read_local_size_z: 804 return LowerImplicitParameter(DAG, VT, DL, 8); 805 806 case Intrinsic::r600_read_tgid_x: 807 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 808 AMDGPU::T1_X, VT); 809 case Intrinsic::r600_read_tgid_y: 810 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 811 AMDGPU::T1_Y, VT); 812 case Intrinsic::r600_read_tgid_z: 813 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 814 AMDGPU::T1_Z, VT); 815 case Intrinsic::r600_read_tidig_x: 816 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 817 AMDGPU::T0_X, VT); 818 case 
Intrinsic::r600_read_tidig_y: 819 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 820 AMDGPU::T0_Y, VT); 821 case Intrinsic::r600_read_tidig_z: 822 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 823 AMDGPU::T0_Z, VT); 824 case Intrinsic::AMDGPU_rsq: 825 // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. 826 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 827 } 828 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) 829 break; 830 } 831 } // end switch(Op.getOpcode()) 832 return SDValue(); 833 } 834 835 void R600TargetLowering::ReplaceNodeResults(SDNode *N, 836 SmallVectorImpl<SDValue> &Results, 837 SelectionDAG &DAG) const { 838 switch (N->getOpcode()) { 839 default: 840 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); 841 return; 842 case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); 843 return; 844 case ISD::UDIV: { 845 SDValue Op = SDValue(N, 0); 846 SDLoc DL(Op); 847 EVT VT = Op.getValueType(); 848 SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), 849 N->getOperand(0), N->getOperand(1)); 850 Results.push_back(UDIVREM); 851 break; 852 } 853 case ISD::UREM: { 854 SDValue Op = SDValue(N, 0); 855 SDLoc DL(Op); 856 EVT VT = Op.getValueType(); 857 SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), 858 N->getOperand(0), N->getOperand(1)); 859 Results.push_back(UDIVREM.getValue(1)); 860 break; 861 } 862 case ISD::SDIV: { 863 SDValue Op = SDValue(N, 0); 864 SDLoc DL(Op); 865 EVT VT = Op.getValueType(); 866 SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT), 867 N->getOperand(0), N->getOperand(1)); 868 Results.push_back(SDIVREM); 869 break; 870 } 871 case ISD::SREM: { 872 SDValue Op = SDValue(N, 0); 873 SDLoc DL(Op); 874 EVT VT = Op.getValueType(); 875 SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT), 876 N->getOperand(0), N->getOperand(1)); 877 Results.push_back(SDIVREM.getValue(1)); 878 break; 879 } 880 case ISD::SDIVREM: { 881 SDValue Op = SDValue(N, 1); 882 SDValue RES = LowerSDIVREM(Op, DAG); 883 Results.push_back(RES); 884 Results.push_back(RES.getValue(1)); 885 break; 886 } 887 case ISD::UDIVREM: { 888 SDValue Op = SDValue(N, 0); 889 SDLoc DL(Op); 890 EVT VT = Op.getValueType(); 891 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 892 893 SDValue one = DAG.getConstant(1, HalfVT); 894 SDValue zero = DAG.getConstant(0, HalfVT); 895 896 //HiLo split 897 SDValue LHS = N->getOperand(0); 898 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); 899 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); 900 901 SDValue RHS = N->getOperand(1); 902 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); 903 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); 904 905 // Get Speculative values 906 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); 907 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); 908 909 SDValue REM_Hi = zero; 910 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); 911 912 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); 913 SDValue DIV_Lo = zero; 914 915 const unsigned halfBitWidth = HalfVT.getSizeInBits(); 916 917 for (unsigned i = 0; i < halfBitWidth; ++i) { 918 SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); 919 // Get Value of high bit 920 SDValue HBit; 921 if (halfBitWidth 
== 32 && Subtarget->hasBFE()) { 922 HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); 923 } else { 924 HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); 925 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); 926 } 927 928 SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, 929 DAG.getConstant(halfBitWidth - 1, HalfVT)); 930 REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); 931 REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); 932 933 REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); 934 REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); 935 936 937 SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); 938 939 SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); 940 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); 941 942 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); 943 944 // Update REM 945 946 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); 947 948 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); 949 REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); 950 REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); 951 } 952 953 SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); 954 SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); 955 Results.push_back(DIV); 956 Results.push_back(REM); 957 break; 958 } 959 } 960 } 961 962 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, 963 SDValue Vector) const { 964 965 SDLoc DL(Vector); 966 EVT VecVT = Vector.getValueType(); 967 EVT EltVT = VecVT.getVectorElementType(); 968 SmallVector<SDValue, 8> Args; 969 970 for (unsigned i = 0, e = VecVT.getVectorNumElements(); 971 i != e; ++i) { 972 Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, 973 Vector, DAG.getConstant(i, getVectorIdxTy()))); 974 } 975 976 return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); 977 } 978 979 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 980 SelectionDAG &DAG) const { 981 982 SDLoc DL(Op); 983 SDValue Vector = Op.getOperand(0); 984 SDValue Index = Op.getOperand(1); 985 986 if (isa<ConstantSDNode>(Index) || 987 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 988 return Op; 989 990 Vector = vectorToVerticalVector(DAG, Vector); 991 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), 992 Vector, Index); 993 } 994 995 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 996 SelectionDAG &DAG) const { 997 SDLoc DL(Op); 998 SDValue Vector = Op.getOperand(0); 999 SDValue Value = Op.getOperand(1); 1000 SDValue Index = Op.getOperand(2); 1001 1002 if (isa<ConstantSDNode>(Index) || 1003 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 1004 return Op; 1005 1006 Vector = vectorToVerticalVector(DAG, Vector); 1007 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), 1008 Vector, Value, Index); 1009 return vectorToVerticalVector(DAG, Insert); 1010 } 1011 1012 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 1013 // On hw >= R700, COS/SIN input must be between -1. and 1. 
1014 // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) 1015 EVT VT = Op.getValueType(); 1016 SDValue Arg = Op.getOperand(0); 1017 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT, 1018 DAG.getNode(ISD::FADD, SDLoc(Op), VT, 1019 DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg, 1020 DAG.getConstantFP(0.15915494309, MVT::f32)), 1021 DAG.getConstantFP(0.5, MVT::f32))); 1022 unsigned TrigNode; 1023 switch (Op.getOpcode()) { 1024 case ISD::FCOS: 1025 TrigNode = AMDGPUISD::COS_HW; 1026 break; 1027 case ISD::FSIN: 1028 TrigNode = AMDGPUISD::SIN_HW; 1029 break; 1030 default: 1031 llvm_unreachable("Wrong trig opcode"); 1032 } 1033 SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT, 1034 DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart, 1035 DAG.getConstantFP(-0.5, MVT::f32))); 1036 if (Gen >= AMDGPUSubtarget::R700) 1037 return TrigVal; 1038 // On R600 hw, COS/SIN input must be between -Pi and Pi. 1039 return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal, 1040 DAG.getConstantFP(3.14159265359, MVT::f32)); 1041 } 1042 1043 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { 1044 SDLoc DL(Op); 1045 EVT VT = Op.getValueType(); 1046 1047 SDValue Lo = Op.getOperand(0); 1048 SDValue Hi = Op.getOperand(1); 1049 SDValue Shift = Op.getOperand(2); 1050 SDValue Zero = DAG.getConstant(0, VT); 1051 SDValue One = DAG.getConstant(1, VT); 1052 1053 SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); 1054 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); 1055 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); 1056 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); 1057 1058 // The dance around Width1 is necessary for 0 special case. 1059 // Without it the CompShift might be 32, producing incorrect results in 1060 // Overflow. So we do the shift in two steps, the alternative is to 1061 // add a conditional to filter the special case. 1062 1063 SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); 1064 Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); 1065 1066 SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); 1067 HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); 1068 SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); 1069 1070 SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); 1071 SDValue LoBig = Zero; 1072 1073 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); 1074 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); 1075 1076 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); 1077 } 1078 1079 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { 1080 SDLoc DL(Op); 1081 EVT VT = Op.getValueType(); 1082 1083 SDValue Lo = Op.getOperand(0); 1084 SDValue Hi = Op.getOperand(1); 1085 SDValue Shift = Op.getOperand(2); 1086 SDValue Zero = DAG.getConstant(0, VT); 1087 SDValue One = DAG.getConstant(1, VT); 1088 1089 const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; 1090 1091 SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); 1092 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); 1093 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); 1094 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); 1095 1096 // The dance around Width1 is necessary for 0 special case. 1097 // Without it the CompShift might be 32, producing incorrect results in 1098 // Overflow. 
So we do the shift in two steps, the alternative is to 1099 // add a conditional to filter the special case. 1100 1101 SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); 1102 Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); 1103 1104 SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); 1105 SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); 1106 LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); 1107 1108 SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); 1109 SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; 1110 1111 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); 1112 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); 1113 1114 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); 1115 } 1116 1117 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { 1118 return DAG.getNode( 1119 ISD::SETCC, 1120 SDLoc(Op), 1121 MVT::i1, 1122 Op, DAG.getConstantFP(0.0f, MVT::f32), 1123 DAG.getCondCode(ISD::SETNE) 1124 ); 1125 } 1126 1127 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, 1128 SDLoc DL, 1129 unsigned DwordOffset) const { 1130 unsigned ByteOffset = DwordOffset * 4; 1131 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), 1132 AMDGPUAS::CONSTANT_BUFFER_0); 1133 1134 // We shouldn't be using an offset wider than 16-bits for implicit parameters. 1135 assert(isInt<16>(ByteOffset)); 1136 1137 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 1138 DAG.getConstant(ByteOffset, MVT::i32), // PTR 1139 MachinePointerInfo(ConstantPointerNull::get(PtrType)), 1140 false, false, false, 0); 1141 } 1142 1143 bool R600TargetLowering::isZero(SDValue Op) const { 1144 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { 1145 return Cst->isNullValue(); 1146 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){ 1147 return CstFP->isZero(); 1148 } else { 1149 return false; 1150 } 1151 } 1152 1153 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 1154 SDLoc DL(Op); 1155 EVT VT = Op.getValueType(); 1156 1157 SDValue LHS = Op.getOperand(0); 1158 SDValue RHS = Op.getOperand(1); 1159 SDValue True = Op.getOperand(2); 1160 SDValue False = Op.getOperand(3); 1161 SDValue CC = Op.getOperand(4); 1162 SDValue Temp; 1163 1164 // LHS and RHS are guaranteed to be the same value type 1165 EVT CompareVT = LHS.getValueType(); 1166 1167 // Check if we can lower this to a native operation. 1168 1169 // Try to lower to a SET* instruction: 1170 // 1171 // SET* can match the following patterns: 1172 // 1173 // select_cc f32, f32, -1, 0, cc_supported 1174 // select_cc f32, f32, 1.0f, 0.0f, cc_supported 1175 // select_cc i32, i32, -1, 0, cc_supported 1176 // 1177 1178 // Move hardware True/False values to the correct operand. 
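  //
  // For example, (select_cc i32:lhs, i32:rhs, 0, -1, seteq) has the hardware
  // true value (-1) in the False slot.  Inverting the condition (and, if the
  // inverse is not legal, swapping lhs/rhs as well) rewrites it to
  // (select_cc i32:lhs, i32:rhs, -1, 0, setne), which a SET* instruction can
  // match directly.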
1179 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1180 ISD::CondCode InverseCC = 1181 ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); 1182 if (isHWTrueValue(False) && isHWFalseValue(True)) { 1183 if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { 1184 std::swap(False, True); 1185 CC = DAG.getCondCode(InverseCC); 1186 } else { 1187 ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); 1188 if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { 1189 std::swap(False, True); 1190 std::swap(LHS, RHS); 1191 CC = DAG.getCondCode(SwapInvCC); 1192 } 1193 } 1194 } 1195 1196 if (isHWTrueValue(True) && isHWFalseValue(False) && 1197 (CompareVT == VT || VT == MVT::i32)) { 1198 // This can be matched by a SET* instruction. 1199 return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); 1200 } 1201 1202 // Try to lower to a CND* instruction: 1203 // 1204 // CND* can match the following patterns: 1205 // 1206 // select_cc f32, 0.0, f32, f32, cc_supported 1207 // select_cc f32, 0.0, i32, i32, cc_supported 1208 // select_cc i32, 0, f32, f32, cc_supported 1209 // select_cc i32, 0, i32, i32, cc_supported 1210 // 1211 1212 // Try to move the zero value to the RHS 1213 if (isZero(LHS)) { 1214 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1215 // Try swapping the operands 1216 ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode); 1217 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { 1218 std::swap(LHS, RHS); 1219 CC = DAG.getCondCode(CCSwapped); 1220 } else { 1221 // Try inverting the conditon and then swapping the operands 1222 ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger()); 1223 CCSwapped = ISD::getSetCCSwappedOperands(CCInv); 1224 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { 1225 std::swap(True, False); 1226 std::swap(LHS, RHS); 1227 CC = DAG.getCondCode(CCSwapped); 1228 } 1229 } 1230 } 1231 if (isZero(RHS)) { 1232 SDValue Cond = LHS; 1233 SDValue Zero = RHS; 1234 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1235 if (CompareVT != VT) { 1236 // Bitcast True / False to the correct types. This will end up being 1237 // a nop, but it allows us to define only a single pattern in the 1238 // .TD files for each CND* instruction rather than having to have 1239 // one pattern for integer True/False and one for fp True/False 1240 True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); 1241 False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); 1242 } 1243 1244 switch (CCOpcode) { 1245 case ISD::SETONE: 1246 case ISD::SETUNE: 1247 case ISD::SETNE: 1248 CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); 1249 Temp = True; 1250 True = False; 1251 False = Temp; 1252 break; 1253 default: 1254 break; 1255 } 1256 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, 1257 Cond, Zero, 1258 True, False, 1259 DAG.getCondCode(CCOpcode)); 1260 return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); 1261 } 1262 1263 // If we make it this for it means we have no native instructions to handle 1264 // this SELECT_CC, so we must lower it. 
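  //
  // The lowering below emits two SELECT_CC nodes that the hardware does
  // support: a compare producing the hardware boolean,
  //   Cond = (select_cc CompareVT:lhs, rhs, HWTrue, HWFalse, cc)
  // followed by a selection keyed on that boolean,
  //   (select_cc Cond, HWFalse, True, False, setne)
  // where HWTrue/HWFalse are -1/0 for i32 compares and 1.0f/0.0f for f32.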
1265 SDValue HWTrue, HWFalse; 1266 1267 if (CompareVT == MVT::f32) { 1268 HWTrue = DAG.getConstantFP(1.0f, CompareVT); 1269 HWFalse = DAG.getConstantFP(0.0f, CompareVT); 1270 } else if (CompareVT == MVT::i32) { 1271 HWTrue = DAG.getConstant(-1, CompareVT); 1272 HWFalse = DAG.getConstant(0, CompareVT); 1273 } 1274 else { 1275 llvm_unreachable("Unhandled value type in LowerSELECT_CC"); 1276 } 1277 1278 // Lower this unsupported SELECT_CC into a combination of two supported 1279 // SELECT_CC operations. 1280 SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); 1281 1282 return DAG.getNode(ISD::SELECT_CC, DL, VT, 1283 Cond, HWFalse, 1284 True, False, 1285 DAG.getCondCode(ISD::SETNE)); 1286 } 1287 1288 /// LLVM generates byte-addressed pointers. For indirect addressing, we need to 1289 /// convert these pointers to a register index. Each register holds 1290 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the 1291 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used 1292 /// for indirect addressing. 1293 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, 1294 unsigned StackWidth, 1295 SelectionDAG &DAG) const { 1296 unsigned SRLPad; 1297 switch(StackWidth) { 1298 case 1: 1299 SRLPad = 2; 1300 break; 1301 case 2: 1302 SRLPad = 3; 1303 break; 1304 case 4: 1305 SRLPad = 4; 1306 break; 1307 default: llvm_unreachable("Invalid stack width"); 1308 } 1309 1310 return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr, 1311 DAG.getConstant(SRLPad, MVT::i32)); 1312 } 1313 1314 void R600TargetLowering::getStackAddress(unsigned StackWidth, 1315 unsigned ElemIdx, 1316 unsigned &Channel, 1317 unsigned &PtrIncr) const { 1318 switch (StackWidth) { 1319 default: 1320 case 1: 1321 Channel = 0; 1322 if (ElemIdx > 0) { 1323 PtrIncr = 1; 1324 } else { 1325 PtrIncr = 0; 1326 } 1327 break; 1328 case 2: 1329 Channel = ElemIdx % 2; 1330 if (ElemIdx == 2) { 1331 PtrIncr = 1; 1332 } else { 1333 PtrIncr = 0; 1334 } 1335 break; 1336 case 4: 1337 Channel = ElemIdx; 1338 PtrIncr = 0; 1339 break; 1340 } 1341 } 1342 1343 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1344 SDLoc DL(Op); 1345 StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 1346 SDValue Chain = Op.getOperand(0); 1347 SDValue Value = Op.getOperand(1); 1348 SDValue Ptr = Op.getOperand(2); 1349 1350 SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 1351 if (Result.getNode()) { 1352 return Result; 1353 } 1354 1355 if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { 1356 if (StoreNode->isTruncatingStore()) { 1357 EVT VT = Value.getValueType(); 1358 assert(VT.bitsLE(MVT::i32)); 1359 EVT MemVT = StoreNode->getMemoryVT(); 1360 SDValue MaskConstant; 1361 if (MemVT == MVT::i8) { 1362 MaskConstant = DAG.getConstant(0xFF, MVT::i32); 1363 } else { 1364 assert(MemVT == MVT::i16); 1365 MaskConstant = DAG.getConstant(0xFFFF, MVT::i32); 1366 } 1367 SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, 1368 DAG.getConstant(2, MVT::i32)); 1369 SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, 1370 DAG.getConstant(0x00000003, VT)); 1371 SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); 1372 SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, 1373 DAG.getConstant(3, VT)); 1374 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); 1375 SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); 1376 // XXX: If we add a 64-bit ZW register class, then we 
could use a 2 x i32 1377 // vector instead. 1378 SDValue Src[4] = { 1379 ShiftedValue, 1380 DAG.getConstant(0, MVT::i32), 1381 DAG.getConstant(0, MVT::i32), 1382 Mask 1383 }; 1384 SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); 1385 SDValue Args[3] = { Chain, Input, DWordAddr }; 1386 return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, 1387 Op->getVTList(), Args, MemVT, 1388 StoreNode->getMemOperand()); 1389 } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && 1390 Value.getValueType().bitsGE(MVT::i32)) { 1391 // Convert pointer from byte address to dword address. 1392 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), 1393 DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), 1394 Ptr, DAG.getConstant(2, MVT::i32))); 1395 1396 if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { 1397 llvm_unreachable("Truncated and indexed stores not supported yet"); 1398 } else { 1399 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); 1400 } 1401 return Chain; 1402 } 1403 } 1404 1405 EVT ValueVT = Value.getValueType(); 1406 1407 if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { 1408 return SDValue(); 1409 } 1410 1411 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 1412 if (Ret.getNode()) { 1413 return Ret; 1414 } 1415 // Lowering for indirect addressing 1416 1417 const MachineFunction &MF = DAG.getMachineFunction(); 1418 const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( 1419 getTargetMachine().getFrameLowering()); 1420 unsigned StackWidth = TFL->getStackWidth(MF); 1421 1422 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); 1423 1424 if (ValueVT.isVector()) { 1425 unsigned NumElemVT = ValueVT.getVectorNumElements(); 1426 EVT ElemVT = ValueVT.getVectorElementType(); 1427 SmallVector<SDValue, 4> Stores(NumElemVT); 1428 1429 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " 1430 "vector width in load"); 1431 1432 for (unsigned i = 0; i < NumElemVT; ++i) { 1433 unsigned Channel, PtrIncr; 1434 getStackAddress(StackWidth, i, Channel, PtrIncr); 1435 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, 1436 DAG.getConstant(PtrIncr, MVT::i32)); 1437 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, 1438 Value, DAG.getConstant(i, MVT::i32)); 1439 1440 Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 1441 Chain, Elem, Ptr, 1442 DAG.getTargetConstant(Channel, MVT::i32)); 1443 } 1444 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 1445 } else { 1446 if (ValueVT == MVT::i8) { 1447 Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); 1448 } 1449 Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, 1450 DAG.getTargetConstant(0, MVT::i32)); // Channel 1451 } 1452 1453 return Chain; 1454 } 1455 1456 // return (512 + (kc_bank << 12) 1457 static int 1458 ConstantAddressBlock(unsigned AddressSpace) { 1459 switch (AddressSpace) { 1460 case AMDGPUAS::CONSTANT_BUFFER_0: 1461 return 512; 1462 case AMDGPUAS::CONSTANT_BUFFER_1: 1463 return 512 + 4096; 1464 case AMDGPUAS::CONSTANT_BUFFER_2: 1465 return 512 + 4096 * 2; 1466 case AMDGPUAS::CONSTANT_BUFFER_3: 1467 return 512 + 4096 * 3; 1468 case AMDGPUAS::CONSTANT_BUFFER_4: 1469 return 512 + 4096 * 4; 1470 case AMDGPUAS::CONSTANT_BUFFER_5: 1471 return 512 + 4096 * 5; 1472 case AMDGPUAS::CONSTANT_BUFFER_6: 1473 return 512 + 4096 * 6; 1474 case AMDGPUAS::CONSTANT_BUFFER_7: 1475 return 512 + 4096 * 7; 1476 case AMDGPUAS::CONSTANT_BUFFER_8: 1477 return 512 + 4096 * 8; 1478 case 
AMDGPUAS::CONSTANT_BUFFER_9: 1479 return 512 + 4096 * 9; 1480 case AMDGPUAS::CONSTANT_BUFFER_10: 1481 return 512 + 4096 * 10; 1482 case AMDGPUAS::CONSTANT_BUFFER_11: 1483 return 512 + 4096 * 11; 1484 case AMDGPUAS::CONSTANT_BUFFER_12: 1485 return 512 + 4096 * 12; 1486 case AMDGPUAS::CONSTANT_BUFFER_13: 1487 return 512 + 4096 * 13; 1488 case AMDGPUAS::CONSTANT_BUFFER_14: 1489 return 512 + 4096 * 14; 1490 case AMDGPUAS::CONSTANT_BUFFER_15: 1491 return 512 + 4096 * 15; 1492 default: 1493 return -1; 1494 } 1495 } 1496 1497 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const 1498 { 1499 EVT VT = Op.getValueType(); 1500 SDLoc DL(Op); 1501 LoadSDNode *LoadNode = cast<LoadSDNode>(Op); 1502 SDValue Chain = Op.getOperand(0); 1503 SDValue Ptr = Op.getOperand(1); 1504 SDValue LoweredLoad; 1505 1506 SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); 1507 if (Ret.getNode()) { 1508 SDValue Ops[2] = { 1509 Ret, 1510 Chain 1511 }; 1512 return DAG.getMergeValues(Ops, DL); 1513 } 1514 1515 1516 if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { 1517 SDValue MergedValues[2] = { 1518 SplitVectorLoad(Op, DAG), 1519 Chain 1520 }; 1521 return DAG.getMergeValues(MergedValues, DL); 1522 } 1523 1524 int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); 1525 if (ConstantBlock > -1 && 1526 ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || 1527 (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { 1528 SDValue Result; 1529 if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) || 1530 isa<Constant>(LoadNode->getMemOperand()->getValue()) || 1531 isa<ConstantSDNode>(Ptr)) { 1532 SDValue Slots[4]; 1533 for (unsigned i = 0; i < 4; i++) { 1534 // We want Const position encoded with the following formula : 1535 // (((512 + (kc_bank << 12) + const_index) << 2) + chan) 1536 // const_index is Ptr computed by llvm using an alignment of 16. 1537 // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and 1538 // then div by 4 at the ISel step 1539 SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 1540 DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32)); 1541 Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); 1542 } 1543 EVT NewVT = MVT::v4i32; 1544 unsigned NumElements = 4; 1545 if (VT.isVector()) { 1546 NewVT = VT; 1547 NumElements = VT.getVectorNumElements(); 1548 } 1549 Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, 1550 makeArrayRef(Slots, NumElements)); 1551 } else { 1552 // non-constant ptr can't be folded, keeps it as a v4f32 load 1553 Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, 1554 DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)), 1555 DAG.getConstant(LoadNode->getAddressSpace() - 1556 AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32) 1557 ); 1558 } 1559 1560 if (!VT.isVector()) { 1561 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, 1562 DAG.getConstant(0, MVT::i32)); 1563 } 1564 1565 SDValue MergedValues[2] = { 1566 Result, 1567 Chain 1568 }; 1569 return DAG.getMergeValues(MergedValues, DL); 1570 } 1571 1572 // For most operations returning SDValue() will result in the node being 1573 // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we 1574 // need to manually expand loads that may be legal in some address spaces and 1575 // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for 1576 // compute shaders, since the data is sign extended when it is uploaded to the 1577 // buffer. 
However SEXT loads from other address spaces are not supported, so 1578 // we need to expand them here. 1579 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { 1580 EVT MemVT = LoadNode->getMemoryVT(); 1581 assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); 1582 SDValue ShiftAmount = 1583 DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32); 1584 SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, 1585 LoadNode->getPointerInfo(), MemVT, 1586 LoadNode->isVolatile(), 1587 LoadNode->isNonTemporal(), 1588 LoadNode->getAlignment()); 1589 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount); 1590 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount); 1591 1592 SDValue MergedValues[2] = { Sra, Chain }; 1593 return DAG.getMergeValues(MergedValues, DL); 1594 } 1595 1596 if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { 1597 return SDValue(); 1598 } 1599 1600 // Lowering for indirect addressing 1601 const MachineFunction &MF = DAG.getMachineFunction(); 1602 const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( 1603 getTargetMachine().getFrameLowering()); 1604 unsigned StackWidth = TFL->getStackWidth(MF); 1605 1606 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); 1607 1608 if (VT.isVector()) { 1609 unsigned NumElemVT = VT.getVectorNumElements(); 1610 EVT ElemVT = VT.getVectorElementType(); 1611 SDValue Loads[4]; 1612 1613 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " 1614 "vector width in load"); 1615 1616 for (unsigned i = 0; i < NumElemVT; ++i) { 1617 unsigned Channel, PtrIncr; 1618 getStackAddress(StackWidth, i, Channel, PtrIncr); 1619 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, 1620 DAG.getConstant(PtrIncr, MVT::i32)); 1621 Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, 1622 Chain, Ptr, 1623 DAG.getTargetConstant(Channel, MVT::i32), 1624 Op.getOperand(2)); 1625 } 1626 for (unsigned i = NumElemVT; i < 4; ++i) { 1627 Loads[i] = DAG.getUNDEF(ElemVT); 1628 } 1629 EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); 1630 LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); 1631 } else { 1632 LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, 1633 Chain, Ptr, 1634 DAG.getTargetConstant(0, MVT::i32), // Channel 1635 Op.getOperand(2)); 1636 } 1637 1638 SDValue Ops[2] = { 1639 LoweredLoad, 1640 Chain 1641 }; 1642 1643 return DAG.getMergeValues(Ops, DL); 1644 } 1645 1646 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 1647 SDValue Chain = Op.getOperand(0); 1648 SDValue Cond = Op.getOperand(1); 1649 SDValue Jump = Op.getOperand(2); 1650 1651 return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), 1652 Chain, Jump, Cond); 1653 } 1654 1655 /// XXX Only kernel functions are supported, so we can assume for now that 1656 /// every function is a kernel function, but in the future we should use 1657 /// separate calling conventions for kernel and non-kernel functions. 
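/// For compute shaders the formal arguments are not passed in registers:
/// each one is loaded from CONSTANT_BUFFER_0 at byte offset
/// 36 + LocMemOffset, because the first 36 bytes of that buffer hold the
/// thread-group and global-size information.  Inputs of the other shader
/// types are live-in R600_Reg128 registers.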

SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Jump  = Op.getOperand(2);

  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
                     Chain, Jump, Cond);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT VT = Ins[i].VT;
    EVT MemVT = LocalIns[i].VT;

    if (ShaderType != ShaderType::COMPUTE) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // for <1 x i64>.

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.

    // FIXME: This should really check the extload type, but the handling of
    // extload vector parameters seems to be broken.
    //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
    ISD::LoadExtType Ext = ISD::SEXTLOAD;
    SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 MemVT, false, false, 4);

    // 4 is the preferred alignment for the CONSTANT memory space.
    InVals.push_back(Arg);
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   if (!VT.isVector())
     return MVT::i32;
   return VT.changeVectorElementTypeToInteger();
}
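
// The two helpers below rewrite a BUILD_VECTOR that feeds an export or a
// texture fetch together with its swizzle operands. As the code indicates,
// swizzle selects 0-3 pick the corresponding vector component, while
// 4 (SEL_0), 5 (SEL_1) and 7 (SEL_MASK_WRITE) read the constants 0.0 and 1.0
// or mask the write entirely, letting the vector element itself become undef.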

static SDValue CompactSwizzlableVector(
  SelectionDAG &DAG, SDValue VectorEntry,
  DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // We mask the write here to teach later passes that the ith element of
      // this vector is undef. Thus we can use it to reduce 128-bit register
      // usage, break false dependencies and additionally make the assembly
      // easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}

SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}
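
// For illustration (hypothetical input, following the helpers above): an
// export of BUILD_VECTOR (x, undef, 0.0f, 1.0f) with the identity swizzle
// (0, 1, 2, 3) comes back from CompactSwizzlableVector with the constant
// elements turned into undef and the swizzle remapped to (0, 7, 4, 5), i.e.
// only the x channel still reads a register.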

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32), // False
                       SelectCC.getOperand(4)); // CC

    break;
  }

  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  // => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can
    // essentially be converted to a BUILD_VECTOR). Fill in the Ops vector
    // with the vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // Try common optimizations
    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    if (Ret.getNode())
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs);
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst
            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue =
            FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
      assert(C);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops;
  for (const SDUse &I : Node->ops())
    Ops.push_back(I);

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ?
          Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
        AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    std::vector<SDValue> Ops;
    unsigned NumOp = Src.getNumOperands();
    for (unsigned i = 0; i < NumOp; ++i)
      Ops.push_back(Src.getOperand(i));
    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
        Node->getVTList(), Ops);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}
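
// Note on the folding above (a summary of FoldOperand, not additional
// behaviour): an FNEG_R600 or FABS_R600 feeding a source operand is removed
// by pointing the operand at its input and setting the matching _neg/_abs
// modifier to 1, and a MOV_IMM_F32 of 0.0, 0.5 or 1.0 is replaced by a read
// of the inline constant registers ZERO, HALF or ONE instead of consuming
// the single ALU_LITERAL_X immediate slot.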