1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for R600 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "R600ISelLowering.h" 16 #include "AMDGPUFrameLowering.h" 17 #include "AMDGPUIntrinsicInfo.h" 18 #include "AMDGPUSubtarget.h" 19 #include "R600Defines.h" 20 #include "R600InstrInfo.h" 21 #include "R600MachineFunctionInfo.h" 22 #include "llvm/Analysis/ValueTracking.h" 23 #include "llvm/CodeGen/CallingConvLower.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineRegisterInfo.h" 27 #include "llvm/CodeGen/SelectionDAG.h" 28 #include "llvm/IR/Argument.h" 29 #include "llvm/IR/Function.h" 30 31 using namespace llvm; 32 33 R600TargetLowering::R600TargetLowering(const TargetMachine &TM, 34 const R600Subtarget &STI) 35 : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { 36 addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); 37 addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); 38 addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); 39 addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); 40 addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); 41 addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); 42 43 computeRegisterProperties(STI.getRegisterInfo()); 44 45 // Legalize loads and stores to the private address space. 46 setOperationAction(ISD::LOAD, MVT::i32, Custom); 47 setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 48 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 49 50 // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address 51 // spaces, so it is custom lowered to handle those where it isn't. 52 for (MVT VT : MVT::integer_valuetypes()) { 53 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 54 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); 55 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); 56 57 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 58 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); 59 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); 60 61 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 62 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); 63 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); 64 } 65 66 // Workaround for LegalizeDAG asserting on expansion of i1 vector loads. 67 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand); 68 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); 69 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); 70 71 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand); 72 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); 73 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); 74 75 76 setOperationAction(ISD::STORE, MVT::i8, Custom); 77 setOperationAction(ISD::STORE, MVT::i32, Custom); 78 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 79 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 80 81 setTruncStoreAction(MVT::i32, MVT::i8, Custom); 82 setTruncStoreAction(MVT::i32, MVT::i16, Custom); 83 84 // Workaround for LegalizeDAG asserting on expansion of i1 vector stores. 
85 setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand); 86 setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand); 87 88 // Set condition code actions 89 setCondCodeAction(ISD::SETO, MVT::f32, Expand); 90 setCondCodeAction(ISD::SETUO, MVT::f32, Expand); 91 setCondCodeAction(ISD::SETLT, MVT::f32, Expand); 92 setCondCodeAction(ISD::SETLE, MVT::f32, Expand); 93 setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); 94 setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); 95 setCondCodeAction(ISD::SETONE, MVT::f32, Expand); 96 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); 97 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); 98 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); 99 setCondCodeAction(ISD::SETULT, MVT::f32, Expand); 100 setCondCodeAction(ISD::SETULE, MVT::f32, Expand); 101 102 setCondCodeAction(ISD::SETLE, MVT::i32, Expand); 103 setCondCodeAction(ISD::SETLT, MVT::i32, Expand); 104 setCondCodeAction(ISD::SETULE, MVT::i32, Expand); 105 setCondCodeAction(ISD::SETULT, MVT::i32, Expand); 106 107 setOperationAction(ISD::FCOS, MVT::f32, Custom); 108 setOperationAction(ISD::FSIN, MVT::f32, Custom); 109 110 setOperationAction(ISD::SETCC, MVT::v4i32, Expand); 111 setOperationAction(ISD::SETCC, MVT::v2i32, Expand); 112 113 setOperationAction(ISD::BR_CC, MVT::i32, Expand); 114 setOperationAction(ISD::BR_CC, MVT::f32, Expand); 115 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 116 117 setOperationAction(ISD::FSUB, MVT::f32, Expand); 118 119 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 120 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 121 122 setOperationAction(ISD::SETCC, MVT::i32, Expand); 123 setOperationAction(ISD::SETCC, MVT::f32, Expand); 124 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); 125 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 126 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 127 128 setOperationAction(ISD::SELECT, MVT::i32, Expand); 129 setOperationAction(ISD::SELECT, MVT::f32, Expand); 130 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 131 setOperationAction(ISD::SELECT, MVT::v4i32, Expand); 132 133 // ADD, SUB overflow. 134 // TODO: turn these into Legal? 
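  // When the subtarget provides CARRY/BORROW instructions, UADDO/USUBO are
  // custom lowered (see LowerUADDSUBO) to a plain ADD/SUB paired with an
  // AMDGPUISD::CARRY/BORROW node for the overflow bit.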
  if (Subtarget->hasCARRY())
    setOperationAction(ISD::UADDO, MVT::i32, Custom);

  if (Subtarget->hasBORROW())
    setOperationAction(ISD::USUBO, MVT::i32, Custom);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  setSchedulingPreference(Sched::Source);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
}

const R600Subtarget *R600TargetLowering::getSubtarget() const {
  return static_cast<const R600Subtarget *>(Subtarget);
}

static inline bool isEOP(MachineBasicBlock::iterator I) {
  return std::next(I)->getOpcode() == AMDGPU::RETURN;
}

MachineBasicBlock *
R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = MI;
  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();

  switch (MI.getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
222 if (TII->isLDSRetInstr(MI.getOpcode())) { 223 int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); 224 assert(DstIdx != -1); 225 MachineInstrBuilder NewMI; 226 // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add 227 // LDS_1A2D support and remove this special case. 228 if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || 229 MI.getOpcode() == AMDGPU::LDS_CMPST_RET) 230 return BB; 231 232 NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), 233 TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode()))); 234 for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { 235 NewMI.addOperand(MI.getOperand(i)); 236 } 237 } else { 238 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 239 } 240 break; 241 case AMDGPU::CLAMP_R600: { 242 MachineInstr *NewMI = TII->buildDefaultInstruction( 243 *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), 244 MI.getOperand(1).getReg()); 245 TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP); 246 break; 247 } 248 249 case AMDGPU::FABS_R600: { 250 MachineInstr *NewMI = TII->buildDefaultInstruction( 251 *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), 252 MI.getOperand(1).getReg()); 253 TII->addFlag(*NewMI, 0, MO_FLAG_ABS); 254 break; 255 } 256 257 case AMDGPU::FNEG_R600: { 258 MachineInstr *NewMI = TII->buildDefaultInstruction( 259 *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), 260 MI.getOperand(1).getReg()); 261 TII->addFlag(*NewMI, 0, MO_FLAG_NEG); 262 break; 263 } 264 265 case AMDGPU::MASK_WRITE: { 266 unsigned maskedRegister = MI.getOperand(0).getReg(); 267 assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); 268 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); 269 TII->addFlag(*defInstr, 0, MO_FLAG_MASK); 270 break; 271 } 272 273 case AMDGPU::MOV_IMM_F32: 274 TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) 275 .getFPImm() 276 ->getValueAPF() 277 .bitcastToAPInt() 278 .getZExtValue()); 279 break; 280 case AMDGPU::MOV_IMM_I32: 281 TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), 282 MI.getOperand(1).getImm()); 283 break; 284 case AMDGPU::MOV_IMM_GLOBAL_ADDR: { 285 //TODO: Perhaps combine this instruction with the next if possible 286 auto MIB = TII->buildDefaultInstruction( 287 *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X); 288 int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal); 289 //TODO: Ugh this is rather ugly 290 MIB->getOperand(Idx) = MI.getOperand(1); 291 break; 292 } 293 case AMDGPU::CONST_COPY: { 294 MachineInstr *NewMI = TII->buildDefaultInstruction( 295 *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); 296 TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel, 297 MI.getOperand(1).getImm()); 298 break; 299 } 300 301 case AMDGPU::RAT_WRITE_CACHELESS_32_eg: 302 case AMDGPU::RAT_WRITE_CACHELESS_64_eg: 303 case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { 304 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) 305 .addOperand(MI.getOperand(0)) 306 .addOperand(MI.getOperand(1)) 307 .addImm(isEOP(I)); // Set End of program bit 308 break; 309 } 310 case AMDGPU::RAT_STORE_TYPED_eg: { 311 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) 312 .addOperand(MI.getOperand(0)) 313 .addOperand(MI.getOperand(1)) 314 .addOperand(MI.getOperand(2)) 315 .addImm(isEOP(I)); // Set End of program bit 316 break; 317 } 318 319 case AMDGPU::TXD: { 320 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 321 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 322 MachineOperand &RID = MI.getOperand(4); 323 
MachineOperand &SID = MI.getOperand(5); 324 unsigned TextureId = MI.getOperand(6).getImm(); 325 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; 326 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; 327 328 switch (TextureId) { 329 case 5: // Rect 330 CTX = CTY = 0; 331 break; 332 case 6: // Shadow1D 333 SrcW = SrcZ; 334 break; 335 case 7: // Shadow2D 336 SrcW = SrcZ; 337 break; 338 case 8: // ShadowRect 339 CTX = CTY = 0; 340 SrcW = SrcZ; 341 break; 342 case 9: // 1DArray 343 SrcZ = SrcY; 344 CTZ = 0; 345 break; 346 case 10: // 2DArray 347 CTZ = 0; 348 break; 349 case 11: // Shadow1DArray 350 SrcZ = SrcY; 351 CTZ = 0; 352 break; 353 case 12: // Shadow2DArray 354 CTZ = 0; 355 break; 356 } 357 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), 358 T0) 359 .addOperand(MI.getOperand(3)) 360 .addImm(SrcX) 361 .addImm(SrcY) 362 .addImm(SrcZ) 363 .addImm(SrcW) 364 .addImm(0) 365 .addImm(0) 366 .addImm(0) 367 .addImm(0) 368 .addImm(1) 369 .addImm(2) 370 .addImm(3) 371 .addOperand(RID) 372 .addOperand(SID) 373 .addImm(CTX) 374 .addImm(CTY) 375 .addImm(CTZ) 376 .addImm(CTW); 377 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), 378 T1) 379 .addOperand(MI.getOperand(2)) 380 .addImm(SrcX) 381 .addImm(SrcY) 382 .addImm(SrcZ) 383 .addImm(SrcW) 384 .addImm(0) 385 .addImm(0) 386 .addImm(0) 387 .addImm(0) 388 .addImm(1) 389 .addImm(2) 390 .addImm(3) 391 .addOperand(RID) 392 .addOperand(SID) 393 .addImm(CTX) 394 .addImm(CTY) 395 .addImm(CTZ) 396 .addImm(CTW); 397 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) 398 .addOperand(MI.getOperand(0)) 399 .addOperand(MI.getOperand(1)) 400 .addImm(SrcX) 401 .addImm(SrcY) 402 .addImm(SrcZ) 403 .addImm(SrcW) 404 .addImm(0) 405 .addImm(0) 406 .addImm(0) 407 .addImm(0) 408 .addImm(1) 409 .addImm(2) 410 .addImm(3) 411 .addOperand(RID) 412 .addOperand(SID) 413 .addImm(CTX) 414 .addImm(CTY) 415 .addImm(CTZ) 416 .addImm(CTW) 417 .addReg(T0, RegState::Implicit) 418 .addReg(T1, RegState::Implicit); 419 break; 420 } 421 422 case AMDGPU::TXD_SHADOW: { 423 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 424 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 425 MachineOperand &RID = MI.getOperand(4); 426 MachineOperand &SID = MI.getOperand(5); 427 unsigned TextureId = MI.getOperand(6).getImm(); 428 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; 429 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; 430 431 switch (TextureId) { 432 case 5: // Rect 433 CTX = CTY = 0; 434 break; 435 case 6: // Shadow1D 436 SrcW = SrcZ; 437 break; 438 case 7: // Shadow2D 439 SrcW = SrcZ; 440 break; 441 case 8: // ShadowRect 442 CTX = CTY = 0; 443 SrcW = SrcZ; 444 break; 445 case 9: // 1DArray 446 SrcZ = SrcY; 447 CTZ = 0; 448 break; 449 case 10: // 2DArray 450 CTZ = 0; 451 break; 452 case 11: // Shadow1DArray 453 SrcZ = SrcY; 454 CTZ = 0; 455 break; 456 case 12: // Shadow2DArray 457 CTZ = 0; 458 break; 459 } 460 461 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), 462 T0) 463 .addOperand(MI.getOperand(3)) 464 .addImm(SrcX) 465 .addImm(SrcY) 466 .addImm(SrcZ) 467 .addImm(SrcW) 468 .addImm(0) 469 .addImm(0) 470 .addImm(0) 471 .addImm(0) 472 .addImm(1) 473 .addImm(2) 474 .addImm(3) 475 .addOperand(RID) 476 .addOperand(SID) 477 .addImm(CTX) 478 .addImm(CTY) 479 .addImm(CTZ) 480 .addImm(CTW); 481 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), 482 T1) 483 .addOperand(MI.getOperand(2)) 484 .addImm(SrcX) 485 .addImm(SrcY) 486 .addImm(SrcZ) 487 
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI.getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI.getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI.getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI.getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI.getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI.getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = isEOP(I);
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addOperand(MI.getOperand(2))
        .addOperand(MI.getOperand(3))
        .addOperand(MI.getOperand(4))
        .addOperand(MI.getOperand(5))
        .addOperand(MI.getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
596 R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); 597 MachineInstrBuilder MIB(*MF, MI); 598 for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) 599 MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); 600 return BB; 601 } 602 } 603 604 MI.eraseFromParent(); 605 return BB; 606 } 607 608 //===----------------------------------------------------------------------===// 609 // Custom DAG Lowering Operations 610 //===----------------------------------------------------------------------===// 611 612 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 613 MachineFunction &MF = DAG.getMachineFunction(); 614 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 615 switch (Op.getOpcode()) { 616 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 617 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 618 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 619 case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); 620 case ISD::SRA_PARTS: 621 case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); 622 case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); 623 case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); 624 case ISD::FCOS: 625 case ISD::FSIN: return LowerTrig(Op, DAG); 626 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 627 case ISD::STORE: return LowerSTORE(Op, DAG); 628 case ISD::LOAD: { 629 SDValue Result = LowerLOAD(Op, DAG); 630 assert((!Result.getNode() || 631 Result.getNode()->getNumValues() == 2) && 632 "Load should return a value and a chain"); 633 return Result; 634 } 635 636 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 637 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); 638 case ISD::FrameIndex: return lowerFrameIndex(Op, DAG); 639 case ISD::INTRINSIC_VOID: { 640 SDValue Chain = Op.getOperand(0); 641 unsigned IntrinsicID = 642 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 643 switch (IntrinsicID) { 644 case AMDGPUIntrinsic::R600_store_swizzle: { 645 SDLoc DL(Op); 646 const SDValue Args[8] = { 647 Chain, 648 Op.getOperand(2), // Export Value 649 Op.getOperand(3), // ArrayBase 650 Op.getOperand(4), // Type 651 DAG.getConstant(0, DL, MVT::i32), // SWZ_X 652 DAG.getConstant(1, DL, MVT::i32), // SWZ_Y 653 DAG.getConstant(2, DL, MVT::i32), // SWZ_Z 654 DAG.getConstant(3, DL, MVT::i32) // SWZ_W 655 }; 656 return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); 657 } 658 659 // default for switch(IntrinsicID) 660 default: break; 661 } 662 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) 663 break; 664 } 665 case ISD::INTRINSIC_WO_CHAIN: { 666 unsigned IntrinsicID = 667 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 668 EVT VT = Op.getValueType(); 669 SDLoc DL(Op); 670 switch(IntrinsicID) { 671 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 672 case AMDGPUIntrinsic::r600_tex: 673 case AMDGPUIntrinsic::r600_texc: 674 case AMDGPUIntrinsic::r600_txl: 675 case AMDGPUIntrinsic::r600_txlc: 676 case AMDGPUIntrinsic::r600_txb: 677 case AMDGPUIntrinsic::r600_txbc: 678 case AMDGPUIntrinsic::r600_txf: 679 case AMDGPUIntrinsic::r600_txq: 680 case AMDGPUIntrinsic::r600_ddx: 681 case AMDGPUIntrinsic::r600_ddy: { 682 unsigned TextureOp; 683 switch (IntrinsicID) { 684 case AMDGPUIntrinsic::r600_tex: 685 TextureOp = 0; 686 break; 687 case AMDGPUIntrinsic::r600_texc: 688 TextureOp = 1; 689 break; 690 case AMDGPUIntrinsic::r600_txl: 691 TextureOp = 2; 692 break; 
      case AMDGPUIntrinsic::r600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::r600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::r600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::r600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::r600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::r600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::r600_ddy:
        TextureOp = 9;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, DL, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case AMDGPUIntrinsic::r600_dot4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, DL, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_implicitarg_ptr: {
      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
      uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
      return DAG.getConstant(ByteOffset, DL, PtrVT);
    }
    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_workdim:
    case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
789 uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); 790 return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); 791 } 792 793 case Intrinsic::r600_read_tgid_x: 794 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 795 AMDGPU::T1_X, VT); 796 case Intrinsic::r600_read_tgid_y: 797 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 798 AMDGPU::T1_Y, VT); 799 case Intrinsic::r600_read_tgid_z: 800 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 801 AMDGPU::T1_Z, VT); 802 case Intrinsic::r600_read_tidig_x: 803 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 804 AMDGPU::T0_X, VT); 805 case Intrinsic::r600_read_tidig_y: 806 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 807 AMDGPU::T0_Y, VT); 808 case Intrinsic::r600_read_tidig_z: 809 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 810 AMDGPU::T0_Z, VT); 811 812 // FIXME: Should be renamed to r600 prefix 813 case AMDGPUIntrinsic::AMDGPU_rsq_clamped: 814 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 815 816 case Intrinsic::r600_rsq: 817 case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name 818 // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. 819 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 820 } 821 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) 822 break; 823 } 824 } // end switch(Op.getOpcode()) 825 return SDValue(); 826 } 827 828 void R600TargetLowering::ReplaceNodeResults(SDNode *N, 829 SmallVectorImpl<SDValue> &Results, 830 SelectionDAG &DAG) const { 831 switch (N->getOpcode()) { 832 default: 833 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); 834 return; 835 case ISD::FP_TO_UINT: 836 if (N->getValueType(0) == MVT::i1) { 837 Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); 838 return; 839 } 840 // Fall-through. Since we don't care about out of bounds values 841 // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint 842 // considers some extra cases which are not necessary here. 
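    // The non-i1 FP_TO_UINT results fall through and are expanded with
    // expandFP_TO_SINT below.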
843 case ISD::FP_TO_SINT: { 844 SDValue Result; 845 if (expandFP_TO_SINT(N, Result, DAG)) 846 Results.push_back(Result); 847 return; 848 } 849 case ISD::SDIVREM: { 850 SDValue Op = SDValue(N, 1); 851 SDValue RES = LowerSDIVREM(Op, DAG); 852 Results.push_back(RES); 853 Results.push_back(RES.getValue(1)); 854 break; 855 } 856 case ISD::UDIVREM: { 857 SDValue Op = SDValue(N, 0); 858 LowerUDIVREM64(Op, DAG, Results); 859 break; 860 } 861 } 862 } 863 864 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, 865 SDValue Vector) const { 866 867 SDLoc DL(Vector); 868 EVT VecVT = Vector.getValueType(); 869 EVT EltVT = VecVT.getVectorElementType(); 870 SmallVector<SDValue, 8> Args; 871 872 for (unsigned i = 0, e = VecVT.getVectorNumElements(); 873 i != e; ++i) { 874 Args.push_back(DAG.getNode( 875 ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, 876 DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout())))); 877 } 878 879 return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); 880 } 881 882 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 883 SelectionDAG &DAG) const { 884 885 SDLoc DL(Op); 886 SDValue Vector = Op.getOperand(0); 887 SDValue Index = Op.getOperand(1); 888 889 if (isa<ConstantSDNode>(Index) || 890 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 891 return Op; 892 893 Vector = vectorToVerticalVector(DAG, Vector); 894 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), 895 Vector, Index); 896 } 897 898 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 899 SelectionDAG &DAG) const { 900 SDLoc DL(Op); 901 SDValue Vector = Op.getOperand(0); 902 SDValue Value = Op.getOperand(1); 903 SDValue Index = Op.getOperand(2); 904 905 if (isa<ConstantSDNode>(Index) || 906 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 907 return Op; 908 909 Vector = vectorToVerticalVector(DAG, Vector); 910 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), 911 Vector, Value, Index); 912 return vectorToVerticalVector(DAG, Insert); 913 } 914 915 SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 916 SDValue Op, 917 SelectionDAG &DAG) const { 918 919 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 920 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 921 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 922 923 const DataLayout &DL = DAG.getDataLayout(); 924 const GlobalValue *GV = GSD->getGlobal(); 925 MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 926 927 SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); 928 return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); 929 } 930 931 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 932 // On hw >= R700, COS/SIN input must be between -1. and 1. 933 // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) 934 EVT VT = Op.getValueType(); 935 SDValue Arg = Op.getOperand(0); 936 SDLoc DL(Op); 937 938 // TODO: Should this propagate fast-math-flags? 
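  // 0.15915494309 is 1 / (2 * Pi): the argument is scaled from radians to
  // revolutions and biased by 0.5 so that FRACT wraps it into [0, 1); the
  // result is then recentred to [-0.5, 0.5) before being fed to the
  // COS_HW/SIN_HW node below.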
939 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 940 DAG.getNode(ISD::FADD, DL, VT, 941 DAG.getNode(ISD::FMUL, DL, VT, Arg, 942 DAG.getConstantFP(0.15915494309, DL, MVT::f32)), 943 DAG.getConstantFP(0.5, DL, MVT::f32))); 944 unsigned TrigNode; 945 switch (Op.getOpcode()) { 946 case ISD::FCOS: 947 TrigNode = AMDGPUISD::COS_HW; 948 break; 949 case ISD::FSIN: 950 TrigNode = AMDGPUISD::SIN_HW; 951 break; 952 default: 953 llvm_unreachable("Wrong trig opcode"); 954 } 955 SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, 956 DAG.getNode(ISD::FADD, DL, VT, FractPart, 957 DAG.getConstantFP(-0.5, DL, MVT::f32))); 958 if (Gen >= R600Subtarget::R700) 959 return TrigVal; 960 // On R600 hw, COS/SIN input must be between -Pi and Pi. 961 return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, 962 DAG.getConstantFP(3.14159265359, DL, MVT::f32)); 963 } 964 965 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { 966 SDLoc DL(Op); 967 EVT VT = Op.getValueType(); 968 969 SDValue Lo = Op.getOperand(0); 970 SDValue Hi = Op.getOperand(1); 971 SDValue Shift = Op.getOperand(2); 972 SDValue Zero = DAG.getConstant(0, DL, VT); 973 SDValue One = DAG.getConstant(1, DL, VT); 974 975 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); 976 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); 977 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); 978 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); 979 980 // The dance around Width1 is necessary for 0 special case. 981 // Without it the CompShift might be 32, producing incorrect results in 982 // Overflow. So we do the shift in two steps, the alternative is to 983 // add a conditional to filter the special case. 984 985 SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); 986 Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); 987 988 SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); 989 HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); 990 SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); 991 992 SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); 993 SDValue LoBig = Zero; 994 995 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); 996 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); 997 998 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); 999 } 1000 1001 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { 1002 SDLoc DL(Op); 1003 EVT VT = Op.getValueType(); 1004 1005 SDValue Lo = Op.getOperand(0); 1006 SDValue Hi = Op.getOperand(1); 1007 SDValue Shift = Op.getOperand(2); 1008 SDValue Zero = DAG.getConstant(0, DL, VT); 1009 SDValue One = DAG.getConstant(1, DL, VT); 1010 1011 const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; 1012 1013 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); 1014 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); 1015 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); 1016 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); 1017 1018 // The dance around Width1 is necessary for 0 special case. 1019 // Without it the CompShift might be 32, producing incorrect results in 1020 // Overflow. So we do the shift in two steps, the alternative is to 1021 // add a conditional to filter the special case. 1022 1023 SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); 1024 Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); 1025 1026 SDValue HiSmall = DAG.getNode(SRA ? 
ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); 1027 SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); 1028 LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); 1029 1030 SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); 1031 SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; 1032 1033 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); 1034 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); 1035 1036 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); 1037 } 1038 1039 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, 1040 unsigned mainop, unsigned ovf) const { 1041 SDLoc DL(Op); 1042 EVT VT = Op.getValueType(); 1043 1044 SDValue Lo = Op.getOperand(0); 1045 SDValue Hi = Op.getOperand(1); 1046 1047 SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); 1048 // Extend sign. 1049 OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, 1050 DAG.getValueType(MVT::i1)); 1051 1052 SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); 1053 1054 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); 1055 } 1056 1057 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { 1058 SDLoc DL(Op); 1059 return DAG.getNode( 1060 ISD::SETCC, 1061 DL, 1062 MVT::i1, 1063 Op, DAG.getConstantFP(0.0f, DL, MVT::f32), 1064 DAG.getCondCode(ISD::SETNE) 1065 ); 1066 } 1067 1068 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, 1069 const SDLoc &DL, 1070 unsigned DwordOffset) const { 1071 unsigned ByteOffset = DwordOffset * 4; 1072 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), 1073 AMDGPUAS::CONSTANT_BUFFER_0); 1074 1075 // We shouldn't be using an offset wider than 16-bits for implicit parameters. 
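  // The implicit kernel parameters (ngroups, global size, local size, grid
  // dimension) are read from small dword offsets in CONSTANT_BUFFER_0; see
  // the r600_read_* intrinsic handling in LowerOperation above.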
1076 assert(isInt<16>(ByteOffset)); 1077 1078 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 1079 DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR 1080 MachinePointerInfo(ConstantPointerNull::get(PtrType)), 1081 false, false, false, 0); 1082 } 1083 1084 bool R600TargetLowering::isZero(SDValue Op) const { 1085 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { 1086 return Cst->isNullValue(); 1087 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){ 1088 return CstFP->isZero(); 1089 } else { 1090 return false; 1091 } 1092 } 1093 1094 bool R600TargetLowering::isHWTrueValue(SDValue Op) const { 1095 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 1096 return CFP->isExactlyValue(1.0); 1097 } 1098 return isAllOnesConstant(Op); 1099 } 1100 1101 bool R600TargetLowering::isHWFalseValue(SDValue Op) const { 1102 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 1103 return CFP->getValueAPF().isZero(); 1104 } 1105 return isNullConstant(Op); 1106 } 1107 1108 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 1109 SDLoc DL(Op); 1110 EVT VT = Op.getValueType(); 1111 1112 SDValue LHS = Op.getOperand(0); 1113 SDValue RHS = Op.getOperand(1); 1114 SDValue True = Op.getOperand(2); 1115 SDValue False = Op.getOperand(3); 1116 SDValue CC = Op.getOperand(4); 1117 SDValue Temp; 1118 1119 if (VT == MVT::f32) { 1120 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); 1121 SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); 1122 if (MinMax) 1123 return MinMax; 1124 } 1125 1126 // LHS and RHS are guaranteed to be the same value type 1127 EVT CompareVT = LHS.getValueType(); 1128 1129 // Check if we can lower this to a native operation. 1130 1131 // Try to lower to a SET* instruction: 1132 // 1133 // SET* can match the following patterns: 1134 // 1135 // select_cc f32, f32, -1, 0, cc_supported 1136 // select_cc f32, f32, 1.0f, 0.0f, cc_supported 1137 // select_cc i32, i32, -1, 0, cc_supported 1138 // 1139 1140 // Move hardware True/False values to the correct operand. 1141 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1142 ISD::CondCode InverseCC = 1143 ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); 1144 if (isHWTrueValue(False) && isHWFalseValue(True)) { 1145 if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { 1146 std::swap(False, True); 1147 CC = DAG.getCondCode(InverseCC); 1148 } else { 1149 ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); 1150 if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { 1151 std::swap(False, True); 1152 std::swap(LHS, RHS); 1153 CC = DAG.getCondCode(SwapInvCC); 1154 } 1155 } 1156 } 1157 1158 if (isHWTrueValue(True) && isHWFalseValue(False) && 1159 (CompareVT == VT || VT == MVT::i32)) { 1160 // This can be matched by a SET* instruction. 
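    // e.g. (select_cc i32:a, i32:b, -1, 0, setgt) maps directly onto one of
    // the SET* ALU instructions.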
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                                     Cond, Zero,
                                     True, False,
                                     DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
1255 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, 1256 unsigned StackWidth, 1257 SelectionDAG &DAG) const { 1258 unsigned SRLPad; 1259 switch(StackWidth) { 1260 case 1: 1261 SRLPad = 2; 1262 break; 1263 case 2: 1264 SRLPad = 3; 1265 break; 1266 case 4: 1267 SRLPad = 4; 1268 break; 1269 default: llvm_unreachable("Invalid stack width"); 1270 } 1271 1272 SDLoc DL(Ptr); 1273 return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, 1274 DAG.getConstant(SRLPad, DL, MVT::i32)); 1275 } 1276 1277 void R600TargetLowering::getStackAddress(unsigned StackWidth, 1278 unsigned ElemIdx, 1279 unsigned &Channel, 1280 unsigned &PtrIncr) const { 1281 switch (StackWidth) { 1282 default: 1283 case 1: 1284 Channel = 0; 1285 if (ElemIdx > 0) { 1286 PtrIncr = 1; 1287 } else { 1288 PtrIncr = 0; 1289 } 1290 break; 1291 case 2: 1292 Channel = ElemIdx % 2; 1293 if (ElemIdx == 2) { 1294 PtrIncr = 1; 1295 } else { 1296 PtrIncr = 0; 1297 } 1298 break; 1299 case 4: 1300 Channel = ElemIdx; 1301 PtrIncr = 0; 1302 break; 1303 } 1304 } 1305 1306 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, 1307 SelectionDAG &DAG) const { 1308 SDLoc DL(Store); 1309 1310 unsigned Mask = 0; 1311 if (Store->getMemoryVT() == MVT::i8) { 1312 Mask = 0xff; 1313 } else if (Store->getMemoryVT() == MVT::i16) { 1314 Mask = 0xffff; 1315 } 1316 1317 SDValue Chain = Store->getChain(); 1318 SDValue BasePtr = Store->getBasePtr(); 1319 EVT MemVT = Store->getMemoryVT(); 1320 1321 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, 1322 DAG.getConstant(2, DL, MVT::i32)); 1323 SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, 1324 Chain, Ptr, 1325 DAG.getTargetConstant(0, DL, MVT::i32)); 1326 1327 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, 1328 DAG.getConstant(0x3, DL, MVT::i32)); 1329 1330 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 1331 DAG.getConstant(3, DL, MVT::i32)); 1332 1333 SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, 1334 Store->getValue()); 1335 1336 SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); 1337 1338 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, 1339 MaskedValue, ShiftAmt); 1340 1341 SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, 1342 DAG.getConstant(Mask, DL, MVT::i32), 1343 ShiftAmt); 1344 DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, 1345 DAG.getConstant(0xffffffff, DL, MVT::i32)); 1346 Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); 1347 1348 SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); 1349 return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 1350 Chain, Value, Ptr, 1351 DAG.getTargetConstant(0, DL, MVT::i32)); 1352 } 1353 1354 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1355 if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG)) 1356 return Result; 1357 1358 StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 1359 unsigned AS = StoreNode->getAddressSpace(); 1360 SDValue Value = StoreNode->getValue(); 1361 EVT ValueVT = Value.getValueType(); 1362 1363 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && 1364 ValueVT.isVector()) { 1365 return SplitVectorStore(Op, DAG); 1366 } 1367 1368 SDLoc DL(Op); 1369 SDValue Chain = StoreNode->getChain(); 1370 SDValue Ptr = StoreNode->getBasePtr(); 1371 1372 if (AS == AMDGPUAS::GLOBAL_ADDRESS) { 1373 if (StoreNode->isTruncatingStore()) { 1374 EVT VT = Value.getValueType(); 1375 assert(VT.bitsLE(MVT::i32)); 1376 EVT MemVT = 
StoreNode->getMemoryVT(); 1377 SDValue MaskConstant; 1378 if (MemVT == MVT::i8) { 1379 MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); 1380 } else { 1381 assert(MemVT == MVT::i16); 1382 MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); 1383 } 1384 SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, 1385 DAG.getConstant(2, DL, MVT::i32)); 1386 SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, 1387 DAG.getConstant(0x00000003, DL, VT)); 1388 SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); 1389 SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, 1390 DAG.getConstant(3, DL, VT)); 1391 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); 1392 SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); 1393 // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 1394 // vector instead. 1395 SDValue Src[4] = { 1396 ShiftedValue, 1397 DAG.getConstant(0, DL, MVT::i32), 1398 DAG.getConstant(0, DL, MVT::i32), 1399 Mask 1400 }; 1401 SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src); 1402 SDValue Args[3] = { Chain, Input, DWordAddr }; 1403 return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, 1404 Op->getVTList(), Args, MemVT, 1405 StoreNode->getMemOperand()); 1406 } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && 1407 ValueVT.bitsGE(MVT::i32)) { 1408 // Convert pointer from byte address to dword address. 1409 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), 1410 DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), 1411 Ptr, DAG.getConstant(2, DL, MVT::i32))); 1412 1413 if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { 1414 llvm_unreachable("Truncated and indexed stores not supported yet"); 1415 } else { 1416 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); 1417 } 1418 return Chain; 1419 } 1420 } 1421 1422 if (AS != AMDGPUAS::PRIVATE_ADDRESS) 1423 return SDValue(); 1424 1425 EVT MemVT = StoreNode->getMemoryVT(); 1426 if (MemVT.bitsLT(MVT::i32)) 1427 return lowerPrivateTruncStore(StoreNode, DAG); 1428 1429 // Lowering for indirect addressing 1430 const MachineFunction &MF = DAG.getMachineFunction(); 1431 const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); 1432 unsigned StackWidth = TFL->getStackWidth(MF); 1433 1434 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); 1435 1436 if (ValueVT.isVector()) { 1437 unsigned NumElemVT = ValueVT.getVectorNumElements(); 1438 EVT ElemVT = ValueVT.getVectorElementType(); 1439 SmallVector<SDValue, 4> Stores(NumElemVT); 1440 1441 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " 1442 "vector width in load"); 1443 1444 for (unsigned i = 0; i < NumElemVT; ++i) { 1445 unsigned Channel, PtrIncr; 1446 getStackAddress(StackWidth, i, Channel, PtrIncr); 1447 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, 1448 DAG.getConstant(PtrIncr, DL, MVT::i32)); 1449 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, 1450 Value, DAG.getConstant(i, DL, MVT::i32)); 1451 1452 Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 1453 Chain, Elem, Ptr, 1454 DAG.getTargetConstant(Channel, DL, MVT::i32)); 1455 } 1456 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 1457 } else { 1458 if (ValueVT == MVT::i8) { 1459 Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); 1460 } 1461 Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, 1462 DAG.getTargetConstant(0, DL, MVT::i32)); // Channel 1463 } 1464 1465 return Chain; 1466 } 
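// Each constant buffer occupies a 4096-dword block in the constant cache,
// starting at index 512 + kc_bank * 4096. ConstantAddressBlock() returns that
// base index for a constant buffer address space, or -1 otherwise.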
1467 1468 // return (512 + (kc_bank << 12) 1469 static int 1470 ConstantAddressBlock(unsigned AddressSpace) { 1471 switch (AddressSpace) { 1472 case AMDGPUAS::CONSTANT_BUFFER_0: 1473 return 512; 1474 case AMDGPUAS::CONSTANT_BUFFER_1: 1475 return 512 + 4096; 1476 case AMDGPUAS::CONSTANT_BUFFER_2: 1477 return 512 + 4096 * 2; 1478 case AMDGPUAS::CONSTANT_BUFFER_3: 1479 return 512 + 4096 * 3; 1480 case AMDGPUAS::CONSTANT_BUFFER_4: 1481 return 512 + 4096 * 4; 1482 case AMDGPUAS::CONSTANT_BUFFER_5: 1483 return 512 + 4096 * 5; 1484 case AMDGPUAS::CONSTANT_BUFFER_6: 1485 return 512 + 4096 * 6; 1486 case AMDGPUAS::CONSTANT_BUFFER_7: 1487 return 512 + 4096 * 7; 1488 case AMDGPUAS::CONSTANT_BUFFER_8: 1489 return 512 + 4096 * 8; 1490 case AMDGPUAS::CONSTANT_BUFFER_9: 1491 return 512 + 4096 * 9; 1492 case AMDGPUAS::CONSTANT_BUFFER_10: 1493 return 512 + 4096 * 10; 1494 case AMDGPUAS::CONSTANT_BUFFER_11: 1495 return 512 + 4096 * 11; 1496 case AMDGPUAS::CONSTANT_BUFFER_12: 1497 return 512 + 4096 * 12; 1498 case AMDGPUAS::CONSTANT_BUFFER_13: 1499 return 512 + 4096 * 13; 1500 case AMDGPUAS::CONSTANT_BUFFER_14: 1501 return 512 + 4096 * 14; 1502 case AMDGPUAS::CONSTANT_BUFFER_15: 1503 return 512 + 4096 * 15; 1504 default: 1505 return -1; 1506 } 1507 } 1508 1509 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, 1510 SelectionDAG &DAG) const { 1511 SDLoc DL(Op); 1512 LoadSDNode *Load = cast<LoadSDNode>(Op); 1513 ISD::LoadExtType ExtType = Load->getExtensionType(); 1514 EVT MemVT = Load->getMemoryVT(); 1515 1516 // <SI && AS=PRIVATE && EXTLOAD && size < 32bit, 1517 // register (2-)byte extract. 1518 1519 // Get Register holding the target. 1520 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), 1521 DAG.getConstant(2, DL, MVT::i32)); 1522 // Load the Register. 1523 SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), 1524 Load->getChain(), 1525 Ptr, 1526 DAG.getTargetConstant(0, DL, MVT::i32), 1527 Op.getOperand(2)); 1528 1529 // Get offset within the register. 1530 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, 1531 Load->getBasePtr(), 1532 DAG.getConstant(0x3, DL, MVT::i32)); 1533 1534 // Bit offset of target byte (byteIdx * 8). 1535 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 1536 DAG.getConstant(3, DL, MVT::i32)); 1537 1538 // Shift to the right. 1539 Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); 1540 1541 // Eliminate the upper bits by setting them to ... 1542 EVT MemEltVT = MemVT.getScalarType(); 1543 1544 // ... ones. 1545 if (ExtType == ISD::SEXTLOAD) { 1546 SDValue MemEltVTNode = DAG.getValueType(MemEltVT); 1547 1548 SDValue Ops[] = { 1549 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), 1550 Load->getChain() 1551 }; 1552 1553 return DAG.getMergeValues(Ops, DL); 1554 } 1555 1556 // ... or zeros. 
1557 SDValue Ops[] = { 1558 DAG.getZeroExtendInReg(Ret, DL, MemEltVT), 1559 Load->getChain() 1560 }; 1561 1562 return DAG.getMergeValues(Ops, DL); 1563 } 1564 1565 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 1566 LoadSDNode *LoadNode = cast<LoadSDNode>(Op); 1567 unsigned AS = LoadNode->getAddressSpace(); 1568 EVT MemVT = LoadNode->getMemoryVT(); 1569 ISD::LoadExtType ExtType = LoadNode->getExtensionType(); 1570 1571 if (AS == AMDGPUAS::PRIVATE_ADDRESS && 1572 ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { 1573 return lowerPrivateExtLoad(Op, DAG); 1574 } 1575 1576 SDLoc DL(Op); 1577 EVT VT = Op.getValueType(); 1578 SDValue Chain = LoadNode->getChain(); 1579 SDValue Ptr = LoadNode->getBasePtr(); 1580 1581 if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { 1582 SDValue MergedValues[2] = { 1583 scalarizeVectorLoad(LoadNode, DAG), 1584 Chain 1585 }; 1586 return DAG.getMergeValues(MergedValues, DL); 1587 } 1588 1589 int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); 1590 if (ConstantBlock > -1 && 1591 ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || 1592 (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { 1593 SDValue Result; 1594 if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) || 1595 isa<Constant>(LoadNode->getMemOperand()->getValue()) || 1596 isa<ConstantSDNode>(Ptr)) { 1597 SDValue Slots[4]; 1598 for (unsigned i = 0; i < 4; i++) { 1599 // We want Const position encoded with the following formula : 1600 // (((512 + (kc_bank << 12) + const_index) << 2) + chan) 1601 // const_index is Ptr computed by llvm using an alignment of 16. 1602 // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and 1603 // then div by 4 at the ISel step 1604 SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 1605 DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); 1606 Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); 1607 } 1608 EVT NewVT = MVT::v4i32; 1609 unsigned NumElements = 4; 1610 if (VT.isVector()) { 1611 NewVT = VT; 1612 NumElements = VT.getVectorNumElements(); 1613 } 1614 Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); 1615 } else { 1616 // non-constant ptr can't be folded, keeps it as a v4f32 load 1617 Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, 1618 DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, 1619 DAG.getConstant(4, DL, MVT::i32)), 1620 DAG.getConstant(LoadNode->getAddressSpace() - 1621 AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) 1622 ); 1623 } 1624 1625 if (!VT.isVector()) { 1626 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, 1627 DAG.getConstant(0, DL, MVT::i32)); 1628 } 1629 1630 SDValue MergedValues[2] = { 1631 Result, 1632 Chain 1633 }; 1634 return DAG.getMergeValues(MergedValues, DL); 1635 } 1636 1637 SDValue LoweredLoad; 1638 1639 // For most operations returning SDValue() will result in the node being 1640 // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we 1641 // need to manually expand loads that may be legal in some address spaces and 1642 // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for 1643 // compute shaders, since the data is sign extended when it is uploaded to the 1644 // buffer. However SEXT loads from other address spaces are not supported, so 1645 // we need to expand them here. 
1646 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { 1647 EVT MemVT = LoadNode->getMemoryVT(); 1648 assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); 1649 SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, 1650 LoadNode->getPointerInfo(), MemVT, 1651 LoadNode->isVolatile(), 1652 LoadNode->isNonTemporal(), 1653 LoadNode->isInvariant(), 1654 LoadNode->getAlignment()); 1655 SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, 1656 DAG.getValueType(MemVT)); 1657 1658 SDValue MergedValues[2] = { Res, Chain }; 1659 return DAG.getMergeValues(MergedValues, DL); 1660 } 1661 1662 if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { 1663 return SDValue(); 1664 } 1665 1666 // Lowering for indirect addressing 1667 const MachineFunction &MF = DAG.getMachineFunction(); 1668 const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); 1669 unsigned StackWidth = TFL->getStackWidth(MF); 1670 1671 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); 1672 1673 if (VT.isVector()) { 1674 unsigned NumElemVT = VT.getVectorNumElements(); 1675 EVT ElemVT = VT.getVectorElementType(); 1676 SDValue Loads[4]; 1677 1678 assert(NumElemVT <= 4); 1679 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " 1680 "vector width in load"); 1681 1682 for (unsigned i = 0; i < NumElemVT; ++i) { 1683 unsigned Channel, PtrIncr; 1684 getStackAddress(StackWidth, i, Channel, PtrIncr); 1685 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, 1686 DAG.getConstant(PtrIncr, DL, MVT::i32)); 1687 Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, 1688 Chain, Ptr, 1689 DAG.getTargetConstant(Channel, DL, MVT::i32), 1690 Op.getOperand(2)); 1691 } 1692 EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT); 1693 LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT)); 1694 } else { 1695 LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, 1696 Chain, Ptr, 1697 DAG.getTargetConstant(0, DL, MVT::i32), // Channel 1698 Op.getOperand(2)); 1699 } 1700 1701 SDValue Ops[2] = { 1702 LoweredLoad, 1703 Chain 1704 }; 1705 1706 return DAG.getMergeValues(Ops, DL); 1707 } 1708 1709 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 1710 SDValue Chain = Op.getOperand(0); 1711 SDValue Cond = Op.getOperand(1); 1712 SDValue Jump = Op.getOperand(2); 1713 1714 return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), 1715 Chain, Jump, Cond); 1716 } 1717 1718 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, 1719 SelectionDAG &DAG) const { 1720 MachineFunction &MF = DAG.getMachineFunction(); 1721 const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); 1722 1723 FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); 1724 1725 unsigned FrameIndex = FIN->getIndex(); 1726 unsigned IgnoredFrameReg; 1727 unsigned Offset = 1728 TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); 1729 return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), 1730 Op.getValueType()); 1731 } 1732 1733 /// XXX Only kernel functions are supported, so we can assume for now that 1734 /// every function is a kernel function, but in the future we should use 1735 /// separate calling conventions for kernel and non-kernel functions. 
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    const ISD::InputArg &In = Ins[i];
    EVT VT = In.VT;
    EVT MemVT = VA.getLocVT();
    if (!VT.isVector() && MemVT.isVector()) {
      // Get load source type if scalarized.
      MemVT = MemVT.getVectorElementType();
    }

    if (AMDGPU::isShader(CallConv)) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // for <1 x i64>.

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
      // FIXME: This should really check the extload type, but the handling of
      // extload vector parameters seems to be broken.

      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      Ext = ISD::SEXTLOAD;
    }

    // Compute the offset from the value.
    // XXX - I think PartOffset should give you this, but it seems to give the
    // size of the register which isn't useful.

    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    unsigned PartOffset = VA.getLocMemOffset();
    unsigned Offset = 36 + VA.getLocMemOffset();

    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
                              DAG.getConstant(Offset, DL, MVT::i32),
                              DAG.getUNDEF(MVT::i32),
                              PtrInfo,
                              MemVT, false, true, true, 4);

    // 4 is the preferred alignment for the CONSTANT memory space.
    InVals.push_back(Arg);
    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                           EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                        unsigned AddrSpace,
                                                        unsigned Align,
                                                        bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  if (!VT.isSimple() || VT == MVT::Other)
    return false;

  if (VT.bitsLT(MVT::i32))
    return false;

  // TODO: This is a rough estimate.
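  // The heuristic below simply allows any access wider than 32 bits with at
  // least 4-byte alignment and reports it as fast; the actual hardware cost
  // is not modelled.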
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}

static SDValue CompactSwizzlableVector(
  SelectionDAG &DAG, SDValue VectorEntry,
  DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].isUndef())
      // We mask the write here to teach later passes that the ith element of
      // this vector is undef. Thus we can use it to reduce 128-bit register
      // usage, break false dependencies and additionally make the assembly
      // easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].isUndef())
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
                            NewBldVec);
}

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
                            NewBldVec);
}

SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                            SelectionDAG &DAG,
                                            const SDLoc &DL) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx =
        cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  return BuildVector;
}


//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    SDLoc dl(N);
    return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, dl, MVT::i32), // True
                       DAG.getConstant(0, dl, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
  }

  // insert_vector_elt (build_vector elt0, ..., eltN), NewEltIdx, idx
  // => build_vector elt0, ..., NewEltIdx, ..., eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.isUndef())
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.isUndef()) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
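      // If the types differ, the inserted value is widened with ANY_EXTEND or
      // narrowed with TRUNCATE below so the resulting BUILD_VECTOR stays
      // uniform; e.g. inserting an i16 into an i32-element build_vector
      // any-extends the i16 to i32 first.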
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getBuildVector(VT, dl, Ops);
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs a custom combine.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }

  case ISD::SELECT_CC: {
    // Try common optimizations
    if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    // selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    // selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
                                     SDValue &Src, SDValue &Neg, SDValue &Abs,
                                     SDValue &Sel, SDValue &Imm,
                                     SelectionDAG &DAG) const {
  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
  if (!Src.isMachineOpcode())
    return false;

  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values.
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst
            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_GLOBAL_ADDR:
    // Check if the Imm slot is used. Taken from below.
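    // A non-zero value in the Imm operand means the single literal slot of
    // this instruction is already occupied, so the global address cannot be
    // folded in as another literal.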
    if (cast<ConstantSDNode>(Imm)->getZExtValue())
      return false;
    Imm = Src.getOperand(0);
    Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
    return true;
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
      assert(C);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
  if (!Node->isMachineOpcode())
    return Node;

  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
                                      AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    SDLoc DL(Node);
    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), DL,
                              Node->getVTList(), Ops);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}