//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that AArch64 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "aarch64-isel"
#include "AArch64.h"
#include "AArch64ISelLowering.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallingConv.h"

using namespace llvm;

static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();

  if (Subtarget->isTargetLinux())
    return new AArch64LinuxTargetObjectFile();
  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  llvm_unreachable("unknown subtarget type");
}

AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)),
    Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
    RegInfo(TM.getRegisterInfo()),
    Itins(TM.getInstrItineraryData()) {

  // SIMD compares set the entire lane's bits to 1
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Scalar register <-> type mapping
  addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);

  computeRegisterProperties();

  // Some atomic operations can be folded into load-acquire or store-release
  // instructions on AArch64. It's marginally simpler to let LLVM expand
  // everything out to a barrier and then recombine the (few) barriers we can.
  setInsertFencesForAtomic(true);
  setTargetDAGCombine(ISD::ATOMIC_FENCE);
  setTargetDAGCombine(ISD::ATOMIC_STORE);

  // We combine OR nodes for bitfield and NEON BSL operations.
  setTargetDAGCombine(ISD::OR);

  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::SRA);

  // AArch64 does not have i1 loads, or much of anything for i1 really.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);

  setStackPointerRegisterToSaveRestore(AArch64::XSP);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);

  // We'll lower globals to wrappers for selection.
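  // ("Wrapper" here refers to the AArch64ISD::WrapperSmall node listed in
  // getTargetNodeName below, which keeps the small-code-model address
  // computation together until instruction selection.)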
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);

  // A64 instructions have the comparison predicate attached to the user of the
  // result, but having a separate comparison is valuable for matching.
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);

  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);

  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  // Legal floating-point operations.
  setOperationAction(ISD::FABS, MVT::f32, Legal);
  setOperationAction(ISD::FABS, MVT::f64, Legal);

  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
  setOperationAction(ISD::FCEIL, MVT::f64, Legal);

  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

  setOperationAction(ISD::FNEG, MVT::f32, Legal);
  setOperationAction(ISD::FNEG, MVT::f64, Legal);

  setOperationAction(ISD::FRINT, MVT::f32, Legal);
  setOperationAction(ISD::FRINT, MVT::f64, Legal);

  setOperationAction(ISD::FSQRT, MVT::f32, Legal);
  setOperationAction(ISD::FSQRT, MVT::f64, Legal);

  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);

  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f128, Legal);

  // Illegal floating-point operations.
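  // These are all marked Expand below, so the generic legalizer turns them
  // into library calls or longer instruction sequences instead.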
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);

  setOperationAction(ISD::FEXP, MVT::f32, Expand);
  setOperationAction(ISD::FEXP, MVT::f64, Expand);

  setOperationAction(ISD::FEXP2, MVT::f32, Expand);
  setOperationAction(ISD::FEXP2, MVT::f64, Expand);

  setOperationAction(ISD::FLOG, MVT::f32, Expand);
  setOperationAction(ISD::FLOG, MVT::f64, Expand);

  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
  setOperationAction(ISD::FLOG2, MVT::f64, Expand);

  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
  setOperationAction(ISD::FLOG10, MVT::f64, Expand);

  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);

  setOperationAction(ISD::FPOWI, MVT::f32, Expand);
  setOperationAction(ISD::FPOWI, MVT::f64, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);

  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most
  // cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, Custom);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, Custom);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, Custom);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
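  // For example, an f128 -> i32 FP_TO_SINT is registered under MVT::i32 below,
  // so the same hook sees both the trivial f32/f64 cases and the f128 case.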
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);

  // This prevents LLVM from trying to compress double constants into a
  // floating constant-pool entry and trying to load from there. It's of
  // doubtful benefit for A64: we'd need LDR followed by FCVT, I believe.
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);

  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);

  setExceptionPointerRegister(AArch64::X0);
  setExceptionSelectorRegister(AArch64::X1);
}

EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const {
  // It's reasonably important that this value matches the "natural" legal
  // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
  // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

static void getExclusiveOperation(unsigned Size, unsigned &ldrOpc,
                                  unsigned &strOpc) {
  switch (Size) {
  default: llvm_unreachable("unsupported size for atomic binary op!");
  case 1:
    ldrOpc = AArch64::LDXR_byte;
    strOpc = AArch64::STXR_byte;
    break;
  case 2:
    ldrOpc = AArch64::LDXR_hword;
    strOpc = AArch64::STXR_hword;
    break;
  case 4:
    ldrOpc = AArch64::LDXR_word;
    strOpc = AArch64::STXR_word;
    break;
  case 8:
    ldrOpc = AArch64::LDXR_dword;
    strOpc = AArch64::STXR_dword;
    break;
  }
}

MachineBasicBlock *
AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                        unsigned Size,
                                        unsigned BinOpcode) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
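  // When BinOpcode is 0 there is no arithmetic to perform: the incoming value
  // is used directly as the value to store, which gives the swap behaviour.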
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator It = BB;
  ++It;

  unsigned dest = MI->getOperand(0).getReg();
  unsigned ptr = MI->getOperand(1).getReg();
  unsigned incr = MI->getOperand(2).getReg();
  DebugLoc dl = MI->getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  unsigned ldrOpc, strOpc;
  getExclusiveOperation(Size, ldrOpc, strOpc);

  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  const TargetRegisterClass *TRC
    = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
  unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);

  // thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // loopMBB:
  //   ldxr dest, ptr
  //   <binop> scratch, dest, incr
  //   stxr stxr_status, scratch, ptr
  //   cbnz stxr_status, loopMBB
  //   fallthrough --> exitMBB
  BB = loopMBB;
  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
  if (BinOpcode) {
    // All arithmetic operations we'll be creating are designed to take an
    // extra shift or extend operand, which we can conveniently set to zero.

    // Operand order needs to go the other way for NAND.
    if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl)
      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
        .addReg(incr).addReg(dest).addImm(0);
    else
      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
        .addReg(dest).addReg(incr).addImm(0);
  }

  // From the stxr, the register is GPR32; from the cmp it's GPR32wsp
  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);

  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
    .addReg(stxr_status).addMBB(loopMBB);

  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent();   // The instruction is gone now.

  return BB;
}

MachineBasicBlock *
AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
                                              MachineBasicBlock *BB,
                                              unsigned Size,
                                              unsigned CmpOp,
                                              A64CC::CondCodes Cond) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator It = BB;
  ++It;

  unsigned dest = MI->getOperand(0).getReg();
  unsigned ptr = MI->getOperand(1).getReg();
  unsigned incr = MI->getOperand(2).getReg();
  unsigned oldval = dest;
  DebugLoc dl = MI->getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  const TargetRegisterClass *TRC, *TRCsp;
  if (Size == 8) {
    TRC = &AArch64::GPR64RegClass;
    TRCsp = &AArch64::GPR64xspRegClass;
  } else {
    TRC = &AArch64::GPR32RegClass;
    TRCsp = &AArch64::GPR32wspRegClass;
  }

  unsigned ldrOpc, strOpc;
  getExclusiveOperation(Size, ldrOpc, strOpc);

  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  unsigned scratch = MRI.createVirtualRegister(TRC);
  MRI.constrainRegClass(scratch, TRCsp);

  // thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // loopMBB:
  //   ldxr dest, ptr
  //   cmp incr, dest (, sign extend if necessary)
  //   csel scratch, dest, incr, cond
  //   stxr stxr_status, scratch, ptr
  //   cbnz stxr_status, loopMBB
  //   fallthrough --> exitMBB
  BB = loopMBB;
  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);

  // Build compare and cmov instructions.
  MRI.constrainRegClass(incr, TRCsp);
  BuildMI(BB, dl, TII->get(CmpOp))
    .addReg(incr).addReg(oldval).addImm(0);

  BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc),
          scratch)
    .addReg(oldval).addReg(incr).addImm(Cond);

  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);

  BuildMI(BB, dl, TII->get(strOpc), stxr_status)
    .addReg(scratch).addReg(ptr);
  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
    .addReg(stxr_status).addMBB(loopMBB);

  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent();   // The instruction is gone now.

  return BB;
}

MachineBasicBlock *
AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
                                         MachineBasicBlock *BB,
                                         unsigned Size) const {
  unsigned dest = MI->getOperand(0).getReg();
  unsigned ptr = MI->getOperand(1).getReg();
  unsigned oldval = MI->getOperand(2).getReg();
  unsigned newval = MI->getOperand(3).getReg();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc dl = MI->getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  const TargetRegisterClass *TRCsp;
  TRCsp = Size == 8 ?
    &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;

  unsigned ldrOpc, strOpc;
  getExclusiveOperation(Size, ldrOpc, strOpc);

  MachineFunction *MF = BB->getParent();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It; // insert the new blocks after the current block

  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loop1MBB);
  MF->insert(It, loop2MBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // thisMBB:
  //   ...
  //   fallthrough --> loop1MBB
  BB->addSuccessor(loop1MBB);

  // loop1MBB:
  //   ldxr dest, [ptr]
  //   cmp dest, oldval
  //   b.ne exitMBB
  BB = loop1MBB;
  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);

  unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
  MRI.constrainRegClass(dest, TRCsp);
  BuildMI(BB, dl, TII->get(CmpOp))
    .addReg(dest).addReg(oldval).addImm(0);
  BuildMI(BB, dl, TII->get(AArch64::Bcc))
    .addImm(A64CC::NE).addMBB(exitMBB);
  BB->addSuccessor(loop2MBB);
  BB->addSuccessor(exitMBB);

  // loop2MBB:
  //   stxr stxr_status, newval, [ptr]
  //   cbnz stxr_status, loop1MBB
  BB = loop2MBB;
  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);

  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
    .addReg(stxr_status).addMBB(loop1MBB);
  BB->addSuccessor(loop1MBB);
  BB->addSuccessor(exitMBB);

  // exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent();   // The instruction is gone now.

  return BB;
}

MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction using conditional branches
  // and loads, giving an instruction sequence like:
  //     str q0, [sp]
  //     b.ne IfTrue
  //     b Finish
  //   IfTrue:
  //     str q1, [sp]
  //   Finish:
  //     ldr q0, [sp]
  //
  // Using virtual registers would probably not be beneficial since COPY
  // instructions are expensive for f128 (there's no actual instruction to
  // implement them).
  //
  // An alternative would be to do an integer-CSEL on some address. E.g.:
  //     mov x0, sp
  //     add x1, sp, #16
  //     str q0, [x0]
  //     str q1, [x1]
  //     csel x0, x0, x1, ne
  //     ldr q0, [x0]
  //
  // It's unclear which approach is actually optimal.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  MachineFunction *MF = MBB->getParent();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction::iterator It = MBB;
  ++It;

  unsigned DestReg = MI->getOperand(0).getReg();
  unsigned IfTrueReg = MI->getOperand(1).getReg();
  unsigned IfFalseReg = MI->getOperand(2).getReg();
  unsigned CondCode = MI->getOperand(3).getImm();
  bool NZCVKilled = MI->getOperand(4).isKill();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, TrueBB);
  MF->insert(It, EndBB);

  // Transfer rest of current basic-block to EndBB
  EndBB->splice(EndBB->begin(), MBB,
                llvm::next(MachineBasicBlock::iterator(MI)),
                MBB->end());
  EndBB->transferSuccessorsAndUpdatePHIs(MBB);

  // We need somewhere to store the f128 value needed.
  int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16);

  // [... start of incoming MBB ...]
  //     str qIFFALSE, [sp]
  //     b.cc IfTrue
  //     b Done
  BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR))
    .addReg(IfFalseReg)
    .addFrameIndex(ScratchFI)
    .addImm(0);
  BuildMI(MBB, DL, TII->get(AArch64::Bcc))
    .addImm(CondCode)
    .addMBB(TrueBB);
  BuildMI(MBB, DL, TII->get(AArch64::Bimm))
    .addMBB(EndBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(EndBB);

  // IfTrue:
  //     str qIFTRUE, [sp]
  BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
    .addReg(IfTrueReg)
    .addFrameIndex(ScratchFI)
    .addImm(0);

  // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the
  // blocks.
  TrueBB->addSuccessor(EndBB);

  // Done:
  //     ldr qDEST, [sp]
  // [... rest of incoming MBB ...]
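  // If the pseudo-instruction wasn't the last user of the flags, NZCV is still
  // live on entry to EndBB, so record that explicitly.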
  if (!NZCVKilled)
    EndBB->addLiveIn(AArch64::NZCV);
  MachineInstr *StartOfEnd = EndBB->begin();
  BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
    .addFrameIndex(ScratchFI)
    .addImm(0);

  MI->eraseFromParent();
  return EndBB;
}

MachineBasicBlock *
AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                   MachineBasicBlock *MBB) const {
  switch (MI->getOpcode()) {
  default: llvm_unreachable("Unhandled instruction with custom inserter");
  case AArch64::F128CSEL:
    return EmitF128CSEL(MI, MBB);
  case AArch64::ATOMIC_LOAD_ADD_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl);
  case AArch64::ATOMIC_LOAD_ADD_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl);
  case AArch64::ATOMIC_LOAD_ADD_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl);
  case AArch64::ATOMIC_LOAD_ADD_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl);

  case AArch64::ATOMIC_LOAD_SUB_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl);
  case AArch64::ATOMIC_LOAD_SUB_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl);
  case AArch64::ATOMIC_LOAD_SUB_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl);
  case AArch64::ATOMIC_LOAD_SUB_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl);

  case AArch64::ATOMIC_LOAD_AND_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl);
  case AArch64::ATOMIC_LOAD_AND_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl);
  case AArch64::ATOMIC_LOAD_AND_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl);
  case AArch64::ATOMIC_LOAD_AND_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl);

  case AArch64::ATOMIC_LOAD_OR_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl);
  case AArch64::ATOMIC_LOAD_OR_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl);
  case AArch64::ATOMIC_LOAD_OR_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl);
  case AArch64::ATOMIC_LOAD_OR_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl);

  case AArch64::ATOMIC_LOAD_XOR_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl);
  case AArch64::ATOMIC_LOAD_XOR_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl);
  case AArch64::ATOMIC_LOAD_XOR_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl);
  case AArch64::ATOMIC_LOAD_XOR_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl);

  case AArch64::ATOMIC_LOAD_NAND_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl);
  case AArch64::ATOMIC_LOAD_NAND_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl);
  case AArch64::ATOMIC_LOAD_NAND_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl);
  case AArch64::ATOMIC_LOAD_NAND_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl);

  case AArch64::ATOMIC_LOAD_MIN_I8:
    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT);
  case AArch64::ATOMIC_LOAD_MIN_I16:
    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT);
  case AArch64::ATOMIC_LOAD_MIN_I32:
    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT);
  case AArch64::ATOMIC_LOAD_MIN_I64:
    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT);

  case AArch64::ATOMIC_LOAD_MAX_I8:
    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT);
  case AArch64::ATOMIC_LOAD_MAX_I16:
    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT);
  case AArch64::ATOMIC_LOAD_MAX_I32:
    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT);
  case AArch64::ATOMIC_LOAD_MAX_I64:
    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT);

  case AArch64::ATOMIC_LOAD_UMIN_I8:
    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI);
  case AArch64::ATOMIC_LOAD_UMIN_I16:
    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI);
  case AArch64::ATOMIC_LOAD_UMIN_I32:
    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI);
  case AArch64::ATOMIC_LOAD_UMIN_I64:
    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI);

  case AArch64::ATOMIC_LOAD_UMAX_I8:
    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO);
  case AArch64::ATOMIC_LOAD_UMAX_I16:
    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO);
  case AArch64::ATOMIC_LOAD_UMAX_I32:
    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO);
  case AArch64::ATOMIC_LOAD_UMAX_I64:
    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO);

  case AArch64::ATOMIC_SWAP_I8:
    return emitAtomicBinary(MI, MBB, 1, 0);
  case AArch64::ATOMIC_SWAP_I16:
    return emitAtomicBinary(MI, MBB, 2, 0);
  case AArch64::ATOMIC_SWAP_I32:
    return emitAtomicBinary(MI, MBB, 4, 0);
  case AArch64::ATOMIC_SWAP_I64:
    return emitAtomicBinary(MI, MBB, 8, 0);

  case AArch64::ATOMIC_CMP_SWAP_I8:
    return emitAtomicCmpSwap(MI, MBB, 1);
  case AArch64::ATOMIC_CMP_SWAP_I16:
    return emitAtomicCmpSwap(MI, MBB, 2);
  case AArch64::ATOMIC_CMP_SWAP_I32:
    return emitAtomicCmpSwap(MI, MBB, 4);
  case AArch64::ATOMIC_CMP_SWAP_I64:
    return emitAtomicCmpSwap(MI, MBB, 8);
  }
}

const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  case AArch64ISD::BR_CC: return "AArch64ISD::BR_CC";
  case AArch64ISD::Call: return "AArch64ISD::Call";
  case AArch64ISD::FPMOV: return "AArch64ISD::FPMOV";
  case AArch64ISD::GOTLoad: return "AArch64ISD::GOTLoad";
  case AArch64ISD::BFI: return "AArch64ISD::BFI";
  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
  case AArch64ISD::Ret: return "AArch64ISD::Ret";
  case AArch64ISD::SBFX: return "AArch64ISD::SBFX";
  case AArch64ISD::SELECT_CC: return "AArch64ISD::SELECT_CC";
  case AArch64ISD::SETCC: return "AArch64ISD::SETCC";
  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
  case AArch64ISD::TLSDESCCALL: return "AArch64ISD::TLSDESCCALL";
  case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall";

  default: return NULL;
  }
}

static const uint16_t AArch64FPRArgRegs[] = {
  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7
};
static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs);

static const uint16_t AArch64ArgRegs[] = {
  AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
  AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7
};
static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs);

static bool
CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                     CCValAssign::LocInfo LocInfo,
                     ISD::ArgFlagsTy ArgFlags, CCState &State) {
  // Mark all remaining general purpose registers as allocated. We don't
  // backtrack: if (for example) an i128 gets put on the stack, no subsequent
  // i64 will go in registers (C.11).
  for (unsigned i = 0; i < NumArgRegs; ++i)
    State.AllocateReg(AArch64ArgRegs[i]);

  return false;
}

#include "AArch64GenCallingConv.inc"

CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
  switch(CC) {
  default: llvm_unreachable("Unsupported calling convention");
  case CallingConv::Fast:
  case CallingConv::C:
    return CC_A64_APCS;
  }
}

void
AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                           DebugLoc DL, SDValue &Chain) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  AArch64MachineFunctionInfo *FuncInfo
    = MF.getInfo<AArch64MachineFunctionInfo>();

  SmallVector<SDValue, 8> MemOps;

  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs,
                                                         NumArgRegs);
  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs,
                                                         NumFPRArgRegs);

  unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR);
  int GPRIdx = 0;
  if (GPRSaveSize != 0) {
    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);

    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());

    for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) {
      unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
                                   MachinePointerInfo::getStack(i * 8),
                                   false, false, 0);
      MemOps.push_back(Store);
      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
                        DAG.getConstant(8, getPointerTy()));
    }
  }

  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
  int FPRIdx = 0;
  if (FPRSaveSize != 0) {
    FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);

    SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());

    for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
      unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
                                   &AArch64::FPR128RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
                                   MachinePointerInfo::getStack(i * 16),
                                   false, false, 0);
      MemOps.push_back(Store);
      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
                        DAG.getConstant(16, getPointerTy()));
    }
  }

  int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);

  FuncInfo->setVariadicStackIdx(StackIdx);
  FuncInfo->setVariadicGPRIdx(GPRIdx);
  FuncInfo->setVariadicGPRSize(GPRSaveSize);
  FuncInfo->setVariadicFPRIdx(FPRIdx);
  FuncInfo->setVariadicFPRSize(FPRSaveSize);

  if (!MemOps.empty()) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
                        MemOps.size());
  }
}

SDValue
AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
                                      CallingConv::ID CallConv, bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      DebugLoc dl, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64MachineFunctionInfo *FuncInfo
    = MF.getInfo<AArch64MachineFunctionInfo>();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));

  SmallVector<SDValue, 16> ArgValues;

  SDValue ArgValue;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    ISD::ArgFlagsTy Flags = Ins[i].Flags;

    if (Flags.isByVal()) {
      // Byval is used for small structs and HFAs in the PCS, but the system
      // should work in a non-compliant manner for larger structs.
      EVT PtrTy = getPointerTy();
      int Size = Flags.getByValSize();
      unsigned NumRegs = (Size + 7) / 8;

      unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
                                                 VA.getLocMemOffset(),
                                                 false);
      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
      InVals.push_back(FrameIdxN);

      continue;
    } else if (VA.isRegLoc()) {
      MVT RegVT = VA.getLocVT();
      const TargetRegisterClass *RC = getRegClassFor(RegVT);
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);

      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
    } else { // !VA.isRegLoc()
      assert(VA.isMemLoc());

      int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                      VA.getLocMemOffset(), true);

      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
      ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
                             MachinePointerInfo::getFixedStack(FI),
                             false, false, false, 0);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
      break;
    case CCValAssign::SExt:
    case CCValAssign::ZExt:
    case CCValAssign::AExt: {
      unsigned DestSize = VA.getValVT().getSizeInBits();
      unsigned DestSubReg;

      switch (DestSize) {
      case 8: DestSubReg = AArch64::sub_8; break;
      case 16: DestSubReg = AArch64::sub_16; break;
      case 32: DestSubReg = AArch64::sub_32; break;
      case 64: DestSubReg = AArch64::sub_64; break;
      default: llvm_unreachable("Unexpected argument promotion");
      }

      ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
                                     VA.getValVT(), ArgValue,
                                     DAG.getTargetConstant(DestSubReg, MVT::i32)),
                         0);
      break;
    }
    }

    InVals.push_back(ArgValue);
  }

  if (isVarArg)
    SaveVarArgRegisters(CCInfo, DAG, dl, Chain);

  unsigned StackArgSize = CCInfo.getNextStackOffset();
  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
    // This is a non-standard ABI so by fiat I say we're allowed to make full
    // use of the stack area to be popped, which must be aligned to 16 bytes in
    // any case:
    StackArgSize = RoundUpToAlignment(StackArgSize, 16);

    // If we're expected to restore the stack (e.g. fastcc) then we'll be
    // adding a multiple of 16.
    FuncInfo->setArgumentStackToRestore(StackArgSize);

    // This realignment carries over to the available bytes below. Our own
    // callers will guarantee the space is free by giving an aligned value to
    // CALLSEQ_START.
  }

  // Even if we're not expected to free up the space, it's useful to know how
  // much is there while considering tail calls (because we can reuse it).
  FuncInfo->setBytesInStackArgArea(StackArgSize);

  return Chain;
}

SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   DebugLoc dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), RVLocs, *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps(1, Chain);

  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    // PCS: "If the type, T, of the result of a function is such that
    // void func(T arg) would require that arg be passed as a value in a
    // register (or set of registers) according to the rules in 5.4, then the
    // result is returned in the same registers as would be used for such an
    // argument.
    //
    // Otherwise, the caller shall reserve a block of memory of sufficient
    // size and alignment to hold the result. The address of the memory block
    // shall be passed as an additional argument to the function in x8."
    //
    // This is implemented in two places. The register-return values are dealt
    // with here, more complex returns are passed as an sret parameter, which
    // means we don't have to worry about it during actual return.
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Only register-returns should be created by PCS");

    SDValue Arg = OutVals[i];

    // There's no convenient note in the ABI about this as there is for normal
    // arguments, but it says return values are passed in the same registers as
    // an argument would be. I believe that includes the comments about
    // unspecified higher bits, putting the burden of widening on the *caller*
    // for return values.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
    case CCValAssign::ZExt:
    case CCValAssign::AExt:
      // Floating-point values should only be extended when they're going into
      // memory, which can't happen here so an integer extend is acceptable.
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
                     &RetOps[0], RetOps.size());
}

SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
                                 SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  DebugLoc &dl = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  AArch64MachineFunctionInfo *FuncInfo
    = MF.getInfo<AArch64MachineFunctionInfo>();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
  bool IsSibCall = false;

  if (IsTailCall) {
    IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
                    Outs, OutVals, Ins, DAG);

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));

  // On AArch64 (and all other architectures I'm aware of) the most this has to
  // do is adjust the stack pointer.
  unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16);
  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int FPDiff = 0;

  if (IsTailCall && !IsSibCall) {
    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

    // FPDiff will be negative if this tail call requires more space than we
    // would automatically have in our incoming argument space. Positive if we
    // can actually shrink the stack.
    FPDiff = NumReusableBytes - NumBytes;

    // The stack pointer must be 16-byte aligned at all times it's used for a
    // memory operation, which in practice means at *all* times and in
    // particular across call boundaries. Therefore our own arguments started
    // at a 16-byte aligned SP and the delta applied for the tail call should
    // satisfy the same constraint.
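    // For example, a caller with 32 bytes of reusable incoming argument space
    // making a tail call that needs 16 bytes of outgoing arguments gets
    // FPDiff == 32 - 16 == 16, which preserves the 16-byte alignment.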
    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
  }

  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
                                        getPointerTy());

  SmallVector<SDValue, 8> MemOpChains;
  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    SDValue Arg = OutVals[i];

    // Callee does the actual widening, so all extensions just use an implicit
    // definition of the rest of the Loc. Aesthetically, this would be nicer as
    // an ANY_EXTEND, but that isn't valid for floating-point types and this
    // alternative works on integer types too.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
    case CCValAssign::ZExt:
    case CCValAssign::AExt: {
      unsigned SrcSize = VA.getValVT().getSizeInBits();
      unsigned SrcSubReg;

      switch (SrcSize) {
      case 8: SrcSubReg = AArch64::sub_8; break;
      case 16: SrcSubReg = AArch64::sub_16; break;
      case 32: SrcSubReg = AArch64::sub_32; break;
      case 64: SrcSubReg = AArch64::sub_64; break;
      default: llvm_unreachable("Unexpected argument promotion");
      }

      Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
                                    VA.getLocVT(),
                                    DAG.getUNDEF(VA.getLocVT()),
                                    Arg,
                                    DAG.getTargetConstant(SrcSubReg, MVT::i32)),
                    0);

      break;
    }
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.isRegLoc()) {
      // A normal register (sub-) argument. For now we just note it down
      // because we want to copy things into registers as late as possible to
      // avoid register-pressure (and possibly worse).
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      continue;
    }

    assert(VA.isMemLoc() && "unexpected argument location");

    SDValue DstAddr;
    MachinePointerInfo DstInfo;
    if (IsTailCall) {
      uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() :
                                          VA.getLocVT().getSizeInBits();
      OpSize = (OpSize + 7) / 8;
      int32_t Offset = VA.getLocMemOffset() + FPDiff;
      int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);

      DstAddr = DAG.getFrameIndex(FI, getPointerTy());
      DstInfo = MachinePointerInfo::getFixedStack(FI);

      // Make sure any stack arguments overlapping with where we're storing are
      // loaded before this eventual operation. Otherwise they'll be clobbered.
      Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
    } else {
      SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset());

      DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
      DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
    }

    if (Flags.isByVal()) {
      SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64);
      SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode,
                                  Flags.getByValAlign(),
                                  /*isVolatile = */ false,
                                  /*alwaysInline = */ false,
                                  DstInfo, MachinePointerInfo(0));
      MemOpChains.push_back(Cpy);
    } else {
      // Normal stack argument, put it where it's needed.
      SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo,
                                   false, false, 0);
      MemOpChains.push_back(Store);
    }
  }

  // The loads and stores generated above shouldn't clash with each
  // other. Combining them with this TokenFactor notes that fact for the rest
  // of the backend.
  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Most of the rest of the instructions need to be glued together; we don't
  // want assignments to actual registers used by a call to be rearranged by a
  // well-meaning scheduler.
  SDValue InFlag;

  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // The linker is responsible for inserting veneers when necessary to put a
  // function call destination in range, so we don't need to bother with a
  // wrapper here.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const char *Sym = S->getSymbol();
    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll
  // be in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);
  }

  // We produce the following DAG scheme for the actual call instruction:
  //     (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag?)
  //
  // Most arguments aren't going to be used and just keep the values live as
  // far as LLVM is concerned. It's expected to be selected as simply "bl
  // callee" (for a direct, non-tail call).
  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
  }

  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  // This is used later in codegen to constrain register-allocation.
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  // If we needed glue, put it in as the last argument.
  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  if (IsTailCall) {
    return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Now we can reclaim the stack, just as well do it before working out where
  // our return value is.
  if (!IsSibCall) {
    uint64_t CalleePopBytes
      = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0;

    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(CalleePopBytes, true),
                               InFlag);
    InFlag = Chain.getValue(1);
  }

  return LowerCallResult(Chain, InFlag, CallConv,
                         IsVarArg, Ins, dl, DAG, InVals);
}

SDValue
AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                      CallingConv::ID CallConv, bool IsVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      DebugLoc dl, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv));

  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Return values that are too big to fit into registers should use an sret
    // pointer, so this can be a lot simpler than the main argument code.
    assert(VA.isRegLoc() && "Memory locations not expected for call return");

    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                                     InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
    case CCValAssign::SExt:
    case CCValAssign::AExt:
      // Floating-point arguments only get extended/truncated if they're going
      // in memory, so using the integer operation is acceptable here.
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

bool
AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                    CallingConv::ID CalleeCC,
                                    bool IsVarArg,
                                    bool IsCalleeStructRet,
                                    bool IsCallerStructRet,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SelectionDAG& DAG) const {

  // For CallingConv::C this function knows whether the ABI needs
  // changing. That's not true for other conventions so they will have to opt
  // in manually.
  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
    return false;

  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;

  // Byval parameters hand the function a pointer directly into the stack area
  // we want to reuse during a tail call.
  // Working around this *is* possible (see X86) but less efficient and uglier
  // in LowerCall.
  for (Function::const_arg_iterator i = CallerF->arg_begin(),
         e = CallerF->arg_end(); i != e; ++i)
    if (i->hasByValAttr())
      return false;

  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Now we search for cases where we can use a tail call without changing the
  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
  // concept.

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  assert((!IsVarArg || CalleeCC == CallingConv::C)
         && "Unexpected variadic calling convention");

  if (IsVarArg && !Outs.empty()) {
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
                   getTargetMachine(), ArgLocs, *DAG.getContext());

    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
                    getTargetMachine(), RVLocs1, *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC));

    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
                    getTargetMachine(), RVLocs2, *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC));

    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // Nothing more to check if the callee is taking no arguments
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));

  const AArch64MachineFunctionInfo *FuncInfo
    = MF.getInfo<AArch64MachineFunctionInfo>();

  // If the stack arguments for this call would fit into our own save area then
  // the call can be made tail.
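  // In other words, the callee's outgoing stack arguments can be written over
  // our own incoming argument area without growing the frame.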
1494 return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); 1495 } 1496 1497 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 1498 bool TailCallOpt) const { 1499 return CallCC == CallingConv::Fast && TailCallOpt; 1500 } 1501 1502 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { 1503 return CallCC == CallingConv::Fast; 1504 } 1505 1506 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 1507 SelectionDAG &DAG, 1508 MachineFrameInfo *MFI, 1509 int ClobberedFI) const { 1510 SmallVector<SDValue, 8> ArgChains; 1511 int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); 1512 int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; 1513 1514 // Include the original chain at the beginning of the list. When this is 1515 // used by target LowerCall hooks, this helps legalize find the 1516 // CALLSEQ_BEGIN node. 1517 ArgChains.push_back(Chain); 1518 1519 // Add a chain value for each stack argument corresponding 1520 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 1521 UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) 1522 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) 1523 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 1524 if (FI->getIndex() < 0) { 1525 int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); 1526 int64_t InLastByte = InFirstByte; 1527 InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; 1528 1529 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 1530 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 1531 ArgChains.push_back(SDValue(L, 1)); 1532 } 1533 1534 // Build a tokenfactor for all the chains. 1535 return DAG.getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other, 1536 &ArgChains[0], ArgChains.size()); 1537 } 1538 1539 static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) { 1540 switch (CC) { 1541 case ISD::SETEQ: return A64CC::EQ; 1542 case ISD::SETGT: return A64CC::GT; 1543 case ISD::SETGE: return A64CC::GE; 1544 case ISD::SETLT: return A64CC::LT; 1545 case ISD::SETLE: return A64CC::LE; 1546 case ISD::SETNE: return A64CC::NE; 1547 case ISD::SETUGT: return A64CC::HI; 1548 case ISD::SETUGE: return A64CC::HS; 1549 case ISD::SETULT: return A64CC::LO; 1550 case ISD::SETULE: return A64CC::LS; 1551 default: llvm_unreachable("Unexpected condition code"); 1552 } 1553 } 1554 1555 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const { 1556 // icmp is implemented using adds/subs immediate, which take an unsigned 1557 // 12-bit immediate, optionally shifted left by 12 bits. 1558 1559 // Symmetric by using adds/subs 1560 if (Val < 0) 1561 Val = -Val; 1562 1563 return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0; 1564 } 1565 1566 SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS, 1567 ISD::CondCode CC, SDValue &A64cc, 1568 SelectionDAG &DAG, DebugLoc &dl) const { 1569 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1570 int64_t C = 0; 1571 EVT VT = RHSC->getValueType(0); 1572 bool knownInvalid = false; 1573 1574 // I'm not convinced the rest of LLVM handles these edge cases properly, but 1575 // we can at least get it right. 1576 if (isSignedIntSetCC(CC)) { 1577 C = RHSC->getSExtValue(); 1578 } else if (RHSC->getZExtValue() > INT64_MAX) { 1579 // A 64-bit constant not representable by a signed 64-bit integer is far 1580 // too big to fit into a SUBS immediate anyway. 
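      // (e.g. an unsigned RHS of 0x8000000000000000 can never be encoded as a
      // 12-bit, optionally shifted, immediate, so the +/-1 adjustments below
      // would be pointless for it.)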
1581 knownInvalid = true; 1582 } else { 1583 C = RHSC->getZExtValue(); 1584 } 1585 1586 if (!knownInvalid && !isLegalICmpImmediate(C)) { 1587 // Constant does not fit, try adjusting it by one? 1588 switch (CC) { 1589 default: break; 1590 case ISD::SETLT: 1591 case ISD::SETGE: 1592 if (isLegalICmpImmediate(C-1)) { 1593 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1594 RHS = DAG.getConstant(C-1, VT); 1595 } 1596 break; 1597 case ISD::SETULT: 1598 case ISD::SETUGE: 1599 if (isLegalICmpImmediate(C-1)) { 1600 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1601 RHS = DAG.getConstant(C-1, VT); 1602 } 1603 break; 1604 case ISD::SETLE: 1605 case ISD::SETGT: 1606 if (isLegalICmpImmediate(C+1)) { 1607 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 1608 RHS = DAG.getConstant(C+1, VT); 1609 } 1610 break; 1611 case ISD::SETULE: 1612 case ISD::SETUGT: 1613 if (isLegalICmpImmediate(C+1)) { 1614 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 1615 RHS = DAG.getConstant(C+1, VT); 1616 } 1617 break; 1618 } 1619 } 1620 } 1621 1622 A64CC::CondCodes CondCode = IntCCToA64CC(CC); 1623 A64cc = DAG.getConstant(CondCode, MVT::i32); 1624 return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 1625 DAG.getCondCode(CC)); 1626 } 1627 1628 static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC, 1629 A64CC::CondCodes &Alternative) { 1630 A64CC::CondCodes CondCode = A64CC::Invalid; 1631 Alternative = A64CC::Invalid; 1632 1633 switch (CC) { 1634 default: llvm_unreachable("Unknown FP condition!"); 1635 case ISD::SETEQ: 1636 case ISD::SETOEQ: CondCode = A64CC::EQ; break; 1637 case ISD::SETGT: 1638 case ISD::SETOGT: CondCode = A64CC::GT; break; 1639 case ISD::SETGE: 1640 case ISD::SETOGE: CondCode = A64CC::GE; break; 1641 case ISD::SETOLT: CondCode = A64CC::MI; break; 1642 case ISD::SETOLE: CondCode = A64CC::LS; break; 1643 case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break; 1644 case ISD::SETO: CondCode = A64CC::VC; break; 1645 case ISD::SETUO: CondCode = A64CC::VS; break; 1646 case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break; 1647 case ISD::SETUGT: CondCode = A64CC::HI; break; 1648 case ISD::SETUGE: CondCode = A64CC::PL; break; 1649 case ISD::SETLT: 1650 case ISD::SETULT: CondCode = A64CC::LT; break; 1651 case ISD::SETLE: 1652 case ISD::SETULE: CondCode = A64CC::LE; break; 1653 case ISD::SETNE: 1654 case ISD::SETUNE: CondCode = A64CC::NE; break; 1655 } 1656 return CondCode; 1657 } 1658 1659 SDValue 1660 AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 1661 DebugLoc DL = Op.getDebugLoc(); 1662 EVT PtrVT = getPointerTy(); 1663 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 1664 1665 assert(getTargetMachine().getCodeModel() == CodeModel::Small 1666 && "Only small code model supported at the moment"); 1667 1668 // The most efficient code is PC-relative anyway for the small memory model, 1669 // so we don't need to worry about relocation model. 
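  // The WrapperSmall node built here selects to the usual small-model pair,
  // roughly (register and label purely illustrative):
  //     adrp x0, .Lblock
  //     add  x0, x0, #:lo12:.Lblock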
1670 return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 1671 DAG.getTargetBlockAddress(BA, PtrVT, 0, 1672 AArch64II::MO_NO_FLAG), 1673 DAG.getTargetBlockAddress(BA, PtrVT, 0, 1674 AArch64II::MO_LO12), 1675 DAG.getConstant(/*Alignment=*/ 4, MVT::i32)); 1676 } 1677 1678 1679 // (BRCOND chain, val, dest) 1680 SDValue 1681 AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 1682 DebugLoc dl = Op.getDebugLoc(); 1683 SDValue Chain = Op.getOperand(0); 1684 SDValue TheBit = Op.getOperand(1); 1685 SDValue DestBB = Op.getOperand(2); 1686 1687 // AArch64 BooleanContents is the default UndefinedBooleanContent, which means 1688 // that as the consumer we are responsible for ignoring rubbish in higher 1689 // bits. 1690 TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, 1691 DAG.getConstant(1, MVT::i32)); 1692 1693 SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, 1694 DAG.getConstant(0, TheBit.getValueType()), 1695 DAG.getCondCode(ISD::SETNE)); 1696 1697 return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain, 1698 A64CMP, DAG.getConstant(A64CC::NE, MVT::i32), 1699 DestBB); 1700 } 1701 1702 // (BR_CC chain, condcode, lhs, rhs, dest) 1703 SDValue 1704 AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 1705 DebugLoc dl = Op.getDebugLoc(); 1706 SDValue Chain = Op.getOperand(0); 1707 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 1708 SDValue LHS = Op.getOperand(2); 1709 SDValue RHS = Op.getOperand(3); 1710 SDValue DestBB = Op.getOperand(4); 1711 1712 if (LHS.getValueType() == MVT::f128) { 1713 // f128 comparisons are lowered to runtime calls by a routine which sets 1714 // LHS, RHS and CC appropriately for the rest of this function to continue. 1715 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 1716 1717 // If softenSetCCOperands returned a scalar, we need to compare the result 1718 // against zero to select between true and false values. 1719 if (RHS.getNode() == 0) { 1720 RHS = DAG.getConstant(0, LHS.getValueType()); 1721 CC = ISD::SETNE; 1722 } 1723 } 1724 1725 if (LHS.getValueType().isInteger()) { 1726 SDValue A64cc; 1727 1728 // Integers are handled in a separate function because the combinations of 1729 // immediates and tests can get hairy and we may want to fiddle things. 1730 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 1731 1732 return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1733 Chain, CmpOp, A64cc, DestBB); 1734 } 1735 1736 // Note that some LLVM floating-point CondCodes can't be lowered to a single 1737 // conditional branch, hence FPCCToA64CC can set a second test, where either 1738 // passing is sufficient. 
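  // For example SETONE ("ordered and not equal") becomes MI with GT as the
  // alternative: after the FCMP, the MI and GT branches together cover exactly
  // the ordered, not-equal results, while unordered and equal take neither.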
1739 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 1740 CondCode = FPCCToA64CC(CC, Alternative); 1741 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 1742 SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 1743 DAG.getCondCode(CC)); 1744 SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1745 Chain, SetCC, A64cc, DestBB); 1746 1747 if (Alternative != A64CC::Invalid) { 1748 A64cc = DAG.getConstant(Alternative, MVT::i32); 1749 A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1750 A64BR_CC, SetCC, A64cc, DestBB); 1751 1752 } 1753 1754 return A64BR_CC; 1755 } 1756 1757 SDValue 1758 AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG, 1759 RTLIB::Libcall Call) const { 1760 ArgListTy Args; 1761 ArgListEntry Entry; 1762 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { 1763 EVT ArgVT = Op.getOperand(i).getValueType(); 1764 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 1765 Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy; 1766 Entry.isSExt = false; 1767 Entry.isZExt = false; 1768 Args.push_back(Entry); 1769 } 1770 SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy()); 1771 1772 Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext()); 1773 1774 // By default, the input chain to this libcall is the entry node of the 1775 // function. If the libcall is going to be emitted as a tail call then 1776 // isUsedByReturnOnly will change it to the right chain if the return 1777 // node which is being folded has a non-entry input chain. 1778 SDValue InChain = DAG.getEntryNode(); 1779 1780 // isTailCall may be true since the callee does not reference caller stack 1781 // frame. Check if it's in the right position. 1782 SDValue TCChain = InChain; 1783 bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain); 1784 if (isTailCall) 1785 InChain = TCChain; 1786 1787 TargetLowering:: 1788 CallLoweringInfo CLI(InChain, RetTy, false, false, false, false, 1789 0, getLibcallCallingConv(Call), isTailCall, 1790 /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, 1791 Callee, Args, DAG, Op->getDebugLoc()); 1792 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 1793 1794 if (!CallInfo.second.getNode()) 1795 // It's a tailcall, return the chain (which is the DAG root). 
1796 return DAG.getRoot(); 1797 1798 return CallInfo.first; 1799 } 1800 1801 SDValue 1802 AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 1803 if (Op.getOperand(0).getValueType() != MVT::f128) { 1804 // It's legal except when f128 is involved 1805 return Op; 1806 } 1807 1808 RTLIB::Libcall LC; 1809 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 1810 1811 SDValue SrcVal = Op.getOperand(0); 1812 return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, 1813 /*isSigned*/ false, Op.getDebugLoc()); 1814 } 1815 1816 SDValue 1817 AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 1818 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 1819 1820 RTLIB::Libcall LC; 1821 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 1822 1823 return LowerF128ToCall(Op, DAG, LC); 1824 } 1825 1826 SDValue 1827 AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, 1828 bool IsSigned) const { 1829 if (Op.getOperand(0).getValueType() != MVT::f128) { 1830 // It's legal except when f128 is involved 1831 return Op; 1832 } 1833 1834 RTLIB::Libcall LC; 1835 if (IsSigned) 1836 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1837 else 1838 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1839 1840 return LowerF128ToCall(Op, DAG, LC); 1841 } 1842 1843 SDValue 1844 AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op, 1845 SelectionDAG &DAG) const { 1846 // TableGen doesn't have easy access to the CodeModel or RelocationModel, so 1847 // we make that distinction here. 1848 1849 // We support the small memory model for now. 1850 assert(getTargetMachine().getCodeModel() == CodeModel::Small); 1851 1852 EVT PtrVT = getPointerTy(); 1853 DebugLoc dl = Op.getDebugLoc(); 1854 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 1855 const GlobalValue *GV = GN->getGlobal(); 1856 unsigned Alignment = GV->getAlignment(); 1857 Reloc::Model RelocM = getTargetMachine().getRelocationModel(); 1858 if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) { 1859 // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate 1860 // to zero when they remain undefined. In PIC mode the GOT can take care of 1861 // this, but in absolute mode we use a constant pool load. 1862 SDValue PoolAddr; 1863 PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 1864 DAG.getTargetConstantPool(GV, PtrVT, 0, 0, 1865 AArch64II::MO_NO_FLAG), 1866 DAG.getTargetConstantPool(GV, PtrVT, 0, 0, 1867 AArch64II::MO_LO12), 1868 DAG.getConstant(8, MVT::i32)); 1869 SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr, 1870 MachinePointerInfo::getConstantPool(), 1871 /*isVolatile=*/ false, 1872 /*isNonTemporal=*/ true, 1873 /*isInvariant=*/ true, 8); 1874 if (GN->getOffset() != 0) 1875 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, 1876 DAG.getConstant(GN->getOffset(), PtrVT)); 1877 1878 return GlobalAddr; 1879 } 1880 1881 if (Alignment == 0) { 1882 const PointerType *GVPtrTy = cast<PointerType>(GV->getType()); 1883 if (GVPtrTy->getElementType()->isSized()) { 1884 Alignment 1885 = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType()); 1886 } else { 1887 // Be conservative if we can't guess, not that it really matters: 1888 // functions and labels aren't valid for loads, and the methods used to 1889 // actually calculate an address work with any alignment. 
1890 Alignment = 1; 1891 } 1892 } 1893 1894 unsigned char HiFixup, LoFixup; 1895 bool UseGOT = Subtarget->GVIsIndirectSymbol(GV, RelocM); 1896 1897 if (UseGOT) { 1898 HiFixup = AArch64II::MO_GOT; 1899 LoFixup = AArch64II::MO_GOT_LO12; 1900 Alignment = 8; 1901 } else { 1902 HiFixup = AArch64II::MO_NO_FLAG; 1903 LoFixup = AArch64II::MO_LO12; 1904 } 1905 1906 // AArch64's small model demands the following sequence: 1907 // ADRP x0, somewhere 1908 // ADD x0, x0, #:lo12:somewhere ; (or LDR directly). 1909 SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 1910 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 1911 HiFixup), 1912 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 1913 LoFixup), 1914 DAG.getConstant(Alignment, MVT::i32)); 1915 1916 if (UseGOT) { 1917 GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(), 1918 GlobalRef); 1919 } 1920 1921 if (GN->getOffset() != 0) 1922 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef, 1923 DAG.getConstant(GN->getOffset(), PtrVT)); 1924 1925 return GlobalRef; 1926 } 1927 1928 SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, 1929 SDValue DescAddr, 1930 DebugLoc DL, 1931 SelectionDAG &DAG) const { 1932 EVT PtrVT = getPointerTy(); 1933 1934 // The function we need to call is simply the first entry in the GOT for this 1935 // descriptor, load it in preparation. 1936 SDValue Func, Chain; 1937 Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), 1938 DescAddr); 1939 1940 // The function takes only one argument: the address of the descriptor itself 1941 // in X0. 1942 SDValue Glue; 1943 Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); 1944 Glue = Chain.getValue(1); 1945 1946 // Finally, there's a special calling-convention which means that the lookup 1947 // must preserve all registers (except X0, obviously). 1948 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 1949 const AArch64RegisterInfo *A64RI 1950 = static_cast<const AArch64RegisterInfo *>(TRI); 1951 const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask(); 1952 1953 // We're now ready to populate the argument list, as with a normal call: 1954 std::vector<SDValue> Ops; 1955 Ops.push_back(Chain); 1956 Ops.push_back(Func); 1957 Ops.push_back(SymAddr); 1958 Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); 1959 Ops.push_back(DAG.getRegisterMask(Mask)); 1960 Ops.push_back(Glue); 1961 1962 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 1963 Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0], 1964 Ops.size()); 1965 Glue = Chain.getValue(1); 1966 1967 // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it 1968 // back to the generic handling code. 
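  // Together with the DescAddr built by our callers and the GOT load of Func
  // above, the net effect is roughly the canonical small-model TLS descriptor
  // sequence (illustrative):
  //     adrp x0, :tlsdesc:var
  //     ldr  x1, [x0, #:tlsdesc_lo12:var]
  //     add  x0, x0, #:tlsdesc_lo12:var
  //     .tlsdesccall var
  //     blr  x1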
1969 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 1970 } 1971 1972 SDValue 1973 AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 1974 SelectionDAG &DAG) const { 1975 assert(Subtarget->isTargetELF() && 1976 "TLS not implemented for non-ELF targets"); 1977 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 1978 1979 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 1980 1981 SDValue TPOff; 1982 EVT PtrVT = getPointerTy(); 1983 DebugLoc DL = Op.getDebugLoc(); 1984 const GlobalValue *GV = GA->getGlobal(); 1985 1986 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 1987 1988 if (Model == TLSModel::InitialExec) { 1989 TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 1990 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 1991 AArch64II::MO_GOTTPREL), 1992 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 1993 AArch64II::MO_GOTTPREL_LO12), 1994 DAG.getConstant(8, MVT::i32)); 1995 TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), 1996 TPOff); 1997 } else if (Model == TLSModel::LocalExec) { 1998 SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 1999 AArch64II::MO_TPREL_G1); 2000 SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2001 AArch64II::MO_TPREL_G0_NC); 2002 2003 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, 2004 DAG.getTargetConstant(0, MVT::i32)), 0); 2005 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, 2006 TPOff, LoVar, 2007 DAG.getTargetConstant(0, MVT::i32)), 0); 2008 } else if (Model == TLSModel::GeneralDynamic) { 2009 // Accesses used in this sequence go via the TLS descriptor which lives in 2010 // the GOT. Prepare an address we can use to handle this. 2011 SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2012 AArch64II::MO_TLSDESC); 2013 SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2014 AArch64II::MO_TLSDESC_LO12); 2015 SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2016 HiDesc, LoDesc, 2017 DAG.getConstant(8, MVT::i32)); 2018 SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0); 2019 2020 TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); 2021 } else if (Model == TLSModel::LocalDynamic) { 2022 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS 2023 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 2024 // the beginning of the module's TLS region, followed by a DTPREL offset 2025 // calculation. 2026 2027 // These accesses will need deduplicating if there's more than one. 
2028 AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction() 2029 .getInfo<AArch64MachineFunctionInfo>(); 2030 MFI->incNumLocalDynamicTLSAccesses(); 2031 2032 2033 // Get the location of _TLS_MODULE_BASE_: 2034 SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 2035 AArch64II::MO_TLSDESC); 2036 SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 2037 AArch64II::MO_TLSDESC_LO12); 2038 SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2039 HiDesc, LoDesc, 2040 DAG.getConstant(8, MVT::i32)); 2041 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT); 2042 2043 ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); 2044 2045 // Get the variable's offset from _TLS_MODULE_BASE_ 2046 SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2047 AArch64II::MO_DTPREL_G1); 2048 SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2049 AArch64II::MO_DTPREL_G0_NC); 2050 2051 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, 2052 DAG.getTargetConstant(0, MVT::i32)), 0); 2053 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, 2054 TPOff, LoVar, 2055 DAG.getTargetConstant(0, MVT::i32)), 0); 2056 } else 2057 llvm_unreachable("Unsupported TLS access model"); 2058 2059 2060 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 2061 } 2062 2063 SDValue 2064 AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, 2065 bool IsSigned) const { 2066 if (Op.getValueType() != MVT::f128) { 2067 // Legal for everything except f128. 2068 return Op; 2069 } 2070 2071 RTLIB::Libcall LC; 2072 if (IsSigned) 2073 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2074 else 2075 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2076 2077 return LowerF128ToCall(Op, DAG, LC); 2078 } 2079 2080 2081 SDValue 2082 AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 2083 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 2084 DebugLoc dl = JT->getDebugLoc(); 2085 2086 // When compiling PIC, jump tables get put in the code section so a static 2087 // relocation-style is acceptable for both cases. 2088 return DAG.getNode(AArch64ISD::WrapperSmall, dl, getPointerTy(), 2089 DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()), 2090 DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 2091 AArch64II::MO_LO12), 2092 DAG.getConstant(1, MVT::i32)); 2093 } 2094 2095 // (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) 2096 SDValue 2097 AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 2098 DebugLoc dl = Op.getDebugLoc(); 2099 SDValue LHS = Op.getOperand(0); 2100 SDValue RHS = Op.getOperand(1); 2101 SDValue IfTrue = Op.getOperand(2); 2102 SDValue IfFalse = Op.getOperand(3); 2103 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 2104 2105 if (LHS.getValueType() == MVT::f128) { 2106 // f128 comparisons are lowered to libcalls, but slot in nicely here 2107 // afterwards. 2108 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 2109 2110 // If softenSetCCOperands returned a scalar, we need to compare the result 2111 // against zero to select between true and false values. 
2112 if (RHS.getNode() == 0) { 2113 RHS = DAG.getConstant(0, LHS.getValueType()); 2114 CC = ISD::SETNE; 2115 } 2116 } 2117 2118 if (LHS.getValueType().isInteger()) { 2119 SDValue A64cc; 2120 2121 // Integers are handled in a separate function because the combinations of 2122 // immediates and tests can get hairy and we may want to fiddle things. 2123 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 2124 2125 return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2126 CmpOp, IfTrue, IfFalse, A64cc); 2127 } 2128 2129 // Note that some LLVM floating-point CondCodes can't be lowered to a single 2130 // conditional branch, hence FPCCToA64CC can set a second test, where either 2131 // passing is sufficient. 2132 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 2133 CondCode = FPCCToA64CC(CC, Alternative); 2134 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 2135 SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 2136 DAG.getCondCode(CC)); 2137 SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, 2138 Op.getValueType(), 2139 SetCC, IfTrue, IfFalse, A64cc); 2140 2141 if (Alternative != A64CC::Invalid) { 2142 A64cc = DAG.getConstant(Alternative, MVT::i32); 2143 A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2144 SetCC, IfTrue, A64SELECT_CC, A64cc); 2145 2146 } 2147 2148 return A64SELECT_CC; 2149 } 2150 2151 // (SELECT testbit, iftrue, iffalse) 2152 SDValue 2153 AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 2154 DebugLoc dl = Op.getDebugLoc(); 2155 SDValue TheBit = Op.getOperand(0); 2156 SDValue IfTrue = Op.getOperand(1); 2157 SDValue IfFalse = Op.getOperand(2); 2158 2159 // AArch64 BooleanContents is the default UndefinedBooleanContent, which means 2160 // that as the consumer we are responsible for ignoring rubbish in higher 2161 // bits. 2162 TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, 2163 DAG.getConstant(1, MVT::i32)); 2164 SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, 2165 DAG.getConstant(0, TheBit.getValueType()), 2166 DAG.getCondCode(ISD::SETNE)); 2167 2168 return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2169 A64CMP, IfTrue, IfFalse, 2170 DAG.getConstant(A64CC::NE, MVT::i32)); 2171 } 2172 2173 // (SETCC lhs, rhs, condcode) 2174 SDValue 2175 AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 2176 DebugLoc dl = Op.getDebugLoc(); 2177 SDValue LHS = Op.getOperand(0); 2178 SDValue RHS = Op.getOperand(1); 2179 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 2180 EVT VT = Op.getValueType(); 2181 2182 if (LHS.getValueType() == MVT::f128) { 2183 // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS 2184 // for the rest of the function (some i32 or i64 values). 2185 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 2186 2187 // If softenSetCCOperands returned a scalar, use it. 2188 if (RHS.getNode() == 0) { 2189 assert(LHS.getValueType() == Op.getValueType() && 2190 "Unexpected setcc expansion!"); 2191 return LHS; 2192 } 2193 } 2194 2195 if (LHS.getValueType().isInteger()) { 2196 SDValue A64cc; 2197 2198 // Integers are handled in a separate function because the combinations of 2199 // immediates and tests can get hairy and we may want to fiddle things. 
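    // For example (illustrative): (setlt x, 0x1001) has an RHS that isn't a
    // valid ADDS/SUBS immediate, but the equivalent (setle x, 0x1000) is, so
    // getSelectableIntSetCC rewrites the comparison in that form.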
2200 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 2201 2202 return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, 2203 CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT), 2204 A64cc); 2205 } 2206 2207 // Note that some LLVM floating-point CondCodes can't be lowered to a single 2208 // conditional branch, hence FPCCToA64CC can set a second test, where either 2209 // passing is sufficient. 2210 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 2211 CondCode = FPCCToA64CC(CC, Alternative); 2212 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 2213 SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 2214 DAG.getCondCode(CC)); 2215 SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, 2216 CmpOp, DAG.getConstant(1, VT), 2217 DAG.getConstant(0, VT), A64cc); 2218 2219 if (Alternative != A64CC::Invalid) { 2220 A64cc = DAG.getConstant(Alternative, MVT::i32); 2221 A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, 2222 DAG.getConstant(1, VT), A64SELECT_CC, A64cc); 2223 } 2224 2225 return A64SELECT_CC; 2226 } 2227 2228 SDValue 2229 AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 2230 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 2231 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 2232 2233 // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes 2234 // rather than just 8. 2235 return DAG.getMemcpy(Op.getOperand(0), Op.getDebugLoc(), 2236 Op.getOperand(1), Op.getOperand(2), 2237 DAG.getConstant(32, MVT::i32), 8, false, false, 2238 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); 2239 } 2240 2241 SDValue 2242 AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2243 // The layout of the va_list struct is specified in the AArch64 Procedure Call 2244 // Standard, section B.3.
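  // For reference, that layout is:
  //   struct va_list {
  //     void *__stack;   // offset 0:  next stacked argument
  //     void *__gr_top;  // offset 8:  end of the GP register save area
  //     void *__vr_top;  // offset 16: end of the FP/SIMD register save area
  //     int   __gr_offs; // offset 24: negative offset from __gr_top to the
  //                      //            next unused GP register save slot
  //     int   __vr_offs; // offset 28: likewise for the FP/SIMD registers
  //   };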
2245 MachineFunction &MF = DAG.getMachineFunction(); 2246 AArch64MachineFunctionInfo *FuncInfo 2247 = MF.getInfo<AArch64MachineFunctionInfo>(); 2248 DebugLoc DL = Op.getDebugLoc(); 2249 2250 SDValue Chain = Op.getOperand(0); 2251 SDValue VAList = Op.getOperand(1); 2252 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2253 SmallVector<SDValue, 4> MemOps; 2254 2255 // void *__stack at offset 0 2256 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(), 2257 getPointerTy()); 2258 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 2259 MachinePointerInfo(SV), false, false, 0)); 2260 2261 // void *__gr_top at offset 8 2262 int GPRSize = FuncInfo->getVariadicGPRSize(); 2263 if (GPRSize > 0) { 2264 SDValue GRTop, GRTopAddr; 2265 2266 GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2267 DAG.getConstant(8, getPointerTy())); 2268 2269 GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy()); 2270 GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, 2271 DAG.getConstant(GPRSize, getPointerTy())); 2272 2273 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 2274 MachinePointerInfo(SV, 8), 2275 false, false, 0)); 2276 } 2277 2278 // void *__vr_top at offset 16 2279 int FPRSize = FuncInfo->getVariadicFPRSize(); 2280 if (FPRSize > 0) { 2281 SDValue VRTop, VRTopAddr; 2282 VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2283 DAG.getConstant(16, getPointerTy())); 2284 2285 VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy()); 2286 VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, 2287 DAG.getConstant(FPRSize, getPointerTy())); 2288 2289 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 2290 MachinePointerInfo(SV, 16), 2291 false, false, 0)); 2292 } 2293 2294 // int __gr_offs at offset 24 2295 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2296 DAG.getConstant(24, getPointerTy())); 2297 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), 2298 GROffsAddr, MachinePointerInfo(SV, 24), 2299 false, false, 0)); 2300 2301 // int __vr_offs at offset 28 2302 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2303 DAG.getConstant(28, getPointerTy())); 2304 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), 2305 VROffsAddr, MachinePointerInfo(SV, 28), 2306 false, false, 0)); 2307 2308 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], 2309 MemOps.size()); 2310 } 2311 2312 SDValue 2313 AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2314 switch (Op.getOpcode()) { 2315 default: llvm_unreachable("Don't know how to custom lower this!"); 2316 case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128); 2317 case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128); 2318 case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128); 2319 case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128); 2320 case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true); 2321 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false); 2322 case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true); 2323 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false); 2324 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 2325 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 2326 2327 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 2328 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 2329 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 2330 
case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG); 2331 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 2332 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 2333 case ISD::SELECT: return LowerSELECT(Op, DAG); 2334 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 2335 case ISD::SETCC: return LowerSETCC(Op, DAG); 2336 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 2337 case ISD::VASTART: return LowerVASTART(Op, DAG); 2338 } 2339 2340 return SDValue(); 2341 } 2342 2343 static SDValue PerformANDCombine(SDNode *N, 2344 TargetLowering::DAGCombinerInfo &DCI) { 2345 2346 SelectionDAG &DAG = DCI.DAG; 2347 DebugLoc DL = N->getDebugLoc(); 2348 EVT VT = N->getValueType(0); 2349 2350 // We're looking for an SRA/SHL pair which form an SBFX. 2351 2352 if (VT != MVT::i32 && VT != MVT::i64) 2353 return SDValue(); 2354 2355 if (!isa<ConstantSDNode>(N->getOperand(1))) 2356 return SDValue(); 2357 2358 uint64_t TruncMask = N->getConstantOperandVal(1); 2359 if (!isMask_64(TruncMask)) 2360 return SDValue(); 2361 2362 uint64_t Width = CountPopulation_64(TruncMask); 2363 SDValue Shift = N->getOperand(0); 2364 2365 if (Shift.getOpcode() != ISD::SRL) 2366 return SDValue(); 2367 2368 if (!isa<ConstantSDNode>(Shift->getOperand(1))) 2369 return SDValue(); 2370 uint64_t LSB = Shift->getConstantOperandVal(1); 2371 2372 if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) 2373 return SDValue(); 2374 2375 return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0), 2376 DAG.getConstant(LSB, MVT::i64), 2377 DAG.getConstant(LSB + Width - 1, MVT::i64)); 2378 } 2379 2380 static SDValue PerformATOMIC_FENCECombine(SDNode *FenceNode, 2381 TargetLowering::DAGCombinerInfo &DCI) { 2382 // An atomic operation followed by an acquiring atomic fence can be reduced to 2383 // an acquiring load. The atomic operation provides a convenient pointer to 2384 // load from. If the original operation was a load anyway we can actually 2385 // combine the two operations into an acquiring load. 2386 SelectionDAG &DAG = DCI.DAG; 2387 SDValue AtomicOp = FenceNode->getOperand(0); 2388 AtomicSDNode *AtomicNode = dyn_cast<AtomicSDNode>(AtomicOp); 2389 2390 // A fence on its own can't be optimised 2391 if (!AtomicNode) 2392 return SDValue(); 2393 2394 AtomicOrdering FenceOrder 2395 = static_cast<AtomicOrdering>(FenceNode->getConstantOperandVal(1)); 2396 SynchronizationScope FenceScope 2397 = static_cast<SynchronizationScope>(FenceNode->getConstantOperandVal(2)); 2398 2399 if (FenceOrder != Acquire || FenceScope != AtomicNode->getSynchScope()) 2400 return SDValue(); 2401 2402 // If the original operation was an ATOMIC_LOAD then we'll be replacing it, so 2403 // the chain we use should be its input, otherwise we'll put our store after 2404 // it so we use its output chain. 2405 SDValue Chain = AtomicNode->getOpcode() == ISD::ATOMIC_LOAD ? 2406 AtomicNode->getChain() : AtomicOp; 2407 2408 // We have an acquire fence with a handy atomic operation nearby, we can 2409 // convert the fence into a load-acquire, discarding the result. 
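  // i.e. (atomic-op [p]; fence acquire) becomes (atomic-op [p]; load-acquire
  // [p]) with the loaded value discarded, and if the atomic operation was
  // itself an ATOMIC_LOAD the two collapse into one load-acquire via the
  // ReplaceAllUsesWith below.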
2410 DebugLoc DL = FenceNode->getDebugLoc(); 2411 SDValue Op = DAG.getAtomic(ISD::ATOMIC_LOAD, DL, AtomicNode->getMemoryVT(), 2412 AtomicNode->getValueType(0), 2413 Chain, // Chain 2414 AtomicOp.getOperand(1), // Pointer 2415 AtomicNode->getMemOperand(), Acquire, 2416 FenceScope); 2417 2418 if (AtomicNode->getOpcode() == ISD::ATOMIC_LOAD) 2419 DAG.ReplaceAllUsesWith(AtomicNode, Op.getNode()); 2420 2421 return Op.getValue(1); 2422 } 2423 2424 static SDValue PerformATOMIC_STORECombine(SDNode *N, 2425 TargetLowering::DAGCombinerInfo &DCI) { 2426 // A releasing atomic fence followed by an atomic store can be combined into a 2427 // single store operation. 2428 SelectionDAG &DAG = DCI.DAG; 2429 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(N); 2430 SDValue FenceOp = AtomicNode->getOperand(0); 2431 2432 if (FenceOp.getOpcode() != ISD::ATOMIC_FENCE) 2433 return SDValue(); 2434 2435 AtomicOrdering FenceOrder 2436 = static_cast<AtomicOrdering>(FenceOp->getConstantOperandVal(1)); 2437 SynchronizationScope FenceScope 2438 = static_cast<SynchronizationScope>(FenceOp->getConstantOperandVal(2)); 2439 2440 if (FenceOrder != Release || FenceScope != AtomicNode->getSynchScope()) 2441 return SDValue(); 2442 2443 DebugLoc DL = AtomicNode->getDebugLoc(); 2444 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, AtomicNode->getMemoryVT(), 2445 FenceOp.getOperand(0), // Chain 2446 AtomicNode->getOperand(1), // Pointer 2447 AtomicNode->getOperand(2), // Value 2448 AtomicNode->getMemOperand(), Release, 2449 FenceScope); 2450 } 2451 2452 /// For a true bitfield insert, the bits getting into that contiguous mask 2453 /// should come from the low part of an existing value: they must be formed from 2454 /// a compatible SHL operation (unless they're already low). This function 2455 /// checks that condition and returns the least-significant bit that's 2456 /// intended. If the operation not a field preparation, -1 is returned. 2457 static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT, 2458 SDValue &MaskedVal, uint64_t Mask) { 2459 if (!isShiftedMask_64(Mask)) 2460 return -1; 2461 2462 // Now we need to alter MaskedVal so that it is an appropriate input for a BFI 2463 // instruction. BFI will do a left-shift by LSB before applying the mask we've 2464 // spotted, so in general we should pre-emptively "undo" that by making sure 2465 // the incoming bits have had a right-shift applied to them. 2466 // 2467 // This right shift, however, will combine with existing left/right shifts. In 2468 // the simplest case of a completely straight bitfield operation, it will be 2469 // expected to completely cancel out with an existing SHL. More complicated 2470 // cases (e.g. bitfield to bitfield copy) may still need a real shift before 2471 // the BFI. 
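  // For example (illustrative), with Mask == 0xff0 and MaskedVal == (shl X, #4):
  // LSB is 4, the required right-shift of 4 cancels the SHL exactly, and X
  // itself becomes the BFI input with no extra shift node.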
2472 2473 uint64_t LSB = CountTrailingZeros_64(Mask); 2474 int64_t ShiftRightRequired = LSB; 2475 if (MaskedVal.getOpcode() == ISD::SHL && 2476 isa<ConstantSDNode>(MaskedVal.getOperand(1))) { 2477 ShiftRightRequired -= MaskedVal.getConstantOperandVal(1); 2478 MaskedVal = MaskedVal.getOperand(0); 2479 } else if (MaskedVal.getOpcode() == ISD::SRL && 2480 isa<ConstantSDNode>(MaskedVal.getOperand(1))) { 2481 ShiftRightRequired += MaskedVal.getConstantOperandVal(1); 2482 MaskedVal = MaskedVal.getOperand(0); 2483 } 2484 2485 if (ShiftRightRequired > 0) 2486 MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal, 2487 DAG.getConstant(ShiftRightRequired, MVT::i64)); 2488 else if (ShiftRightRequired < 0) { 2489 // We could actually end up with a residual left shift, for example with 2490 // "struc.bitfield = val << 1". 2491 MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal, 2492 DAG.getConstant(-ShiftRightRequired, MVT::i64)); 2493 } 2494 2495 return LSB; 2496 } 2497 2498 /// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by 2499 /// a mask and an extension. Returns true if a BFI was found and provides 2500 /// information on its surroundings. 2501 static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask, 2502 bool &Extended) { 2503 Extended = false; 2504 if (N.getOpcode() == ISD::ZERO_EXTEND) { 2505 Extended = true; 2506 N = N.getOperand(0); 2507 } 2508 2509 if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) { 2510 Mask = N->getConstantOperandVal(1); 2511 N = N.getOperand(0); 2512 } else { 2513 // Mask is the whole width. 2514 Mask = -1ULL >> (64 - N.getValueType().getSizeInBits()); 2515 } 2516 2517 if (N.getOpcode() == AArch64ISD::BFI) { 2518 BFI = N; 2519 return true; 2520 } 2521 2522 return false; 2523 } 2524 2525 /// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which 2526 /// is roughly equivalent to (and (BFI ...), mask). This form is used because it 2527 /// can often be further combined with a larger mask. Ultimately, we want mask 2528 /// to be 2^32-1 or 2^64-1 so the AND can be skipped. 2529 static SDValue tryCombineToBFI(SDNode *N, 2530 TargetLowering::DAGCombinerInfo &DCI, 2531 const AArch64Subtarget *Subtarget) { 2532 SelectionDAG &DAG = DCI.DAG; 2533 DebugLoc DL = N->getDebugLoc(); 2534 EVT VT = N->getValueType(0); 2535 2536 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 2537 2538 // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or 2539 // abandon the effort. 2540 SDValue LHS = N->getOperand(0); 2541 if (LHS.getOpcode() != ISD::AND) 2542 return SDValue(); 2543 2544 uint64_t LHSMask; 2545 if (isa<ConstantSDNode>(LHS.getOperand(1))) 2546 LHSMask = LHS->getConstantOperandVal(1); 2547 else 2548 return SDValue(); 2549 2550 // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask 2551 // is or abandon the effort. 2552 SDValue RHS = N->getOperand(1); 2553 if (RHS.getOpcode() != ISD::AND) 2554 return SDValue(); 2555 2556 uint64_t RHSMask; 2557 if (isa<ConstantSDNode>(RHS.getOperand(1))) 2558 RHSMask = RHS->getConstantOperandVal(1); 2559 else 2560 return SDValue(); 2561 2562 // Can't do anything if the masks are incompatible. 2563 if (LHSMask & RHSMask) 2564 return SDValue(); 2565 2566 // Now we need one of the masks to be a contiguous field. Without loss of 2567 // generality that should be the RHS one. 
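  // For instance (illustrative), with LHS == (and X, 0xffff00ff) and
  // RHS == (and (shl Y, #8), 0x0000ff00), the RHS mask is the contiguous field
  // and the combine produces (BFI X, Y, #8, #8) with no residual AND, because
  // the two masks together cover every bit of the i32.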
2568 SDValue Bitfield = LHS.getOperand(0); 2569 if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) { 2570 // We know that LHS is a candidate new value, and RHS isn't already a better 2571 // one. 2572 std::swap(LHS, RHS); 2573 std::swap(LHSMask, RHSMask); 2574 } 2575 2576 // We've done our best to put the right operands in the right places, all we 2577 // can do now is check whether a BFI exists. 2578 Bitfield = RHS.getOperand(0); 2579 int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask); 2580 if (LSB == -1) 2581 return SDValue(); 2582 2583 uint32_t Width = CountPopulation_64(RHSMask); 2584 assert(Width && "Expected non-zero bitfield width"); 2585 2586 SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, 2587 LHS.getOperand(0), Bitfield, 2588 DAG.getConstant(LSB, MVT::i64), 2589 DAG.getConstant(Width, MVT::i64)); 2590 2591 // Mask is trivial 2592 if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits()))) 2593 return BFI; 2594 2595 return DAG.getNode(ISD::AND, DL, VT, BFI, 2596 DAG.getConstant(LHSMask | RHSMask, VT)); 2597 } 2598 2599 /// Search for the bitwise combining (with careful masks) of a MaskedBFI and its 2600 /// original input. This is surprisingly common because SROA splits things up 2601 /// into i8 chunks, so the originally detected MaskedBFI may actually only act 2602 /// on the low (say) byte of a word. This is then orred into the rest of the 2603 /// word afterwards. 2604 /// 2605 /// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)). 2606 /// 2607 /// If MASK1 and MASK2 are compatible, we can fold the whole thing into the 2608 /// MaskedBFI. We can also deal with a certain amount of extend/truncate being 2609 /// involved. 2610 static SDValue tryCombineToLargerBFI(SDNode *N, 2611 TargetLowering::DAGCombinerInfo &DCI, 2612 const AArch64Subtarget *Subtarget) { 2613 SelectionDAG &DAG = DCI.DAG; 2614 DebugLoc DL = N->getDebugLoc(); 2615 EVT VT = N->getValueType(0); 2616 2617 // First job is to hunt for a MaskedBFI on either the left or right. Swap 2618 // operands if it's actually on the right. 2619 SDValue BFI; 2620 SDValue PossExtraMask; 2621 uint64_t ExistingMask = 0; 2622 bool Extended = false; 2623 if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended)) 2624 PossExtraMask = N->getOperand(1); 2625 else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended)) 2626 PossExtraMask = N->getOperand(0); 2627 else 2628 return SDValue(); 2629 2630 // We can only combine a BFI with another compatible mask. 2631 if (PossExtraMask.getOpcode() != ISD::AND || 2632 !isa<ConstantSDNode>(PossExtraMask.getOperand(1))) 2633 return SDValue(); 2634 2635 uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1); 2636 2637 // Masks must be compatible. 2638 if (ExtraMask & ExistingMask) 2639 return SDValue(); 2640 2641 SDValue OldBFIVal = BFI.getOperand(0); 2642 SDValue NewBFIVal = BFI.getOperand(1); 2643 if (Extended) { 2644 // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be 2645 // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments 2646 // need to be made compatible. 2647 assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32 2648 && "Invalid types for BFI"); 2649 OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal); 2650 NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal); 2651 } 2652 2653 // We need the MaskedBFI to be combined with a mask of the *same* value. 
2654 if (PossExtraMask.getOperand(0) != OldBFIVal) 2655 return SDValue(); 2656 2657 BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, 2658 OldBFIVal, NewBFIVal, 2659 BFI.getOperand(2), BFI.getOperand(3)); 2660 2661 // If the masking is trivial, we don't need to create it. 2662 if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits()))) 2663 return BFI; 2664 2665 return DAG.getNode(ISD::AND, DL, VT, BFI, 2666 DAG.getConstant(ExtraMask | ExistingMask, VT)); 2667 } 2668 2669 /// An EXTR instruction is made up of two shifts, ORed together. This helper 2670 /// searches for and classifies those shifts. 2671 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 2672 bool &FromHi) { 2673 if (N.getOpcode() == ISD::SHL) 2674 FromHi = false; 2675 else if (N.getOpcode() == ISD::SRL) 2676 FromHi = true; 2677 else 2678 return false; 2679 2680 if (!isa<ConstantSDNode>(N.getOperand(1))) 2681 return false; 2682 2683 ShiftAmount = N->getConstantOperandVal(1); 2684 Src = N->getOperand(0); 2685 return true; 2686 } 2687 2688 /// EXTR instruction extracts a contiguous chunk of bits from two existing 2689 /// registers viewed as a high/low pair. This function looks for the pattern: 2690 /// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an 2691 /// EXTR. Can't quite be done in TableGen because the two immediates aren't 2692 /// independent. 2693 static SDValue tryCombineToEXTR(SDNode *N, 2694 TargetLowering::DAGCombinerInfo &DCI) { 2695 SelectionDAG &DAG = DCI.DAG; 2696 DebugLoc DL = N->getDebugLoc(); 2697 EVT VT = N->getValueType(0); 2698 2699 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 2700 2701 if (VT != MVT::i32 && VT != MVT::i64) 2702 return SDValue(); 2703 2704 SDValue LHS; 2705 uint32_t ShiftLHS = 0; 2706 bool LHSFromHi = 0; 2707 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 2708 return SDValue(); 2709 2710 SDValue RHS; 2711 uint32_t ShiftRHS = 0; 2712 bool RHSFromHi = 0; 2713 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 2714 return SDValue(); 2715 2716 // If they're both trying to come from the high part of the register, they're 2717 // not really an EXTR. 2718 if (LHSFromHi == RHSFromHi) 2719 return SDValue(); 2720 2721 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 2722 return SDValue(); 2723 2724 if (LHSFromHi) { 2725 std::swap(LHS, RHS); 2726 std::swap(ShiftLHS, ShiftRHS); 2727 } 2728 2729 return DAG.getNode(AArch64ISD::EXTR, DL, VT, 2730 LHS, RHS, 2731 DAG.getConstant(ShiftRHS, MVT::i64)); 2732 } 2733 2734 /// Target-specific dag combine xforms for ISD::OR 2735 static SDValue PerformORCombine(SDNode *N, 2736 TargetLowering::DAGCombinerInfo &DCI, 2737 const AArch64Subtarget *Subtarget) { 2738 2739 SelectionDAG &DAG = DCI.DAG; 2740 EVT VT = N->getValueType(0); 2741 2742 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 2743 return SDValue(); 2744 2745 // Attempt to recognise bitfield-insert operations. 2746 SDValue Res = tryCombineToBFI(N, DCI, Subtarget); 2747 if (Res.getNode()) 2748 return Res; 2749 2750 // Attempt to combine an existing MaskedBFI operation into one with a larger 2751 // mask. 
2752 Res = tryCombineToLargerBFI(N, DCI, Subtarget); 2753 if (Res.getNode()) 2754 return Res; 2755 2756 Res = tryCombineToEXTR(N, DCI); 2757 if (Res.getNode()) 2758 return Res; 2759 2760 return SDValue(); 2761 } 2762 2763 /// Target-specific dag combine xforms for ISD::SRA 2764 static SDValue PerformSRACombine(SDNode *N, 2765 TargetLowering::DAGCombinerInfo &DCI) { 2766 2767 SelectionDAG &DAG = DCI.DAG; 2768 DebugLoc DL = N->getDebugLoc(); 2769 EVT VT = N->getValueType(0); 2770 2771 // We're looking for an SRA/SHL pair which form an SBFX. 2772 2773 if (VT != MVT::i32 && VT != MVT::i64) 2774 return SDValue(); 2775 2776 if (!isa<ConstantSDNode>(N->getOperand(1))) 2777 return SDValue(); 2778 2779 uint64_t ExtraSignBits = N->getConstantOperandVal(1); 2780 SDValue Shift = N->getOperand(0); 2781 2782 if (Shift.getOpcode() != ISD::SHL) 2783 return SDValue(); 2784 2785 if (!isa<ConstantSDNode>(Shift->getOperand(1))) 2786 return SDValue(); 2787 2788 uint64_t BitsOnLeft = Shift->getConstantOperandVal(1); 2789 uint64_t Width = VT.getSizeInBits() - ExtraSignBits; 2790 uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft; 2791 2792 if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) 2793 return SDValue(); 2794 2795 return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0), 2796 DAG.getConstant(LSB, MVT::i64), 2797 DAG.getConstant(LSB + Width - 1, MVT::i64)); 2798 } 2799 2800 2801 SDValue 2802 AArch64TargetLowering::PerformDAGCombine(SDNode *N, 2803 DAGCombinerInfo &DCI) const { 2804 switch (N->getOpcode()) { 2805 default: break; 2806 case ISD::AND: return PerformANDCombine(N, DCI); 2807 case ISD::ATOMIC_FENCE: return PerformATOMIC_FENCECombine(N, DCI); 2808 case ISD::ATOMIC_STORE: return PerformATOMIC_STORECombine(N, DCI); 2809 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 2810 case ISD::SRA: return PerformSRACombine(N, DCI); 2811 } 2812 return SDValue(); 2813 } 2814 2815 AArch64TargetLowering::ConstraintType 2816 AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { 2817 if (Constraint.size() == 1) { 2818 switch (Constraint[0]) { 2819 default: break; 2820 case 'w': // An FP/SIMD vector register 2821 return C_RegisterClass; 2822 case 'I': // Constant that can be used with an ADD instruction 2823 case 'J': // Constant that can be used with a SUB instruction 2824 case 'K': // Constant that can be used with a 32-bit logical instruction 2825 case 'L': // Constant that can be used with a 64-bit logical instruction 2826 case 'M': // Constant that can be used as a 32-bit MOV immediate 2827 case 'N': // Constant that can be used as a 64-bit MOV immediate 2828 case 'Y': // Floating point constant zero 2829 case 'Z': // Integer constant zero 2830 return C_Other; 2831 case 'Q': // A memory reference with base register and no offset 2832 return C_Memory; 2833 case 'S': // A symbolic address 2834 return C_Other; 2835 } 2836 } 2837 2838 // FIXME: Ump, Utf, Usa, Ush 2839 // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes, 2840 // whatever they may be 2841 // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be 2842 // Usa: An absolute symbolic address 2843 // Ush: The high part (bits 32:12) of a pc-relative symbolic address 2844 assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa" 2845 && Constraint != "Ush" && "Unimplemented constraints"); 2846 2847 return TargetLowering::getConstraintType(Constraint); 2848 } 2849 2850 TargetLowering::ConstraintWeight 2851 
AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info, 2852 const char *Constraint) const { 2853 2854 llvm_unreachable("Constraint weight unimplemented"); 2855 } 2856 2857 void 2858 AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 2859 std::string &Constraint, 2860 std::vector<SDValue> &Ops, 2861 SelectionDAG &DAG) const { 2862 SDValue Result(0, 0); 2863 2864 // Only length 1 constraints are C_Other. 2865 if (Constraint.size() != 1) return; 2866 2867 // Only C_Other constraints get lowered like this. That means constants for us 2868 // so return early if there's no hope the constraint can be lowered. 2869 2870 switch(Constraint[0]) { 2871 default: break; 2872 case 'I': case 'J': case 'K': case 'L': 2873 case 'M': case 'N': case 'Z': { 2874 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 2875 if (!C) 2876 return; 2877 2878 uint64_t CVal = C->getZExtValue(); 2879 uint32_t Bits; 2880 2881 switch (Constraint[0]) { 2882 default: 2883 // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J' 2884 // is a peculiarly useless SUB constraint. 2885 llvm_unreachable("Unimplemented C_Other constraint"); 2886 case 'I': 2887 if (CVal <= 0xfff) 2888 break; 2889 return; 2890 case 'K': 2891 if (A64Imms::isLogicalImm(32, CVal, Bits)) 2892 break; 2893 return; 2894 case 'L': 2895 if (A64Imms::isLogicalImm(64, CVal, Bits)) 2896 break; 2897 return; 2898 case 'Z': 2899 if (CVal == 0) 2900 break; 2901 return; 2902 } 2903 2904 Result = DAG.getTargetConstant(CVal, Op.getValueType()); 2905 break; 2906 } 2907 case 'S': { 2908 // An absolute symbolic address or label reference. 2909 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { 2910 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), 2911 GA->getValueType(0)); 2912 } else if (const BlockAddressSDNode *BA 2913 = dyn_cast<BlockAddressSDNode>(Op)) { 2914 Result = DAG.getTargetBlockAddress(BA->getBlockAddress(), 2915 BA->getValueType(0)); 2916 } else if (const ExternalSymbolSDNode *ES 2917 = dyn_cast<ExternalSymbolSDNode>(Op)) { 2918 Result = DAG.getTargetExternalSymbol(ES->getSymbol(), 2919 ES->getValueType(0)); 2920 } else 2921 return; 2922 break; 2923 } 2924 case 'Y': 2925 if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) { 2926 if (CFP->isExactlyValue(0.0)) { 2927 Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0)); 2928 break; 2929 } 2930 } 2931 return; 2932 } 2933 2934 if (Result.getNode()) { 2935 Ops.push_back(Result); 2936 return; 2937 } 2938 2939 // It's an unknown constraint for us. Let generic code have a go. 
2940 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 2941 } 2942 2943 std::pair<unsigned, const TargetRegisterClass*> 2944 AArch64TargetLowering::getRegForInlineAsmConstraint( 2945 const std::string &Constraint, 2946 EVT VT) const { 2947 if (Constraint.size() == 1) { 2948 switch (Constraint[0]) { 2949 case 'r': 2950 if (VT.getSizeInBits() <= 32) 2951 return std::make_pair(0U, &AArch64::GPR32RegClass); 2952 else if (VT == MVT::i64) 2953 return std::make_pair(0U, &AArch64::GPR64RegClass); 2954 break; 2955 case 'w': 2956 if (VT == MVT::f16) 2957 return std::make_pair(0U, &AArch64::FPR16RegClass); 2958 else if (VT == MVT::f32) 2959 return std::make_pair(0U, &AArch64::FPR32RegClass); 2960 else if (VT == MVT::f64) 2961 return std::make_pair(0U, &AArch64::FPR64RegClass); 2962 else if (VT.getSizeInBits() == 64) 2963 return std::make_pair(0U, &AArch64::VPR64RegClass); 2964 else if (VT == MVT::f128) 2965 return std::make_pair(0U, &AArch64::FPR128RegClass); 2966 else if (VT.getSizeInBits() == 128) 2967 return std::make_pair(0U, &AArch64::VPR128RegClass); 2968 break; 2969 } 2970 } 2971 2972 // Use the default implementation in TargetLowering to convert the register 2973 // constraint into a member of a register class. 2974 return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 2975 } 2976