//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pair instructions generated from unscaled ops");
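
// ScanLimit bounds how many real (non-debug-value) instructions the pass will
// look across when searching for a pairable memory access or a mergeable base
// register update, keeping compile time in check. e.g., passing
// -aarch64-load-store-scan-limit=40 to llc doubles the default window.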
static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// Placeholder while testing unscaled load/store combining.
static cl::opt<bool> EnableAArch64UnscaledMemOp(
    "aarch64-unscaled-mem-op", cl::Hidden,
    cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));

namespace {
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  // \p SExtIdx[out] gives the index of the result of the load pair that
  // must be extended. The value of SExtIdx assumes that the paired load
  // produces the value in this order: (I, returned iterator), i.e.,
  // -1 means no value has to be extended, 0 means I, and 1 means the
  // returned iterator.
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               bool &MergeForward,
                                               int &SExtIdx, unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  // \p SExtIdx is the index of the result that must be extended for a paired
  // load. -1 means none, 0 means I, and 1 means Paired.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired, bool MergeForward,
                   int SExtIdx);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int Value);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Merge a pre-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator Update);

  // Merge a post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
                         MachineBasicBlock::iterator Update);

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return "AArch64 load / store optimization pass";
  }

private:
  int getMemSize(MachineInstr *MemMI);
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

static bool isUnscaledLdst(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
    return true;
  }
}

// Size in bytes of the data moved by a (scaled or unscaled) load or store.
int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
  switch (MemMI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown size!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return 4;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return 8;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return 16;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return 4;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return 8;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return 16;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return 4;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return 8;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return 4;
  }
}
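
// A sign-extending load (LDRSWui / LDURSWi) can still pair with a normal
// 32-bit load: the pair is formed with the non-extending opcode and the
// extension is re-materialized after the LDP. e.g.,
//   ldrsw x0, [x2]
//   ldr   w1, [x2, #4]
// can become
//   ldp  w0, w1, [x2]
//   sxtw x0, w0
// The mapping below folds an opcode to its non-sign-extending equivalent so
// candidate instructions can be compared on that basis.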
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  }
}
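
// e.g., a pair of adjacent unscaled stores
//   stur w0, [x2, #-8]
//   stur w1, [x2, #-4]
// is merged below into
//   stp w0, w1, [x2, #-8]
// where the STP machine operand is the byte offset divided by the access
// size, since paired instructions scale their 7-bit signed immediate.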
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      bool MergeForward, int SExtIdx) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdst(Opc);
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;

  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that the flags stay compatible with the input code.
  MachineOperand &BaseRegOp =
      MergeForward ? Paired->getOperand(1) : I->getOperand(1);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (I->getOperand(2).getImm() ==
      Paired->getOperand(2).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }
  // Scale the byte offset of an unscaled input down to the element offset
  // expected by the paired instruction.
  int OffsetImm = RtMI->getOperand(2).getImm();
  if (IsUnscaled && EnableAArch64UnscaledMemOp)
    OffsetImm /= OffsetStride;

  // Construct the new instruction.
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(RtMI->getOperand(0))
                                .addOperand(Rt2MI->getOperand(0))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  // instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    MachineOperand &MO = MI->getOperand(i);
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      // A def of a register also modifies all of its aliases (e.g., a def
      // of w3 clobbers x3), so walk the alias iterator, register included.
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Paired instructions have a 7-bit signed (scaled) offset field, i.e.,
  // -64..63 elements.
  if (!IsUnscaled && (Offset > 63 || Offset < -64))
    return false;
  if (IsUnscaled) {
    // Convert the byte-offset used by unscaled into an "element" offset used
    // by the scaled pair load/store instructions.
    int ElemOffset = Offset / OffsetStride;
    if (ElemOffset > 63 || ElemOffset < -64)
      return false;
  }
  return true;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
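// e.g., alignTo(5, 4) == 8 and alignTo(-3, 4) == 0: the result is the
// smallest multiple of PowOf2 that is >= Num.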
// FIXME: Move this function to include/llvm/Support/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      bool &MergeForward, int &SExtIdx,
                                      unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  int Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdst(Opc);
  unsigned Reg = FirstMI->getOperand(0).getReg();
  unsigned BaseReg = FirstMI->getOperand(1).getReg();
  int Offset = FirstMI->getOperand(2).getImm();

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  // Early exit if the offset is not possible to match (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1).
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    bool CanMergeOpc = Opc == MI->getOpcode();
    SExtIdx = -1;
    if (!CanMergeOpc) {
      bool IsValidLdStrOpc;
      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
      if (!IsValidLdStrOpc)
        continue;
      // The instruction with Opc will be the first instruction in the pair.
      SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0;
      CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
    }

    if (CanMergeOpc && MI->getOperand(2).isImm()) {
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = MI->getOperand(1).getReg();
      int MIOffset = MI->getOperand(2).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop
        // looking, as something is going on that we don't have enough
        // information to safely transform. Similarly, stop if we see a hint
        // to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
        if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the alignment requirements of the paired (scaled) instruction
        // can't express the offset of the unscaled input, bail and keep
        // looking.
        if (IsUnscaled && EnableAArch64UnscaledMemOp &&
            (alignTo(MinOffset, OffsetStride) != MinOffset)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == MI->getOperand(0).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions, we can combine the second into the first.
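[[GR_INSERT_PLACEHOLDER_RESOLVED_BELOW]]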
        if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
            !UsedRegs[MI->getOperand(0).getReg()]) {
          MergeForward = false;
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions, we can combine the first into the
        // second.
        if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
            !UsedRegs[FirstMI->getOperand(0).getReg()]) {
          MergeForward = true;
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store, but does (or can)
    // modify memory, stop searching, as we don't have alias analysis or
    // anything like that to tell us whether the access is tromping on the
    // locations we care about. The big one we want to catch is calls.
    //
    // FIXME: Theoretically, we can do better than that for SP and FP based
    // references since we can effectively know where those are touching. It's
    // unclear if it's worth the extra code, though. Most paired instructions
    // will be sequential, perhaps with a few intervening non-memory related
    // instructions.
    if (MI->mayStore() || MI->isCall())
      return E;
    // Likewise, if we're matching a store instruction, we don't want to
    // move across a load, as it may be reading the same location.
    if (FirstMI->mayStore() && MI->mayLoad())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0))
          .addOperand(I->getOperand(0))
          .addOperand(I->getOperand(1))
          .addImm(Value);
  (void)MIB;

  DEBUG(dbgs() << "Creating pre-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0))
          .addOperand(I->getOperand(0))
          .addOperand(I->getOperand(1))
          .addImm(Value);
  (void)MIB;

  DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
                                 int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
    // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;
    // If the instruction has the base register as source and dest and the
    // immediate will fit in a signed 9-bit integer, then we have a match.
    if (MI->getOperand(0).getReg() == BaseReg &&
        MI->getOperand(1).getReg() == BaseReg &&
        MI->getOperand(2).getImm() <= 255 &&
        MI->getOperand(2).getImm() >= -256) {
      // If we have a non-zero Offset, we check that it matches the amount
      // we're adding to the register.
      if (!Offset || Offset == MI->getOperand(2).getImm())
        return true;
    }
    break;
  }
  return false;
}
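
// Post-indexed forms write the updated address back after the access, so,
// e.g.,
//   ldr x1, [x0]
//   add x0, x0, #8
// merges into
//   ldr x1, [x0], #8
// The forward scan below looks for exactly this shape: a memory access whose
// immediate already equals Value, followed by a matching add/sub of the base.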
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int Value) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;
  const MachineFunction &MF = *MemMI->getParent()->getParent();

  unsigned DestReg = MemMI->getOperand(0).getReg();
  unsigned BaseReg = MemMI->getOperand(1).getReg();
  int Offset = MemMI->getOperand(2).getImm() *
               TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();

  // If the base register overlaps the destination register, we can't
  // merge the update.
  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    return E;

  // Scan forward looking for post-index opportunities.
  // Updating instructions can't be formed if the memory insn already
  // has an offset other than the value we're looking for.
  if (Offset != Value)
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MI, BaseReg, Value))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}
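
// The backward scan matches only updates whose immediate equals the size of
// the register being transferred. e.g., for an 8-byte load,
//   add x0, x0, #8
//   ldr x1, [x0]
// folds into the pre-indexed
//   ldr x1, [x0, #8]!
// whereas "add x0, x0, #16" would not be matched.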
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;
  const MachineFunction &MF = *MemMI->getParent()->getParent();

  unsigned DestReg = MemMI->getOperand(0).getReg();
  unsigned BaseReg = MemMI->getOperand(1).getReg();
  int Offset = MemMI->getOperand(2).getImm();
  unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps the destination register, we can't
  // merge the update.
  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B && Count < Limit; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;
  // Two transformations to do here:
  // 1) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 2) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Do the unscaled versions as well.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      // If this is a volatile load/store, don't mess with it.
      if (MI->hasOrderedMemoryRef()) {
        ++MBBI;
        break;
      }
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // Check if this load/store has a hint to avoid pair formation.
      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
      if (TII->isLdStPairSuppressed(MI)) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a pairable instruction.
      bool MergeForward = false;
      int SExtIdx = -1;
      MachineBasicBlock::iterator Paired =
          findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit);
      if (Paired != E) {
        // Merge the loads into a pair. Keeping the iterator straight is a
        // pain, so we let the merge routine tell us what the next instruction
        // is after it's done mucking about.
        MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx);

        Modified = true;
        ++NumPairCreated;
        if (isUnscaledLdst(MI->getOpcode()))
          ++NumUnscaledPairCreated;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }
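
  // Second pass: fold base register updates into the single loads and stores
  // that remain after pair formation; the pairs created above are not
  // revisited.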
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    int Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    // Do the unscaled versions as well.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // The pre/post-indexed opcode tables above don't cover the unscaled
      // variants, so there's nothing to fold for them; move to the next
      // instruction. Check this before the forward scan so an unscaled
      // access with a zero offset can't reach getPostIndexedOpcode(), which
      // would assert on it.
      if (isUnscaledLdst(Opc)) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a mergeable instruction.
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePostIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPostFolded;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      // merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Look forward to try to find a base register update that follows the
      // load/store but can still be folded as a pre-index. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      // merged into:
      // ldr x1, [x0, #64]!

      // The immediate in the load/store is scaled by the size of the register
      // being loaded. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int Value = MI->getOperand(2).getImm() *
                  TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
                      ->getSize();
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
  TRI = Fn.getSubtarget().getRegisterInfo();

  bool Modified = false;
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}