//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");

static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

typedef struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx;

  LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

} LdStPairFlags;

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit);

  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int UnscaledOffset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge foldable ldr/str instructions.
  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

  // Check if converting two narrow loads into a single wider load with
  // bitfield extracts could be enabled.
  bool enableNarrowLdMerge(MachineFunction &Fn);

  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return AARCH64_LOAD_STORE_OPT_NAME;
  }
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURBBi:
  case AArch64::STURHHi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSHWi:
    return true;
  }
}

static bool isUnscaledLdSt(MachineInstr *MI) {
  return isUnscaledLdSt(MI->getOpcode());
}
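// Note on terminology: the LDUR*/STUR* ("unscaled") opcodes take a signed
// 9-bit byte offset, while the corresponding LDR*ui/STR*ui forms take an
// unsigned offset scaled by the access size. The pass normalizes between the
// two representations when checking whether two accesses are adjacent.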
static unsigned getBitExtrOpcode(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode.");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return AArch64::UBFMWri;
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
    return AArch64::SBFMWri;
  }
}

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

static bool isNarrowStore(MachineInstr *MI) {
  return isNarrowStore(MI->getOpcode());
}

static bool isNarrowLoad(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
    return true;
  }
}

static bool isNarrowLoad(MachineInstr *MI) {
  return isNarrowLoad(MI->getOpcode());
}

// Scaling factor for unscaled load or store.
static int getMemScale(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSBWui:
    return AArch64::LDRBBui;
  case AArch64::LDRSHWui:
    return AArch64::LDRHHui;
  case AArch64::LDURSBWi:
    return AArch64::LDURBBi;
  case AArch64::LDURSHWi:
    return AArch64::LDURHHi;
  }
}
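// Map a load/store opcode to the opcode the pass rewrites it to once a
// mergeable neighbour has been found. For most opcodes this is the
// corresponding LDP/STP form; for the narrow (byte/halfword) loads and zero
// stores it is simply the next wider single access, e.g. two adjacent
// LDRHHui become one LDRWui whose halves are split back out with bitfield
// extracts in mergePairedInsns.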
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
    return AArch64::LDRWui;
  case AArch64::LDURHHi:
  case AArch64::LDURSHWi:
    return AArch64::LDURWi;
  case AArch64::LDRBBui:
  case AArch64::LDRSBWui:
    return AArch64::LDRHHui;
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
    return AArch64::LDURHHi;
  }
}
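// The writeback forms update the base register as part of the access, e.g.
//   ldr x0, [x1, #8]!   ; pre-index:  x1 is incremented before the load
//   ldr x0, [x1], #8    ; post-index: x1 is incremented after the load
// The two helpers below map a non-writeback opcode to its pre- or
// post-indexed counterpart so that a nearby ADD/SUB of the base register can
// be folded into the memory instruction.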
static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}

static bool isPairedLdSt(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}
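// The accessors below hide the operand layout of the memory instructions:
// a paired instruction is (Rt, Rt2, Rn, imm) while a single load/store is
// (Rt, Rn, imm), so the register, base and offset operands live at different
// indices depending on the form.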
static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI->getOperand(Idx);
}

// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
                      (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that we get flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }

  int OffsetImm = getLdStOffsetOp(RtMI).getImm();

  if (isNarrowLoad(Opc)) {
    // Change the scaled offset from small to large type.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // When merging small (< 32 bit) loads for big-endian targets, the order of
    // the component parts gets swapped.
    if (!Subtarget->isLittleEndian())
      std::swap(RtMI, Rt2MI);
    // Construct the new load instruction.
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                       TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(
        dbgs()
        << "Creating the new load and extract. Replacing instructions:\n    ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << "  with instructions:\n    ");
    DEBUG((NewMemMI)->print(dbgs()));

    int Width = getMemScale(I) == 1 ? 8 : 16;
    int LSBLow = 0;
    int LSBHigh = Width;
    int ImmsLow = LSBLow + Width - 1;
    int ImmsHigh = LSBHigh + Width - 1;
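    // E.g. for two byte loads Width is 8: the low half of the wider result is
    // bits [7:0] (immr = 0, imms = 7) and the high half is bits [15:8]
    // (immr = 8, imms = 15).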
    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
      // Create the bitfield extract for high bits.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }
    } else {
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }

      // Create the bitfield extract for high bits.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
    }
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  if (isNarrowStore(Opc)) {
    // Change the scaled offset from small to large type.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(I))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
    // Copy MachineMemOperands from the original stores.
    concatenateMemOperands(MIB, I, Paired);
  } else {
    // Handle Unscaled.
    if (IsUnscaled)
      OffsetImm /= OffsetStride;
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(RtMI))
              .addOperand(getLdStRegOp(Rt2MI))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
  }

  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  // instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (const MachineOperand &MO : MI->operands()) {
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled)
    Offset /= OffsetStride;

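  // Paired instructions take a signed 7-bit element offset, i.e. -64..63
  // units of the access size (for example -512..504 bytes for 8-byte
  // LDP/STP).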
  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
                     const AArch64InstrInfo *TII) {
  // One of the instructions must modify memory.
  if (!MIa->mayStore() && !MIb->mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
    return false;

  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
}

static bool mayAlias(MachineInstr *MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     const AArch64InstrInfo *TII) {
  for (auto &MIb : MemInsns)
    if (mayAlias(MIa, MIb, TII))
      return true;

  return false;
}

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  unsigned Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdSt(FirstMI);
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  bool IsNarrowStore = isNarrowStore(Opc);

  // For narrow stores, find only the case where the stored value is WZR.
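  // (These are the only narrow stores that get widened: two adjacent zero
  // stores, e.g. two "strh wzr" at consecutive offsets, become one "str wzr".)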
  if (IsNarrowStore && Reg != AArch64::WZR)
    return E;

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    bool CanMergeOpc = Opc == MI->getOpcode();
    Flags.setSExtIdx(-1);
    if (!CanMergeOpc) {
      bool IsValidLdStrOpc;
      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
      assert(IsValidLdStrOpc &&
             "Given Opc should be a Load or Store with an immediate");
      // Opc will be the first instruction in the pair.
      Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
      CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
    }

    if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
      assert(MI->mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop
        // looking as something is going on that we don't have enough
        // information to safely transform. Similarly, stop if we see a hint
        // to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdSt(MI);
        bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
        if (!IsNarrowLoad &&
            !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        if (IsNarrowLoad || IsNarrowStore) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow
          // input, bail and keep looking.
          if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        } else {
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        // For narrow stores, allow only when the stored value is the same
        // (i.e., WZR).
        if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
            (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
            !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
            !mayAlias(MI, MemInsns, TII)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
            !mayAlias(FirstMI, MemInsns, TII)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI->isCall())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;

    // Update list of instructions that read/write memory.
    if (MI->mayLoadOrStore())
      MemInsns.push_back(MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value);
  } else {
    // Paired instruction.
    int Scale = getMemScale(I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I, 0))
              .addOperand(getLdStRegOp(I, 1))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value / Scale);
  }
  (void)MIB;

  if (IsPreIdx)
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
  else
    DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
                                               MachineInstr *MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
  // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI->getOperand(0).getReg() != BaseReg ||
        MI->getOperand(1).getReg() != BaseReg)
      break;

    bool IsPairedInsn = isPairedLdSt(MemMI);
    int UpdateOffset = MI->getOperand(2).getImm();
    // For non-paired load/store instructions, the immediate must fit in a
    // signed 9-bit integer.
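    // (That is the range -256..255, matching the immediate field of the
    // pre-/post-indexed addressing modes.)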
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
      break;

    // For paired load/store instructions, the immediate must be a multiple of
    // the scaling factor. The scaled offset must also fit into a signed 7-bit
    // integer.
    if (IsPairedInsn) {
      int Scale = getMemScale(MemMI);
      if (UpdateOffset % Scale != 0)
        break;

      int ScaledOffset = UpdateOffset / Scale;
      if (ScaledOffset > 64 || ScaledOffset < -64)
        break;
    }

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == MI->getOperand(2).getImm())
      return true;
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B && Count < Limit; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = MBBI;
  MachineBasicBlock::iterator E = MI->getParent()->end();
  // If this is a volatile load/store, don't mess with it.
  if (MI->hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm (as opposed to an address reloc).
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Check if this load/store has a hint to avoid pair formation.
  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
  if (TII->isLdStPairSuppressed(MI))
    return false;

  // Look ahead up to ScanLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
  if (Paired != E) {
    if (isNarrowLoad(MI)) {
      ++NumNarrowLoadsPromoted;
    } else if (isNarrowStore(MI)) {
      ++NumZeroStoresPromoted;
    } else {
      ++NumPairCreated;
      if (isUnscaledLdSt(MI))
        ++NumUnscaledPairCreated;
    }

    // Merge the loads into a pair. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool enableNarrowLdOpt) {
  bool Modified = false;
  // Three transformations to do here:
  // 1) Find narrow loads that can be converted into a single wider load
  //    with bitfield extract instructions.
  //      e.g.,
  //        ldrh w0, [x2]
  //        ldrh w1, [x2, #2]
  //        ; becomes
  //        ldr w0, [x2]
  //        ubfx w1, w0, #16, #16
  //        and w0, w0, #ffff
  // 2) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 3) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       enableNarrowLdOpt && MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRBBui:
    case AArch64::LDRHHui:
    case AArch64::LDRSBWui:
    case AArch64::LDRSHWui:
    case AArch64::STRBBui:
    case AArch64::STRHHui:
    // Unscaled instructions.
    case AArch64::LDURBBi:
    case AArch64::LDURHHi:
    case AArch64::LDURSBWi:
    case AArch64::LDURSHWi:
    case AArch64::STURBBi:
    case AArch64::STURHHi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    unsigned Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      //   ldr x0, [x20]
      //   add x20, x20, #32
      // merged into:
      //   ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Don't know how to handle pre/post-index versions, so move to the next
      // instruction.
      if (isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      //   add x0, x0, #8
      //   ldr x1, [x0]
      // merged into:
      //   ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

      // Look forward to try to find a pre-index instruction. For example,
      //   ldr x1, [x0, #64]
      //   add x0, x0, #64
      // merged into:
      //   ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
  bool ProfitableArch = Subtarget->isCortexA57();
  // FIXME: The benefit from converting narrow loads into a wider load could be
  // microarchitectural as it assumes that a single load with two bitfield
  // extracts is cheaper than two narrow loads. Currently, this conversion is
  // enabled only in cortex-a57 on which performance benefits were verified.
  return ProfitableArch && !Subtarget->requiresStrictAlign();
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();

  bool Modified = false;
  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?
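// The pass is registered via INITIALIZE_PASS above; the AArch64 target
// constructs it through the factory function below when setting up its
// codegen pipeline.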
/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}