//===- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that expands pseudo instructions into target
// instructions to allow proper scheduling and other late optimizations. This
// pass should be run after register allocation but before the post-regalloc
// scheduling pass.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>
#include <utility>

using namespace llvm;

#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass"

namespace {

class AArch64ExpandPseudo : public MachineFunctionPass {
public:
  const AArch64InstrInfo *TII;

  static char ID;

  AArch64ExpandPseudo() : MachineFunctionPass(ID) {
    initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override { return AARCH64_EXPAND_PSEUDO_NAME; }

private:
  bool expandMBB(MachineBasicBlock &MBB);
  bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                MachineBasicBlock::iterator &NextMBBI);
  bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                    unsigned BitSize);
  bool expandMOVImmSimple(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          unsigned BitSize,
                          unsigned OneChunks,
                          unsigned ZeroChunks);

  bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                      unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
                      unsigned ExtendImm, unsigned ZeroReg,
                      MachineBasicBlock::iterator &NextMBBI);
  bool expandCMP_SWAP_128(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          MachineBasicBlock::iterator &NextMBBI);
};

} // end anonymous namespace

char AArch64ExpandPseudo::ID = 0;

INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo",
                AARCH64_EXPAND_PSEUDO_NAME, false, false)

/// Transfer implicit operands on the pseudo instruction to the
/// instructions created from the expansion.
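/// Callers pass the first instruction of the expansion as \p UseMI (which
/// receives the implicit uses) and the last as \p DefMI (which receives the
/// implicit defs); single-instruction expansions pass the same builder for
/// both.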
static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
                           MachineInstrBuilder &DefMI) {
  const MCInstrDesc &Desc = OldMI.getDesc();
  for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e;
       ++i) {
    const MachineOperand &MO = OldMI.getOperand(i);
    assert(MO.isReg() && MO.getReg());
    if (MO.isUse())
      UseMI.add(MO);
    else
      DefMI.add(MO);
  }
}

/// Helper function which extracts the specified 16-bit chunk from a
/// 64-bit value.
static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
  assert(ChunkIdx < 4 && "Out of range chunk index specified!");

  return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
}

/// Check whether the given 16-bit chunk replicated to full 64-bit width
/// can be materialized with an ORR instruction.
static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
  Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;

  return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
}

/// Check for identical 16-bit chunks within the constant and if so
/// materialize them with a single ORR instruction. The remaining one or two
/// 16-bit chunks will be materialized with MOVK instructions.
///
/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
/// an ORR instruction.
static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
                                 MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator &MBBI,
                                 const AArch64InstrInfo *TII) {
  using CountMap = DenseMap<uint64_t, unsigned>;

  CountMap Counts;

  // Scan the constant and count how often every chunk occurs.
  for (unsigned Idx = 0; Idx < 4; ++Idx)
    ++Counts[getChunk(UImm, Idx)];

  // Traverse the chunks to find one which occurs more than once.
  for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
       Chunk != End; ++Chunk) {
    const uint64_t ChunkVal = Chunk->first;
    const unsigned Count = Chunk->second;

    uint64_t Encoding = 0;

    // We are looking for chunks which have two or three instances and can be
    // materialized with an ORR instruction.
    if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
      continue;

    const bool CountThree = Count == 3;
    // Create the ORR-immediate instruction.
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
            .add(MI.getOperand(0))
            .addReg(AArch64::XZR)
            .addImm(Encoding);

    const unsigned DstReg = MI.getOperand(0).getReg();
    const bool DstIsDead = MI.getOperand(0).isDead();

    unsigned ShiftAmt = 0;
    uint64_t Imm16 = 0;
    // Find the first chunk not materialized with the ORR instruction.
    for (; ShiftAmt < 64; ShiftAmt += 16) {
      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;

      if (Imm16 != ChunkVal)
        break;
    }

    // Create the first MOVK instruction.
    MachineInstrBuilder MIB1 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
            .addReg(DstReg,
                    RegState::Define | getDeadRegState(DstIsDead && CountThree))
            .addReg(DstReg)
            .addImm(Imm16)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));

    // In case we have three instances the whole constant is now materialized
    // and we can exit.
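    // (Illustrative example: 0x1234555555555555 has three 0x5555 chunks; the
    // ORR above materializes 0x5555555555555555 and the MOVK just created
    // inserts 0x1234 at LSL #48, completing the constant.)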
    if (CountThree) {
      transferImpOps(MI, MIB, MIB1);
      MI.eraseFromParent();
      return true;
    }

    // Find the remaining chunk which needs to be materialized.
    for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;

      if (Imm16 != ChunkVal)
        break;
    }

    // Create the second MOVK instruction.
    MachineInstrBuilder MIB2 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
            .addReg(DstReg)
            .addImm(Imm16)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));

    transferImpOps(MI, MIB, MIB2);
    MI.eraseFromParent();
    return true;
  }

  return false;
}

/// Check whether this chunk matches the pattern '1...0...'. This pattern
/// starts a contiguous sequence of ones if we look at the bits from the LSB
/// towards the MSB.
static bool isStartChunk(uint64_t Chunk) {
  if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
    return false;

  return isMask_64(~Chunk);
}

/// Check whether this chunk matches the pattern '0...1...'. This pattern
/// ends a contiguous sequence of ones if we look at the bits from the LSB
/// towards the MSB.
static bool isEndChunk(uint64_t Chunk) {
  if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
    return false;

  return isMask_64(Chunk);
}

/// Clear or set all bits in the chunk at the given index.
static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
  const uint64_t Mask = 0xFFFF;

  if (Clear)
    // Clear chunk in the immediate.
    Imm &= ~(Mask << (Idx * 16));
  else
    // Set all bits in the immediate for the particular chunk.
    Imm |= Mask << (Idx * 16);

  return Imm;
}

/// Check whether the constant contains a sequence of contiguous ones,
/// which might be interrupted by one or two chunks. If so, materialize the
/// sequence of contiguous ones with an ORR instruction.
/// Materialize the chunks which are either interrupting the sequence or outside
/// of the sequence with a MOVK instruction.
///
/// Assuming S is a chunk which starts the sequence (1...0...) and E is a chunk
/// which ends the sequence (0...1...), we are looking for constants which
/// contain at least one S and one E chunk.
/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
///
/// We are also looking for constants like |S|A|B|E| where the contiguous
/// sequence of ones wraps around the MSB into the LSB.
static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
                              MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI,
                              const AArch64InstrInfo *TII) {
  const int NotSet = -1;
  const uint64_t Mask = 0xFFFF;

  int StartIdx = NotSet;
  int EndIdx = NotSet;
  // Try to find the chunks which start/end a contiguous sequence of ones.
  for (int Idx = 0; Idx < 4; ++Idx) {
    int64_t Chunk = getChunk(UImm, Idx);
    // Sign extend the 16-bit chunk to 64-bit.
    Chunk = (Chunk << 48) >> 48;

    if (isStartChunk(Chunk))
      StartIdx = Idx;
    else if (isEndChunk(Chunk))
      EndIdx = Idx;
  }

  // Early exit in case we can't find a start/end chunk.
  if (StartIdx == NotSet || EndIdx == NotSet)
    return false;

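  // (Illustrative example: in 0xABCD00FF1234FF00 the start chunk 0xFF00 is at
  // index 0 and the end chunk 0x00FF at index 2; patching index 1 to all ones
  // and index 3 to all zeros yields 0x000000FFFFFFFF00, a valid ORR immediate,
  // and the two patched chunks are then restored with MOVKs below.)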
  // Outside of the contiguous sequence of ones everything needs to be zero.
  uint64_t Outside = 0;
  // Chunks between the start and end chunk need to have all their bits set.
  uint64_t Inside = Mask;

  // If our contiguous sequence of ones wraps around from the MSB into the LSB,
  // just swap indices and pretend we are materializing a contiguous sequence
  // of zeros surrounded by a contiguous sequence of ones.
  if (StartIdx > EndIdx) {
    std::swap(StartIdx, EndIdx);
    std::swap(Outside, Inside);
  }

  uint64_t OrrImm = UImm;
  int FirstMovkIdx = NotSet;
  int SecondMovkIdx = NotSet;

  // Find out which chunks we need to patch up to obtain a contiguous sequence
  // of ones.
  for (int Idx = 0; Idx < 4; ++Idx) {
    const uint64_t Chunk = getChunk(UImm, Idx);

    // Check whether we are looking at a chunk which is not part of the
    // contiguous sequence of ones.
    if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
      OrrImm = updateImm(OrrImm, Idx, Outside == 0);

      // Remember the index we need to patch.
      if (FirstMovkIdx == NotSet)
        FirstMovkIdx = Idx;
      else
        SecondMovkIdx = Idx;

      // Check whether we are looking at a chunk which is part of the
      // contiguous sequence of ones.
    } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
      OrrImm = updateImm(OrrImm, Idx, Inside != Mask);

      // Remember the index we need to patch.
      if (FirstMovkIdx == NotSet)
        FirstMovkIdx = Idx;
      else
        SecondMovkIdx = Idx;
    }
  }
  assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");

  // Create the ORR-immediate instruction.
  uint64_t Encoding = 0;
  AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
  MachineInstrBuilder MIB =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
          .add(MI.getOperand(0))
          .addReg(AArch64::XZR)
          .addImm(Encoding);

  const unsigned DstReg = MI.getOperand(0).getReg();
  const bool DstIsDead = MI.getOperand(0).isDead();

  const bool SingleMovk = SecondMovkIdx == NotSet;
  // Create the first MOVK instruction.
  MachineInstrBuilder MIB1 =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
          .addReg(DstReg,
                  RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
          .addReg(DstReg)
          .addImm(getChunk(UImm, FirstMovkIdx))
          .addImm(
              AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));

  // Early exit in case we only need to emit a single MOVK instruction.
  if (SingleMovk) {
    transferImpOps(MI, MIB, MIB1);
    MI.eraseFromParent();
    return true;
  }

  // Create the second MOVK instruction.
  MachineInstrBuilder MIB2 =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
          .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
          .addReg(DstReg)
          .addImm(getChunk(UImm, SecondMovkIdx))
          .addImm(
              AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));

  transferImpOps(MI, MIB, MIB2);
  MI.eraseFromParent();
  return true;
}

/// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
/// real move-immediate instructions to synthesize the immediate.
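///
/// As an illustrative progression: 0x0000FFFF00000000 is a single ORR
/// immediate, 0x00001234FFFFFFFF can be built with a MOVN followed by one
/// MOVK, and in the worst case a 64-bit value needs a MOVZ/MOVN plus three
/// MOVKs.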
bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MBBI,
                                       unsigned BitSize) {
  MachineInstr &MI = *MBBI;
  unsigned DstReg = MI.getOperand(0).getReg();
  uint64_t Imm = MI.getOperand(1).getImm();
  const unsigned Mask = 0xFFFF;

  if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
    // Useless def, and we don't want to risk creating an invalid ORR (which
    // would really write to sp).
    MI.eraseFromParent();
    return true;
  }

  // Scan the immediate and count the number of 16-bit chunks which are either
  // all ones or all zeros.
  unsigned OneChunks = 0;
  unsigned ZeroChunks = 0;
  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
    const unsigned Chunk = (Imm >> Shift) & Mask;
    if (Chunk == Mask)
      OneChunks++;
    else if (Chunk == 0)
      ZeroChunks++;
  }

  // FIXME: Prefer MOVZ/MOVN over ORR because of the rules for the "mov"
  // alias.

  // Try a single ORR.
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
    unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
            .add(MI.getOperand(0))
            .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
            .addImm(Encoding);
    transferImpOps(MI, MIB, MIB);
    MI.eraseFromParent();
    return true;
  }

  // Two instruction sequences.
  //
  // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the
  // fastest sequence with fast literal generation.
  if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2)
    return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);

  assert(BitSize == 64 && "All 32-bit immediates can be expanded with a "
                          "MOVZ/MOVK pair");

  // Try other two-instruction sequences.

  // 64-bit ORR followed by MOVK.
  // We try to construct the ORR immediate in three different ways: either we
  // zero out the chunk which will be replaced, we fill the chunk which will
  // be replaced with ones, or we take the bit pattern from the other half of
  // the 64-bit immediate. This is comprehensive because of the way ORR
  // immediates are constructed.
  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
    uint64_t ShiftedMask = (0xFFFFULL << Shift);
    uint64_t ZeroChunk = UImm & ~ShiftedMask;
    uint64_t OneChunk = UImm | ShiftedMask;
    uint64_t RotatedImm = (UImm << 32) | (UImm >> 32);
    uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask);
    if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) ||
        AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) ||
        AArch64_AM::processLogicalImmediate(ReplicateChunk,
                                            BitSize, Encoding)) {
      // Create the ORR-immediate instruction.
      MachineInstrBuilder MIB =
          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
              .add(MI.getOperand(0))
              .addReg(AArch64::XZR)
              .addImm(Encoding);

      // Create the MOVK instruction.
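      // (Illustrative example: for UImm = 0x00FF123400FF00FF the loop finds
      // the ORR immediate 0x00FF00FF00FF00FF by replicating the other half,
      // and the MOVK created here reinserts 0x1234 at LSL #32.)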
      const unsigned Imm16 = getChunk(UImm, Shift / 16);
      const unsigned DstReg = MI.getOperand(0).getReg();
      const bool DstIsDead = MI.getOperand(0).isDead();
      MachineInstrBuilder MIB1 =
          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
              .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
              .addReg(DstReg)
              .addImm(Imm16)
              .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));

      transferImpOps(MI, MIB, MIB1);
      MI.eraseFromParent();
      return true;
    }
  }

  // FIXME: Add more two-instruction sequences.

  // Three instruction sequences.
  //
  // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly
  // the fastest sequence with fast literal generation. (If neither MOVK is
  // part of a fast literal generation pair, it could be slower than the
  // four-instruction sequence, but we won't worry about that for now.)
  if (OneChunks || ZeroChunks)
    return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);

  // Check for identical 16-bit chunks within the constant and if so
  // materialize them with a single ORR instruction. The remaining one or two
  // 16-bit chunks will be materialized with MOVK instructions.
  if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
    return true;

  // Check whether the constant contains a sequence of contiguous ones, which
  // might be interrupted by one or two chunks. If so, materialize the sequence
  // of contiguous ones with an ORR instruction. Materialize the chunks which
  // are either interrupting the sequence or outside of the sequence with a
  // MOVK instruction.
  if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
    return true;

  // We found no possible two or three instruction sequence; use the general
  // four-instruction sequence.
  return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
}

/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
bool AArch64ExpandPseudo::expandMOVImmSimple(MachineBasicBlock &MBB,
                                             MachineBasicBlock::iterator MBBI,
                                             unsigned BitSize,
                                             unsigned OneChunks,
                                             unsigned ZeroChunks) {
  MachineInstr &MI = *MBBI;
  unsigned DstReg = MI.getOperand(0).getReg();
  uint64_t Imm = MI.getOperand(1).getImm();
  const unsigned Mask = 0xFFFF;

  // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
  // more MOVK instructions to insert additional 16-bit portions into the
  // lower bits.
  bool isNeg = false;

  // Use MOVN to materialize the high bits if we have more all one chunks
  // than all zero chunks.
  if (OneChunks > ZeroChunks) {
    isNeg = true;
    Imm = ~Imm;
  }

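  // (Illustrative example: 0xFFFFFFFFFFFF1234 has three all-one chunks, so it
  // is inverted to 0xEDCB and materialized with a single MOVN #0xEDCB.)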
  unsigned FirstOpc;
  if (BitSize == 32) {
    Imm &= (1LL << 32) - 1;
    FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
  } else {
    FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
  }
  unsigned Shift = 0;     // LSL amount for high bits with MOVZ/MOVN
  unsigned LastShift = 0; // LSL amount for last MOVK
  if (Imm != 0) {
    unsigned LZ = countLeadingZeros(Imm);
    unsigned TZ = countTrailingZeros(Imm);
    Shift = (TZ / 16) * 16;
    LastShift = ((63 - LZ) / 16) * 16;
  }
  unsigned Imm16 = (Imm >> Shift) & Mask;
  bool DstIsDead = MI.getOperand(0).isDead();
  MachineInstrBuilder MIB1 =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
          .addReg(DstReg, RegState::Define |
                  getDeadRegState(DstIsDead && Shift == LastShift))
          .addImm(Imm16)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));

  // If a MOVN was used for the high bits of a negative value, flip the rest
  // of the bits back for use with MOVK.
  if (isNeg)
    Imm = ~Imm;

  if (Shift == LastShift) {
    transferImpOps(MI, MIB1, MIB1);
    MI.eraseFromParent();
    return true;
  }

  MachineInstrBuilder MIB2;
  unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
  while (Shift < LastShift) {
    Shift += 16;
    Imm16 = (Imm >> Shift) & Mask;
    if (Imm16 == (isNeg ? Mask : 0))
      continue; // This 16-bit portion is already set correctly.
    MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
               .addReg(DstReg,
                       RegState::Define |
                           getDeadRegState(DstIsDead && Shift == LastShift))
               .addReg(DstReg)
               .addImm(Imm16)
               .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
  }

  transferImpOps(MI, MIB1, MIB2);
  MI.eraseFromParent();
  return true;
}

bool AArch64ExpandPseudo::expandCMP_SWAP(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
    unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
    MachineBasicBlock::iterator &NextMBBI) {
  MachineInstr &MI = *MBBI;
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Dest = MI.getOperand(0);
  unsigned StatusReg = MI.getOperand(1).getReg();
  bool StatusDead = MI.getOperand(1).isDead();
  // Duplicating undef operands into 2 instructions does not guarantee the same
  // value on both; however, undef should be replaced by xzr anyway.
  assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
  unsigned AddrReg = MI.getOperand(2).getReg();
  unsigned DesiredReg = MI.getOperand(3).getReg();
  unsigned NewReg = MI.getOperand(4).getReg();

  MachineFunction *MF = MBB.getParent();
  auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
  auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
  auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());

  MF->insert(++MBB.getIterator(), LoadCmpBB);
  MF->insert(++LoadCmpBB->getIterator(), StoreBB);
  MF->insert(++StoreBB->getIterator(), DoneBB);

  // .Lloadcmp:
  //     mov wStatus, 0
  //     ldaxr xDest, [xAddr]
  //     cmp xDest, xDesired
  //     b.ne .Ldone
  if (!StatusDead)
    BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg)
        .addImm(0).addImm(0);
  BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
      .addReg(AddrReg);
  BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
      .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
      .addReg(DesiredReg)
      .addImm(ExtendImm);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
      .addImm(AArch64CC::NE)
      .addMBB(DoneBB)
      .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill);
  LoadCmpBB->addSuccessor(DoneBB);
  LoadCmpBB->addSuccessor(StoreBB);

  // .Lstore:
  //     stlxr wStatus, xNew, [xAddr]
  //     cbnz wStatus, .Lloadcmp
  BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
      .addReg(NewReg)
      .addReg(AddrReg);
  BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
      .addReg(StatusReg, getKillRegState(StatusDead))
      .addMBB(LoadCmpBB);
  StoreBB->addSuccessor(LoadCmpBB);
  StoreBB->addSuccessor(DoneBB);

  DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
  DoneBB->transferSuccessors(&MBB);

  MBB.addSuccessor(LoadCmpBB);

  NextMBBI = MBB.end();
  MI.eraseFromParent();

  // Recompute live-in lists.
  LivePhysRegs LiveRegs;
  computeAndAddLiveIns(LiveRegs, *DoneBB);
  computeAndAddLiveIns(LiveRegs, *StoreBB);
  computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
  // Do an extra pass around the loop to get loop carried registers right.
  StoreBB->clearLiveIns();
  computeAndAddLiveIns(LiveRegs, *StoreBB);
  LoadCmpBB->clearLiveIns();
  computeAndAddLiveIns(LiveRegs, *LoadCmpBB);

  return true;
}

bool AArch64ExpandPseudo::expandCMP_SWAP_128(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    MachineBasicBlock::iterator &NextMBBI) {
  MachineInstr &MI = *MBBI;
  DebugLoc DL = MI.getDebugLoc();
  MachineOperand &DestLo = MI.getOperand(0);
  MachineOperand &DestHi = MI.getOperand(1);
  unsigned StatusReg = MI.getOperand(2).getReg();
  bool StatusDead = MI.getOperand(2).isDead();
  // Duplicating undef operands into 2 instructions does not guarantee the same
  // value on both; however, undef should be replaced by xzr anyway.
  assert(!MI.getOperand(3).isUndef() && "cannot handle undef");
  unsigned AddrReg = MI.getOperand(3).getReg();
  unsigned DesiredLoReg = MI.getOperand(4).getReg();
  unsigned DesiredHiReg = MI.getOperand(5).getReg();
  unsigned NewLoReg = MI.getOperand(6).getReg();
  unsigned NewHiReg = MI.getOperand(7).getReg();

  MachineFunction *MF = MBB.getParent();
  auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
  auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
  auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());

  MF->insert(++MBB.getIterator(), LoadCmpBB);
  MF->insert(++LoadCmpBB->getIterator(), StoreBB);
  MF->insert(++StoreBB->getIterator(), DoneBB);

  // .Lloadcmp:
  //     ldaxp xDestLo, xDestHi, [xAddr]
  //     cmp xDestLo, xDesiredLo
  //     sbcs xDestHi, xDesiredHi
  //     b.ne .Ldone
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
      .addReg(DestLo.getReg(), RegState::Define)
      .addReg(DestHi.getReg(), RegState::Define)
      .addReg(AddrReg);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
      .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
      .addReg(DesiredLoReg)
      .addImm(0);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
      .addUse(AArch64::WZR)
      .addUse(AArch64::WZR)
      .addImm(AArch64CC::EQ);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
      .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
      .addReg(DesiredHiReg)
      .addImm(0);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
      .addUse(StatusReg, RegState::Kill)
      .addUse(StatusReg, RegState::Kill)
      .addImm(AArch64CC::EQ);
  BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
      .addUse(StatusReg, getKillRegState(StatusDead))
      .addMBB(DoneBB);
  LoadCmpBB->addSuccessor(DoneBB);
  LoadCmpBB->addSuccessor(StoreBB);

  // .Lstore:
  //     stlxp wStatus, xNewLo, xNewHi, [xAddr]
  //     cbnz wStatus, .Lloadcmp
  BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
      .addReg(NewLoReg)
      .addReg(NewHiReg)
      .addReg(AddrReg);
  BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
      .addReg(StatusReg, getKillRegState(StatusDead))
      .addMBB(LoadCmpBB);
  StoreBB->addSuccessor(LoadCmpBB);
  StoreBB->addSuccessor(DoneBB);

  DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
  DoneBB->transferSuccessors(&MBB);

  MBB.addSuccessor(LoadCmpBB);

  NextMBBI = MBB.end();
  MI.eraseFromParent();

  // Recompute liveness bottom up.
  LivePhysRegs LiveRegs;
  computeAndAddLiveIns(LiveRegs, *DoneBB);
  computeAndAddLiveIns(LiveRegs, *StoreBB);
  computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
  // Do an extra pass in the loop to get the loop carried dependencies right.
  StoreBB->clearLiveIns();
  computeAndAddLiveIns(LiveRegs, *StoreBB);
  LoadCmpBB->clearLiveIns();
  computeAndAddLiveIns(LiveRegs, *LoadCmpBB);

  return true;
}

/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
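/// Expansions that rewrite the control flow (the CMP_SWAP forms) update
/// \p NextMBBI so that the caller resumes scanning at the correct point.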
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   MachineBasicBlock::iterator &NextMBBI) {
  MachineInstr &MI = *MBBI;
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  default:
    break;

  case AArch64::ADDWrr:
  case AArch64::SUBWrr:
  case AArch64::ADDXrr:
  case AArch64::SUBXrr:
  case AArch64::ADDSWrr:
  case AArch64::SUBSWrr:
  case AArch64::ADDSXrr:
  case AArch64::SUBSXrr:
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::BICWrr:
  case AArch64::BICXrr:
  case AArch64::ANDSWrr:
  case AArch64::ANDSXrr:
  case AArch64::BICSWrr:
  case AArch64::BICSXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::ORNWrr:
  case AArch64::ORNXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr: {
    unsigned Opcode;
    switch (MI.getOpcode()) {
    default:
      return false;
    case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break;
    case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break;
    case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break;
    case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break;
    case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break;
    case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break;
    case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break;
    case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break;
    case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break;
    case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break;
    case AArch64::BICWrr: Opcode = AArch64::BICWrs; break;
    case AArch64::BICXrr: Opcode = AArch64::BICXrs; break;
    case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break;
    case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break;
    case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break;
    case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break;
    case AArch64::EONWrr: Opcode = AArch64::EONWrs; break;
    case AArch64::EONXrr: Opcode = AArch64::EONXrs; break;
    case AArch64::EORWrr: Opcode = AArch64::EORWrs; break;
    case AArch64::EORXrr: Opcode = AArch64::EORXrs; break;
    case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break;
    case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break;
    case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break;
    case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break;
    }
    MachineInstrBuilder MIB1 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
                MI.getOperand(0).getReg())
            .add(MI.getOperand(1))
            .add(MI.getOperand(2))
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    transferImpOps(MI, MIB1, MIB1);
    MI.eraseFromParent();
    return true;
  }

  case AArch64::LOADgot: {
    // Expand into ADRP + LDR.
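    // (For a GOT access this is roughly, in ELF syntax:
    //    adrp xD, :got:sym
    //    ldr  xD, [xD, :got_lo12:sym])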
    unsigned DstReg = MI.getOperand(0).getReg();
    const MachineOperand &MO1 = MI.getOperand(1);
    unsigned Flags = MO1.getTargetFlags();
    MachineInstrBuilder MIB1 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
    MachineInstrBuilder MIB2 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
            .add(MI.getOperand(0))
            .addReg(DstReg);

    if (MO1.isGlobal()) {
      MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
      MIB2.addGlobalAddress(MO1.getGlobal(), 0,
                            Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    } else if (MO1.isSymbol()) {
      MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE);
      MIB2.addExternalSymbol(MO1.getSymbolName(),
                             Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    } else {
      assert(MO1.isCPI() &&
             "Only expect globals, externalsymbols, or constant pools");
      MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
                                Flags | AArch64II::MO_PAGE);
      MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
                                Flags | AArch64II::MO_PAGEOFF |
                                    AArch64II::MO_NC);
    }

    transferImpOps(MI, MIB1, MIB2);
    MI.eraseFromParent();
    return true;
  }

  case AArch64::MOVaddr:
  case AArch64::MOVaddrJT:
  case AArch64::MOVaddrCP:
  case AArch64::MOVaddrBA:
  case AArch64::MOVaddrTLS:
  case AArch64::MOVaddrEXT: {
    // Expand into ADRP + ADD.
    unsigned DstReg = MI.getOperand(0).getReg();
    MachineInstrBuilder MIB1 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
            .add(MI.getOperand(1));

    MachineInstrBuilder MIB2 =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
            .add(MI.getOperand(0))
            .addReg(DstReg)
            .add(MI.getOperand(2))
            .addImm(0);

    transferImpOps(MI, MIB1, MIB2);
    MI.eraseFromParent();
    return true;
  }
  case AArch64::ADDlowTLS:
    // Produce a plain ADD
    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .add(MI.getOperand(2))
        .addImm(0);
    MI.eraseFromParent();
    return true;

  case AArch64::MOVbaseTLS: {
    unsigned DstReg = MI.getOperand(0).getReg();
    auto SysReg = AArch64SysReg::TPIDR_EL0;
    MachineFunction *MF = MBB.getParent();
    if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
        MF->getTarget().getCodeModel() == CodeModel::Kernel)
      SysReg = AArch64SysReg::TPIDR_EL1;
    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
        .addImm(SysReg);
    MI.eraseFromParent();
    return true;
  }

  case AArch64::MOVi32imm:
    return expandMOVImm(MBB, MBBI, 32);
  case AArch64::MOVi64imm:
    return expandMOVImm(MBB, MBBI, 64);
  case AArch64::RET_ReallyLR: {
    // Hiding the LR use with RET_ReallyLR may lead to extra kills in the
    // function and missing live-ins. We are fine in practice because callee
    // saved register handling ensures the register value is restored before
    // RET, but we need the undef flag here to appease the MachineVerifier
    // liveness checks.
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET))
            .addReg(AArch64::LR, RegState::Undef);
    transferImpOps(MI, MIB, MIB);
    MI.eraseFromParent();
    return true;
  }
  case AArch64::CMP_SWAP_8:
    return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB,
                          AArch64::SUBSWrx,
                          AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0),
                          AArch64::WZR, NextMBBI);
  case AArch64::CMP_SWAP_16:
    return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH,
                          AArch64::SUBSWrx,
                          AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0),
                          AArch64::WZR, NextMBBI);
  case AArch64::CMP_SWAP_32:
    return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW,
                          AArch64::SUBSWrs,
                          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
                          AArch64::WZR, NextMBBI);
  case AArch64::CMP_SWAP_64:
    return expandCMP_SWAP(MBB, MBBI,
                          AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs,
                          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
                          AArch64::XZR, NextMBBI);
  case AArch64::CMP_SWAP_128:
    return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);

  case AArch64::AESMCrrTied:
  case AArch64::AESIMCrrTied: {
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, MI.getDebugLoc(),
                TII->get(Opcode == AArch64::AESMCrrTied ? AArch64::AESMCrr :
                                                          AArch64::AESIMCrr))
            .add(MI.getOperand(0))
            .add(MI.getOperand(1));
    transferImpOps(MI, MIB, MIB);
    MI.eraseFromParent();
    return true;
  }
  }
  return false;
}

/// Iterate over the instructions in basic block MBB and expand any
/// pseudo instructions. Return true if anything was modified.
bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
  bool Modified = false;

  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
    Modified |= expandMI(MBB, MBBI, NMBBI);
    MBBI = NMBBI;
  }

  return Modified;
}

bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());

  bool Modified = false;
  for (auto &MBB : MF)
    Modified |= expandMBB(MBB);
  return Modified;
}

/// Returns an instance of the pseudo instruction expansion pass.
FunctionPass *llvm::createAArch64ExpandPseudoPass() {
  return new AArch64ExpandPseudo();
}