//===-- AArch64AdvSIMDScalar.cpp - Use AdvSIMD scalar instructions -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// When profitable, replace GPR targeting i64 instructions with their
// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
// as minimizing the number of cross-class register copies.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// TODO: Graph-based predicate heuristics.
// Walking the instruction list linearly will get many, perhaps most, of
// the cases, but to do a truly thorough job of this, we need a more
// holistic approach.
//
// This optimization is very similar in spirit to the register allocator's
// spill placement, only here we're determining where to place cross-class
// register copies rather than spills. As such, a similar approach is
// called for.
//
// We want to build up a set of graphs of all instructions which are candidates
// for transformation along with instructions which generate their inputs and
// consume their outputs. For each edge in the graph, we assign a weight
// based on whether there is a copy required there (weight zero if not) and
// the block frequency of the block containing the defining or using
// instruction, whichever is less. Our optimization is then a graph problem
// to minimize the total weight of all the graphs, then transform instructions
// and add or remove copy instructions as called for to implement the
// solution.
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-simd-scalar"

// Allow forcing all i64 operations with equivalent SIMD instructions to use
// them. For stress-testing the transformation function.
static cl::opt<bool>
TransformAll("aarch64-simd-scalar-force-all",
             cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
             cl::init(false), cl::Hidden);

STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");

namespace llvm {
void initializeAArch64AdvSIMDScalarPass(PassRegistry &);
}

#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"

namespace {
class AArch64AdvSIMDScalar : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const TargetInstrInfo *TII;

private:
  // isProfitableToTransform - Predicate function to determine whether an
  // instruction should be transformed to its equivalent AdvSIMD scalar
  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
  bool isProfitableToTransform(const MachineInstr *MI) const;

  // transformInstruction - Perform the transformation of an instruction
  // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
  // to be the correct register class, minimizing cross-class copies.
  void transformInstruction(MachineInstr *MI);

  // processMachineBasicBlock - Main optimization loop.
  bool processMachineBasicBlock(MachineBasicBlock *MBB);

public:
  static char ID; // Pass identification, replacement for typeid.
  explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
    initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &F) override;

  const char *getPassName() const override {
    return AARCH64_ADVSIMD_NAME;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
char AArch64AdvSIMDScalar::ID = 0;
} // end anonymous namespace

INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
                AARCH64_ADVSIMD_NAME, false, false)

static bool isGPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (SubReg)
    return false;
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
  return AArch64::GPR64RegClass.contains(Reg);
}

static bool isFPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
            SubReg == 0) ||
           (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
            SubReg == AArch64::dsub);
  // Physical register references just check the register class directly.
  return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
         (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
}

// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
// copy instruction. Return zero_reg if the instruction is not a copy.
static unsigned getSrcFromCopy(const MachineInstr *MI,
                               const MachineRegisterInfo *MRI,
                               unsigned &SubReg) {
  SubReg = 0;
  // The "FMOV Xd, Dn" instruction is the typical form.
  if (MI->getOpcode() == AArch64::FMOVDXr ||
      MI->getOpcode() == AArch64::FMOVXDr)
    return MI->getOperand(1).getReg();
  // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
  // these at this stage, but it's easy to check for.
  if (MI->getOpcode() == AArch64::UMOVvi64 &&
      MI->getOperand(2).getImm() == 0) {
    SubReg = AArch64::dsub;
    return MI->getOperand(1).getReg();
  }
  // Or just a plain COPY instruction. This can be directly to/from FPR64,
  // or it can be a dsub subreg reference to an FPR128.
  if (MI->getOpcode() == AArch64::COPY) {
    if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
                MRI))
      return MI->getOperand(1).getReg();
    if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
                MRI)) {
      SubReg = MI->getOperand(1).getSubReg();
      return MI->getOperand(1).getReg();
    }
  }

  // Otherwise, this is some other kind of instruction.
  return 0;
}

// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
// that we're considering transforming to, return that AdvSIMD opcode. For all
// others, return the original opcode.
static unsigned getTransformOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    break;
  // FIXME: Lots more possibilities.
  case AArch64::ADDXrr:
    return AArch64::ADDv1i64;
  case AArch64::SUBXrr:
    return AArch64::SUBv1i64;
  case AArch64::ANDXrr:
    return AArch64::ANDv8i8;
  case AArch64::EORXrr:
    return AArch64::EORv8i8;
  case AArch64::ORRXrr:
    return AArch64::ORRv8i8;
  }
  // No AdvSIMD equivalent, so just return the original opcode.
  return Opc;
}

static bool isTransformable(const MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();
  return Opc != getTransformOpcode(Opc);
}

// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
bool
AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
  // If this instruction isn't eligible to be transformed (no SIMD equivalent),
  // early exit since that's the common case.
  if (!isTransformable(MI))
    return false;

  // Count the number of copies we'll need to add and approximate the number
  // of copies that a transform will enable us to remove.
  unsigned NumNewCopies = 3;
  unsigned NumRemovableCopies = 0;

  unsigned OrigSrc0 = MI->getOperand(1).getReg();
  unsigned OrigSrc1 = MI->getOperand(2).getReg();
  unsigned Src0 = 0, SubReg0;
  unsigned Src1 = 0, SubReg1;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If the source was from a copy, we don't need to insert a new copy.
    if (Src0)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
      ++NumRemovableCopies;
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    if (Src1)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
      ++NumRemovableCopies;
  }

  // If any of the uses of the original instruction is a cross-class copy,
  // that's a copy that will be removable if we transform. Likewise, if
  // any of the uses is a transformable instruction, it's likely the transforms
  // will chain, enabling us to save a copy there, too. This is an aggressive
  // heuristic that approximates the graph-based cost analysis described above.
  unsigned Dst = MI->getOperand(0).getReg();
  bool AllUsesAreCopies = true;
  for (MachineRegisterInfo::use_instr_nodbg_iterator
           Use = MRI->use_instr_nodbg_begin(Dst),
           E = MRI->use_instr_nodbg_end();
       Use != E; ++Use) {
    unsigned SubReg;
    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
      ++NumRemovableCopies;
    // If the use is an INSERT_SUBREG, that's still something that can
    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
    // preferable to have it use the FPR64 in most cases, as if the source
    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
    // Ditto for a lane insert.
    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
             Use->getOpcode() == AArch64::INSvi64gpr)
      ;
    else
      AllUsesAreCopies = false;
  }
  // If all of the uses of the original destination register are copies to
  // FPR64, then we won't end up having a new copy back to GPR64 either.
  if (AllUsesAreCopies)
    --NumNewCopies;

  // If a transform will not increase the number of cross-class copies
  // required, return true.
  if (NumNewCopies <= NumRemovableCopies)
    return true;

  // Finally, even if we otherwise wouldn't transform, check if we're forcing
  // transformation of everything.
  return TransformAll;
}

static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI,
                                unsigned Dst, unsigned Src, bool IsKill) {
  MachineInstrBuilder MIB =
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY),
              Dst)
          .addReg(Src, getKillRegState(IsKill));
  DEBUG(dbgs() << " adding copy: " << *MIB);
  ++NumCopiesInserted;
  return MIB;
}

// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
  DEBUG(dbgs() << "Scalar transform: " << *MI);

  MachineBasicBlock *MBB = MI->getParent();
  unsigned OldOpc = MI->getOpcode();
  unsigned NewOpc = getTransformOpcode(OldOpc);
  assert(OldOpc != NewOpc && "transform an instruction to itself?!");

  // Check if we need a copy for the source registers.
  unsigned OrigSrc0 = MI->getOperand(1).getReg();
  unsigned OrigSrc1 = MI->getOperand(2).getReg();
  unsigned Src0 = 0, SubReg0;
  unsigned Src1 = 0, SubReg1;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
      assert(Src0 && "Can't delete copy w/o a valid original source!");
      Def->eraseFromParent();
      ++NumCopiesDeleted;
    }
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
      assert(Src1 && "Can't delete copy w/o a valid original source!");
      Def->eraseFromParent();
      ++NumCopiesDeleted;
    }
  }
  // If we weren't able to reference the original source directly, create a
  // copy.
  if (!Src0) {
    SubReg0 = 0;
    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src0, OrigSrc0, true);
  }
  if (!Src1) {
    SubReg1 = 0;
    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src1, OrigSrc1, true);
  }

  // Create a vreg for the destination.
  // FIXME: No need to do this if the ultimate user expects an FPR64.
  // Check for that and avoid the copy if possible.
  unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);

  // For now, all of the new instructions have the same simple three-register
  // form, so no need to special case based on what instruction we're
  // building.
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
      .addReg(Src0, getKillRegState(true), SubReg0)
      .addReg(Src1, getKillRegState(true), SubReg1);

  // Now copy the result back out to a GPR.
  // FIXME: Try to avoid this if all uses could actually just use the FPR64
  // directly.
  insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);

  // Erase the old instruction.
  MI->eraseFromParent();

  ++NumScalarInsnsUsed;
}

// processMachineBasicBlock - Main optimization loop.
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
  bool Changed = false;
  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
    MachineInstr *MI = I;
    ++I;
    if (isProfitableToTransform(MI)) {
      transformInstruction(MI);
      Changed = true;
    }
  }
  return Changed;
}

// runOnMachineFunction - Pass entry point from PassManager.
bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
  bool Changed = false;
  DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");

  MRI = &mf.getRegInfo();
  TII = mf.getSubtarget().getInstrInfo();

  // Just check things on a one-block-at-a-time basis.
  for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
    if (processMachineBasicBlock(&*I))
      Changed = true;
  return Changed;
}

// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
// to add the pass to the PassManager.
FunctionPass *llvm::createAArch64AdvSIMDScalar() {
  return new AArch64AdvSIMDScalar();
}
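
// Usage sketch (illustrative, not part of this file): the factory above is
// typically wired into the backend pipeline by AArch64TargetMachine before
// register allocation. Assuming a pass-config hook and an enabling flag
// (names here are hypothetical), the call would look roughly like:
//
//   // In AArch64PassConfig::addPreRegAlloc():
//   if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
//     addPass(createAArch64AdvSIMDScalar());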