//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Shrink Instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

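/// Returns true if \p MO is a register operand that lives in a VGPR register
/// class, using the virtual register's class when available and the physical
/// register class otherwise.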
static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
                   const MachineRegisterInfo &MRI) {
  if (!MO->isReg())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));

  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
}

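/// Returns true if \p MI is a candidate for the 32-bit VOP encoding: no
/// source or output modifiers and no clamp, src1 (and src2 for V_MAC_F32)
/// must be a VGPR, and three-operand instructions other than V_MAC_F32 and
/// V_CNDMASK_B32 are rejected.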
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
                      const SIRegisterInfo &TRI,
                      const MachineRegisterInfo &MRI) {

  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instruction with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it.  It can only be shrunk if the third operand
  // is vcc.  We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then doing the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
      default: return false;

      case AMDGPU::V_MAC_F32_e64:
        if (!isVGPR(Src2, TRI, MRI) ||
            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
          return false;
        break;

      case AMDGPU::V_CNDMASK_B32_e64:
        break;
    }
  }

  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mod =
      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

  if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
    return false;

  // We don't need to check src0 here; all input types are legal for src0, so
  // just make sure it isn't using any modifiers.
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Check output modifiers
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}

/// \brief This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can.  This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
/// and will only fold literal constants if we are still in SSA.
static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {

  if (!MRI.isSSA())
    return;

  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  // Only one literal constant is allowed per instruction, so if src0 is a
  // literal constant then we can't do any folding.
  if (Src0.isImm() &&
      TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
    return;

  // Literal constants and SGPRs can only be used in src0, so if src0 is an
  // SGPR, we cannot commute the instruction to free up src0, and therefore
  // we can't fold any literal constants.
  if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
    return;

  // Try to fold Src0
  if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
    unsigned Reg = Src0.getReg();
    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &MovSrc = Def->getOperand(1);
      bool ConstantFolded = false;

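      // Only fold immediates that fit in 32 bits, which is all the literal
      // field can hold.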
      if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
        Src0.ChangeToImmediate(MovSrc.getImm());
        ConstantFolded = true;
      }
      if (ConstantFolded) {
        if (MRI.use_empty(Reg))
          Def->eraseFromParent();
        ++NumLiteralConstantsFolded;
        return;
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
    foldImmediates(MI, TII, MRI, false);
}

// Copy the kill/undef flags from Orig onto MI's implicit VCC use operand.
static void copyFlagsToImplicitVCC(MachineInstr &MI,
                                   const MachineOperand &Orig) {

  for (MachineOperand &Use : MI.implicit_operands()) {
    if (Use.getReg() == AMDGPU::VCC) {
      Use.setIsUndef(Orig.isUndef());
      Use.setIsKill(Orig.isKill());
      return;
    }
  }
}

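/// Returns true if \p Src is a 16-bit immediate that is not already an inline
/// constant, i.e. a candidate for the SOPK forms (S_MOVK_I32, S_ADDK_I32,
/// S_MULK_I32).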
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after the
        // other optimizations have run, because this transformation would
        // confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int64_t Imm = Src.getImm();
          if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
            int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm));
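            // Only worthwhile if the reversed value is an integer inline
            // immediate (-16..64), so V_BFREV_B32 needs no literal constant.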
            if (ReverseImm >= -16 && ReverseImm <= 64) {
              MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
              Src.setImm(ReverseImm);
              continue;
            }
          }
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure the combined wait does not exceed the maximum of 8 cycles
        // that can be encoded.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand &Dest = MI.getOperand(0);
        const MachineOperand &Src0 = MI.getOperand(1);
        const MachineOperand &Src1 = MI.getOperand(2);

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
            Src0.isReg()) {
          MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
          continue;
        }

        if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
          if (Src1.isImm() && isKImmOperand(TII, Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

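            // S_ADDK_I32 / S_MULK_I32 read and write the same register, so
            // rewrite the instruction in place and tie dst to src0.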
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && isKImmOperand(TII, Src))
          MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));

        continue;
      }

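      // Everything below only applies to VALU instructions that have an
      // equivalent 32-bit (VOP) encoding.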
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because VCC is a single register and
          // we cannot deal with sequences which would require multiple copies
          // of VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC, and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // We can shrink this instruction.
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.addOperand(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }

      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.addOperand(*Src1);

      const MachineOperand *Src2 =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.addOperand(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc. This was already added
          // during the initial BuildMI, so find it to preserve the flags.
          copyFlagsToImplicitVCC(*Inst32, *Src2);
        }
      }

      ++NumInstructionsShrunk;
      MI.eraseFromParent();

      foldImmediates(*Inst32, TII, MRI);
      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}