Home | History | Annotate | Download | only in NVPTX
      1 //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // Top-level implementation for the NVPTX target.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "NVPTXTargetMachine.h"
     15 #include "MCTargetDesc/NVPTXMCAsmInfo.h"
     16 #include "NVPTX.h"
     17 #include "NVPTXAllocaHoisting.h"
     18 #include "NVPTXLowerAggrCopies.h"
     19 #include "NVPTXTargetObjectFile.h"
     20 #include "NVPTXTargetTransformInfo.h"
     21 #include "llvm/Analysis/Passes.h"
     22 #include "llvm/CodeGen/AsmPrinter.h"
     23 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
     24 #include "llvm/CodeGen/MachineModuleInfo.h"
     25 #include "llvm/CodeGen/Passes.h"
     26 #include "llvm/CodeGen/TargetPassConfig.h"
     27 #include "llvm/IR/DataLayout.h"
     28 #include "llvm/IR/IRPrintingPasses.h"
     29 #include "llvm/IR/LegacyPassManager.h"
     30 #include "llvm/IR/Verifier.h"
     31 #include "llvm/MC/MCAsmInfo.h"
     32 #include "llvm/MC/MCInstrInfo.h"
     33 #include "llvm/MC/MCStreamer.h"
     34 #include "llvm/MC/MCSubtargetInfo.h"
     35 #include "llvm/Support/CommandLine.h"
     36 #include "llvm/Support/Debug.h"
     37 #include "llvm/Support/FormattedStream.h"
     38 #include "llvm/Support/TargetRegistry.h"
     39 #include "llvm/Support/raw_ostream.h"
     40 #include "llvm/Target/TargetInstrInfo.h"
     41 #include "llvm/Target/TargetLowering.h"
     42 #include "llvm/Target/TargetLoweringObjectFile.h"
     43 #include "llvm/Target/TargetMachine.h"
     44 #include "llvm/Target/TargetOptions.h"
     45 #include "llvm/Target/TargetRegisterInfo.h"
     46 #include "llvm/Target/TargetSubtargetInfo.h"
     47 #include "llvm/Transforms/Scalar.h"
     48 #include "llvm/Transforms/Scalar/GVN.h"
     49 
     50 using namespace llvm;
     51 
     52 static cl::opt<bool> UseInferAddressSpaces(
     53     "nvptx-use-infer-addrspace", cl::init(false), cl::Hidden,
     54     cl::desc("Optimize address spaces using NVPTXInferAddressSpaces instead of "
     55              "NVPTXFavorNonGenericAddrSpaces"));
     56 
     57 namespace llvm {
     58 void initializeNVVMIntrRangePass(PassRegistry&);
     59 void initializeNVVMReflectPass(PassRegistry&);
     60 void initializeGenericToNVVMPass(PassRegistry&);
     61 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
     62 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
     63 void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
     64 void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
     65 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
     66 void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
     67 void initializeNVPTXLowerAllocaPass(PassRegistry &);
     68 }
     69 
     70 extern "C" void LLVMInitializeNVPTXTarget() {
     71   // Register the target.
     72   RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
     73   RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
     74 
     75   // FIXME: This pass is really intended to be invoked during IR optimization,
     76   // but it's very NVPTX-specific.
     77   PassRegistry &PR = *PassRegistry::getPassRegistry();
     78   initializeNVVMReflectPass(PR);
     79   initializeNVVMIntrRangePass(PR);
     80   initializeGenericToNVVMPass(PR);
     81   initializeNVPTXAllocaHoistingPass(PR);
     82   initializeNVPTXAssignValidGlobalNamesPass(PR);
     83   initializeNVPTXFavorNonGenericAddrSpacesPass(PR);
     84   initializeNVPTXInferAddressSpacesPass(PR);
     85   initializeNVPTXLowerKernelArgsPass(PR);
     86   initializeNVPTXLowerAllocaPass(PR);
     87   initializeNVPTXLowerAggrCopiesPass(PR);
     88 }
     89 
     90 static std::string computeDataLayout(bool is64Bit) {
     91   std::string Ret = "e";
     92 
     93   if (!is64Bit)
     94     Ret += "-p:32:32";
     95 
     96   Ret += "-i64:64-v16:16-v32:32-n16:32:64";
     97 
     98   return Ret;
     99 }
    100 
    101 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
    102                                        StringRef CPU, StringRef FS,
    103                                        const TargetOptions &Options,
    104                                        Optional<Reloc::Model> RM,
    105                                        CodeModel::Model CM,
    106                                        CodeGenOpt::Level OL, bool is64bit)
    107     // The pic relocation model is used regardless of what the client has
    108     // specified, as it is the only relocation model currently supported.
    109     : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
    110                         Reloc::PIC_, CM, OL),
    111       is64bit(is64bit),
    112       TLOF(make_unique<NVPTXTargetObjectFile>()),
    113       Subtarget(TT, CPU, FS, *this) {
    114   if (TT.getOS() == Triple::NVCL)
    115     drvInterface = NVPTX::NVCL;
    116   else
    117     drvInterface = NVPTX::CUDA;
    118   initAsmInfo();
    119 }
    120 
    121 NVPTXTargetMachine::~NVPTXTargetMachine() {}
    122 
    123 void NVPTXTargetMachine32::anchor() {}
    124 
    125 NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
    126                                            StringRef CPU, StringRef FS,
    127                                            const TargetOptions &Options,
    128                                            Optional<Reloc::Model> RM,
    129                                            CodeModel::Model CM,
    130                                            CodeGenOpt::Level OL)
    131     : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
    132 
    133 void NVPTXTargetMachine64::anchor() {}
    134 
    135 NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
    136                                            StringRef CPU, StringRef FS,
    137                                            const TargetOptions &Options,
    138                                            Optional<Reloc::Model> RM,
    139                                            CodeModel::Model CM,
    140                                            CodeGenOpt::Level OL)
    141     : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
    142 
    143 namespace {
    144 class NVPTXPassConfig : public TargetPassConfig {
    145 public:
    146   NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
    147       : TargetPassConfig(TM, PM) {}
    148 
    149   NVPTXTargetMachine &getNVPTXTargetMachine() const {
    150     return getTM<NVPTXTargetMachine>();
    151   }
    152 
    153   void addIRPasses() override;
    154   bool addInstSelector() override;
    155   void addPostRegAlloc() override;
    156   void addMachineSSAOptimization() override;
    157 
    158   FunctionPass *createTargetRegisterAllocator(bool) override;
    159   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
    160   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
    161 
    162 private:
    163   // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
    164   // function is only called in opt mode.
    165   void addEarlyCSEOrGVNPass();
    166 
    167   // Add passes that propagate special memory spaces.
    168   void addAddressSpaceInferencePasses();
    169 
    170   // Add passes that perform straight-line scalar optimizations.
    171   void addStraightLineScalarOptimizationPasses();
    172 };
    173 } // end anonymous namespace
    174 
    175 TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
    176   return new NVPTXPassConfig(this, PM);
    177 }
    178 
    179 void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
    180   PM.add(createNVVMReflectPass());
    181   PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
    182 }
    183 
    184 TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
    185   return TargetIRAnalysis([this](const Function &F) {
    186     return TargetTransformInfo(NVPTXTTIImpl(this, F));
    187   });
    188 }
    189 
    190 void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
    191   if (getOptLevel() == CodeGenOpt::Aggressive)
    192     addPass(createGVNPass());
    193   else
    194     addPass(createEarlyCSEPass());
    195 }
    196 
    197 void NVPTXPassConfig::addAddressSpaceInferencePasses() {
    198   // NVPTXLowerKernelArgs emits alloca for byval parameters which can often
    199   // be eliminated by SROA.
    200   addPass(createSROAPass());
    201   addPass(createNVPTXLowerAllocaPass());
    202   if (UseInferAddressSpaces) {
    203     addPass(createNVPTXInferAddressSpacesPass());
    204   } else {
    205     addPass(createNVPTXFavorNonGenericAddrSpacesPass());
    206     // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave
    207     // them unused. We could remove dead code in an ad-hoc manner, but that
    208     // requires manual work and might be error-prone.
    209     addPass(createDeadCodeEliminationPass());
    210   }
    211 }
    212 
    213 void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
    214   addPass(createSeparateConstOffsetFromGEPPass());
    215   addPass(createSpeculativeExecutionPass());
    216   // ReassociateGEPs exposes more opportunites for SLSR. See
    217   // the example in reassociate-geps-and-slsr.ll.
    218   addPass(createStraightLineStrengthReducePass());
    219   // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
    220   // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
    221   // for some of our benchmarks.
    222   addEarlyCSEOrGVNPass();
    223   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
    224   addPass(createNaryReassociatePass());
    225   // NaryReassociate on GEPs creates redundant common expressions, so run
    226   // EarlyCSE after it.
    227   addPass(createEarlyCSEPass());
    228 }
    229 
    230 void NVPTXPassConfig::addIRPasses() {
    231   // The following passes are known to not play well with virtual regs hanging
    232   // around after register allocation (which in our case, is *all* registers).
    233   // We explicitly disable them here.  We do, however, need some functionality
    234   // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
    235   // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
    236   disablePass(&PrologEpilogCodeInserterID);
    237   disablePass(&MachineCopyPropagationID);
    238   disablePass(&TailDuplicateID);
    239   disablePass(&StackMapLivenessID);
    240   disablePass(&LiveDebugValuesID);
    241   disablePass(&PostRASchedulerID);
    242   disablePass(&FuncletLayoutID);
    243   disablePass(&PatchableFunctionID);
    244 
    245   // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
    246   // it here does nothing.  But since we need it for correctness when lowering
    247   // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
    248   // call addEarlyAsPossiblePasses.
    249   addPass(createNVVMReflectPass());
    250 
    251   if (getOptLevel() != CodeGenOpt::None)
    252     addPass(createNVPTXImageOptimizerPass());
    253   addPass(createNVPTXAssignValidGlobalNamesPass());
    254   addPass(createGenericToNVVMPass());
    255 
    256   // NVPTXLowerKernelArgs is required for correctness and should be run right
    257   // before the address space inference passes.
    258   addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
    259   if (getOptLevel() != CodeGenOpt::None) {
    260     addAddressSpaceInferencePasses();
    261     addStraightLineScalarOptimizationPasses();
    262   }
    263 
    264   // === LSR and other generic IR passes ===
    265   TargetPassConfig::addIRPasses();
    266   // EarlyCSE is not always strong enough to clean up what LSR produces. For
    267   // example, GVN can combine
    268   //
    269   //   %0 = add %a, %b
    270   //   %1 = add %b, %a
    271   //
    272   // and
    273   //
    274   //   %0 = shl nsw %a, 2
    275   //   %1 = shl %a, 2
    276   //
    277   // but EarlyCSE can do neither of them.
    278   if (getOptLevel() != CodeGenOpt::None)
    279     addEarlyCSEOrGVNPass();
    280 }
    281 
    282 bool NVPTXPassConfig::addInstSelector() {
    283   const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
    284 
    285   addPass(createLowerAggrCopies());
    286   addPass(createAllocaHoisting());
    287   addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
    288 
    289   if (!ST.hasImageHandles())
    290     addPass(createNVPTXReplaceImageHandlesPass());
    291 
    292   return false;
    293 }
    294 
    295 void NVPTXPassConfig::addPostRegAlloc() {
    296   addPass(createNVPTXPrologEpilogPass(), false);
    297   if (getOptLevel() != CodeGenOpt::None) {
    298     // NVPTXPrologEpilogPass calculates frame object offset and replace frame
    299     // index with VRFrame register. NVPTXPeephole need to be run after that and
    300     // will replace VRFrame with VRFrameLocal when possible.
    301     addPass(createNVPTXPeephole());
    302   }
    303 }
    304 
    305 FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
    306   return nullptr; // No reg alloc
    307 }
    308 
    309 void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
    310   assert(!RegAllocPass && "NVPTX uses no regalloc!");
    311   addPass(&PHIEliminationID);
    312   addPass(&TwoAddressInstructionPassID);
    313 }
    314 
    315 void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
    316   assert(!RegAllocPass && "NVPTX uses no regalloc!");
    317 
    318   addPass(&ProcessImplicitDefsID);
    319   addPass(&LiveVariablesID);
    320   addPass(&MachineLoopInfoID);
    321   addPass(&PHIEliminationID);
    322 
    323   addPass(&TwoAddressInstructionPassID);
    324   addPass(&RegisterCoalescerID);
    325 
    326   // PreRA instruction scheduling.
    327   if (addPass(&MachineSchedulerID))
    328     printAndVerify("After Machine Scheduling");
    329 
    330 
    331   addPass(&StackSlotColoringID);
    332 
    333   // FIXME: Needs physical registers
    334   //addPass(&PostRAMachineLICMID);
    335 
    336   printAndVerify("After StackSlotColoring");
    337 }
    338 
    339 void NVPTXPassConfig::addMachineSSAOptimization() {
    340   // Pre-ra tail duplication.
    341   if (addPass(&EarlyTailDuplicateID))
    342     printAndVerify("After Pre-RegAlloc TailDuplicate");
    343 
    344   // Optimize PHIs before DCE: removing dead PHI cycles may make more
    345   // instructions dead.
    346   addPass(&OptimizePHIsID);
    347 
    348   // This pass merges large allocas. StackSlotColoring is a different pass
    349   // which merges spill slots.
    350   addPass(&StackColoringID);
    351 
    352   // If the target requests it, assign local variables to stack slots relative
    353   // to one another and simplify frame index references where possible.
    354   addPass(&LocalStackSlotAllocationID);
    355 
    356   // With optimization, dead code should already be eliminated. However
    357   // there is one known exception: lowered code for arguments that are only
    358   // used by tail calls, where the tail calls reuse the incoming stack
    359   // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
    360   addPass(&DeadMachineInstructionElimID);
    361   printAndVerify("After codegen DCE pass");
    362 
    363   // Allow targets to insert passes that improve instruction level parallelism,
    364   // like if-conversion. Such passes will typically need dominator trees and
    365   // loop info, just like LICM and CSE below.
    366   if (addILPOpts())
    367     printAndVerify("After ILP optimizations");
    368 
    369   addPass(&MachineLICMID);
    370   addPass(&MachineCSEID);
    371 
    372   addPass(&MachineSinkingID);
    373   printAndVerify("After Machine LICM, CSE and Sinking passes");
    374 
    375   addPass(&PeepholeOptimizerID);
    376   printAndVerify("After codegen peephole optimization pass");
    377 }
    378