Home | History | Annotate | Download | only in NVPTX
      1 //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // Top-level implementation for the NVPTX target.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "NVPTXTargetMachine.h"
     15 #include "NVPTX.h"
     16 #include "NVPTXAllocaHoisting.h"
     17 #include "NVPTXLowerAggrCopies.h"
     18 #include "NVPTXTargetObjectFile.h"
     19 #include "NVPTXTargetTransformInfo.h"
     20 #include "llvm/ADT/STLExtras.h"
     21 #include "llvm/ADT/Triple.h"
     22 #include "llvm/Analysis/TargetTransformInfo.h"
     23 #include "llvm/CodeGen/Passes.h"
     24 #include "llvm/CodeGen/TargetPassConfig.h"
     25 #include "llvm/IR/LegacyPassManager.h"
     26 #include "llvm/Pass.h"
     27 #include "llvm/Support/CommandLine.h"
     28 #include "llvm/Support/TargetRegistry.h"
     29 #include "llvm/Target/TargetMachine.h"
     30 #include "llvm/Target/TargetOptions.h"
     31 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
     32 #include "llvm/Transforms/Scalar.h"
     33 #include "llvm/Transforms/Scalar/GVN.h"
     34 #include "llvm/Transforms/Vectorize.h"
     35 #include <cassert>
     36 #include <string>
     37 
     38 using namespace llvm;
     39 
// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

// Read when the TargetMachine is constructed: it selects the data layout
// string and is cached in the UseShortPointers member (see the
// NVPTXTargetMachine constructor).
static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);
     60 
namespace llvm {

// Forward declarations of pass initializers; the definitions live in the
// corresponding pass implementation files. Each is called exactly once from
// LLVMInitializeNVPTXTarget() below to register its pass.
void initializeNVVMIntrRangePass(PassRegistry&);
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);

} // end namespace llvm
     73 
     74 extern "C" void LLVMInitializeNVPTXTarget() {
     75   // Register the target.
     76   RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
     77   RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
     78 
     79   // FIXME: This pass is really intended to be invoked during IR optimization,
     80   // but it's very NVPTX-specific.
     81   PassRegistry &PR = *PassRegistry::getPassRegistry();
     82   initializeNVVMReflectPass(PR);
     83   initializeNVVMIntrRangePass(PR);
     84   initializeGenericToNVVMPass(PR);
     85   initializeNVPTXAllocaHoistingPass(PR);
     86   initializeNVPTXAssignValidGlobalNamesPass(PR);
     87   initializeNVPTXLowerArgsPass(PR);
     88   initializeNVPTXLowerAllocaPass(PR);
     89   initializeNVPTXLowerAggrCopiesPass(PR);
     90 }
     91 
     92 static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
     93   std::string Ret = "e";
     94 
     95   if (!is64Bit)
     96     Ret += "-p:32:32";
     97   else if (UseShortPointers)
     98     Ret += "-p3:32:32-p4:32:32-p5:32:32";
     99 
    100   Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
    101 
    102   return Ret;
    103 }
    104 
    105 static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
    106   if (CM)
    107     return *CM;
    108   return CodeModel::Small;
    109 }
    110 
// NVPTXTargetMachine constructor: picks the data layout (32- vs 64-bit,
// optional short pointers from -nvptx-short-ptr), sets up the object-file
// lowering and subtarget, and derives the driver interface from the triple.
// Note: RM is accepted for interface compatibility but ignored (see below).
NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, CPU, FS, *this) {
  // The driver interface (NVCL vs CUDA) is selected from the OS component
  // of the target triple.
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  // Structured CFG is required by default; the flag is only a transitional
  // escape hatch (see DisableRequireStructuredCFG above).
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}
    133 
// Defaulted out-of-line; members (including the TLOF unique_ptr) clean up
// via their own destructors.
NVPTXTargetMachine::~NVPTXTargetMachine() = default;
    135 
// Out-of-line virtual method to anchor the class's vtable to this file.
void NVPTXTargetMachine32::anchor() {}

// 32-bit variant: forwards to the common constructor with is64bit = false.
NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
    145 
// Out-of-line virtual method to anchor the class's vtable to this file.
void NVPTXTargetMachine64::anchor() {}

// 64-bit variant: forwards to the common constructor with is64bit = true.
NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
    155 
namespace {

// NVPTX-specific codegen pipeline configuration. NVPTX performs no register
// allocation (see createTargetRegisterAllocator below, and the asserts in
// the *RegAlloc overrides), so several overrides here disable or re-implement
// the generic register-allocation-related stages.
class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  // Convenience accessor returning the concrete target-machine type.
  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  // Returns nullptr: NVPTX uses no register allocator.
  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace
    189 
// Create the NVPTX-specific pass pipeline configuration; ownership of the
// returned object passes to the caller.
TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}
    193 
    194 void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
    195   Builder.addExtension(
    196     PassManagerBuilder::EP_EarlyAsPossible,
    197     [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
    198       PM.add(createNVVMReflectPass());
    199       PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
    200     });
    201 }
    202 
    203 TargetTransformInfo
    204 NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
    205   return TargetTransformInfo(NVPTXTTIImpl(this, F));
    206 }
    207 
    208 void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
    209   if (getOptLevel() == CodeGenOpt::Aggressive)
    210     addPass(createGVNPass());
    211   else
    212     addPass(createEarlyCSEPass());
    213 }
    214 
// Pass order matters here: SROA first so that InferAddressSpaces sees
// scalarized values rather than byval allocas.
void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
}
    222 
// Straight-line scalar optimizations. The ordering below is deliberate:
// each pass exposes redundancy that a later CSE/GVN run cleans up.
void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunites for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}
    239 
// IR-level pipeline setup: disables generic passes that assume physical
// registers, then schedules the NVPTX-specific IR lowering passes around
// the generic IR passes added by TargetPassConfig::addIRPasses().
void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here.  We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing.  But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  addPass(createNVVMReflectPass());

  // Image optimization is opt-only; the two passes after it run at every
  // optimization level.
  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    // LSV can be disabled with -disable-nvptx-load-store-vectorizer
    // (see the flag near the top of this file).
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addStraightLineScalarOptimizationPasses();
  }

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}
    295 
    296 bool NVPTXPassConfig::addInstSelector() {
    297   const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
    298 
    299   addPass(createLowerAggrCopies());
    300   addPass(createAllocaHoisting());
    301   addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
    302 
    303   if (!ST.hasImageHandles())
    304     addPass(createNVPTXReplaceImageHandlesPass());
    305 
    306   return false;
    307 }
    308 
// Post-"register allocation" stage: runs the NVPTX prolog/epilog emulation
// (there is no real regalloc on NVPTX) and, when optimizing, a peephole.
void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass(), false);
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offset and replace frame
    // index with VRFrame register. NVPTXPeephole need to be run after that and
    // will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}
    318 
// NVPTX performs no register allocation; returning nullptr tells the
// generic pass config not to schedule an allocator.
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}
    322 
// Fast-regalloc stage replacement: since NVPTX allocates no registers,
// only the passes needed to leave SSA form (PHI elimination, two-address
// rewriting) are run here.
void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  assert(!RegAllocPass && "NVPTX uses no regalloc!");
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}
    328 
// Optimized-regalloc stage replacement: runs the analysis and SSA-exit
// passes that normally precede allocation, but no allocator itself.
// The pass order below mirrors the generic pipeline and is significant.
void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  assert(!RegAllocPass && "NVPTX uses no regalloc!");

  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");


  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  //addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}
    352 
// Machine-SSA optimization pipeline (runs while the code is still in SSA
// form). NOTE(review): this closely follows the generic
// TargetPassConfig::addMachineSSAOptimization ordering — confirm against
// upstream before reordering anything here.
void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}
    392