//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
    default: return false;
    case Intrinsic::nvvm_read_ptx_sreg_tid_x:
    case Intrinsic::nvvm_read_ptx_sreg_tid_y:
    case Intrinsic::nvvm_read_ptx_sreg_tid_z:
      return true;
  }
}

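// Whether the given intrinsic reads the lane ID (%laneid).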
static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
    default: return false;
    case Intrinsic::nvvm_atomic_load_add_f32:
    case Intrinsic::nvvm_atomic_load_inc_32:
    case Intrinsic::nvvm_atomic_load_dec_32:
      return true;
  }
}

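// Conservatively decide whether a value may take different values across the
// threads of a warp, e.g. a load through a generic pointer or the result of
// llvm.nvvm.read.ptx.sreg.tid.x.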
bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an earlier
    // executed thread may see different memory inputs than a later executed
    // thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1 for
    // the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

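// Estimate the cost of an arithmetic instruction on the legalized type.
// 64-bit integer ALU operations (ADD/MUL/XOR/OR/AND) are modeled as twice the
// cost of their 32-bit counterparts; everything else is deferred to the
// generic implementation.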
int NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
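    // For example, an i64 ADD whose legalization cost LT.first is 1 is costed
    // as 2, matching the pair of 32-bit adds with carry that ptxas typically
    // emits for a 64-bit add.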
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L,
                                           TTI::UnrollingPreferences &UP) {
  BaseT::getUnrollingPreferences(L, UP);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops, which are often
  // unrolled by the PTX-to-SASS compiler, and unrolling earlier can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}
    132