Home | History | Annotate | Download | only in AMDGPU
      1 //===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// Interface definition of the TargetLowering class that is common
     12 /// to all AMD GPUs.
     13 //
     14 //===----------------------------------------------------------------------===//
     15 
     16 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
     17 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
     18 
     19 #include "AMDGPU.h"
     20 #include "llvm/CodeGen/CallingConvLower.h"
     21 #include "llvm/CodeGen/TargetLowering.h"
     22 
     23 namespace llvm {
     24 
     25 class AMDGPUMachineFunction;
     26 class AMDGPUSubtarget;
     27 struct ArgDescriptor;
     28 
     29 class AMDGPUTargetLowering : public TargetLowering {
     30 private:
     31   const AMDGPUSubtarget *Subtarget;
     32 
     33   /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
     34   /// legalized from a smaller type VT. Need to match pre-legalized type because
     35   /// the generic legalization inserts the add/sub between the select and
     36   /// compare.
     37   SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const;
     38 
     39 public:
     40   static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
     41   static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
     42 
     43 protected:
     44   AMDGPUAS AMDGPUASI;
     45 
     46   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
     47   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
     48   /// Split a vector store into multiple scalar stores.
     49   /// \returns The resulting chain.
     50 
     51   SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
     52   SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
     53   SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
     54   SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
     55   SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
     56 
     57   SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const;
     58   SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
     59   SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
     60   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
     61   SDValue LowerFLOG(SDValue Op, SelectionDAG &Dag,
     62                     double Log2BaseInverted) const;
     63 
     64   SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
     65 
     66   SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
     67   SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
     68   SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     69   SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     70 
     71   SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
     72   SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
     73   SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
     74   SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
     75 
     76   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     77 
     78 protected:
     79   bool shouldCombineMemoryType(EVT VT) const;
     80   SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     81   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     82   SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     83 
     84   SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
     85                                        unsigned Opc, SDValue LHS,
     86                                        uint32_t ValLo, uint32_t ValHi) const;
     87   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     88   SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     89   SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     90   SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     91   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     92   SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     93   SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     94   SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
     95   SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
     96                              SDValue RHS, DAGCombinerInfo &DCI) const;
     97   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     98   SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     99   SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
    100   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
    101 
    102   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
    103 
    104   virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
    105                                      SelectionDAG &DAG) const;
    106 
    107   /// Return 64-bit value Op as two 32-bit integers.
    108   std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
    109                                               SelectionDAG &DAG) const;
    110   SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
    111   SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
    112 
    113   /// Split a vector load into 2 loads of half the vector.
    114   SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
    115 
    116   /// Split a vector store into 2 stores of half the vector.
    117   SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
    118 
    119   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
    120   SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
    121   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
    122   SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
    123   void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
    124                                     SmallVectorImpl<SDValue> &Results) const;
    125 
    126   void analyzeFormalArgumentsCompute(
    127     CCState &State,
    128     const SmallVectorImpl<ISD::InputArg> &Ins) const;
    129 
    130 public:
    131   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
    132 
    133   bool mayIgnoreSignedZero(SDValue Op) const {
    134     if (getTargetMachine().Options.NoSignedZerosFPMath)
    135       return true;
    136 
    137     const auto Flags = Op.getNode()->getFlags();
    138     if (Flags.isDefined())
    139       return Flags.hasNoSignedZeros();
    140 
    141     return false;
    142   }
    143 
    144   static inline SDValue stripBitcast(SDValue Val) {
    145     return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
    146   }
    147 
    148   static bool allUsesHaveSourceMods(const SDNode *N,
    149                                     unsigned CostThreshold = 4);
    150   bool isFAbsFree(EVT VT) const override;
    151   bool isFNegFree(EVT VT) const override;
    152   bool isTruncateFree(EVT Src, EVT Dest) const override;
    153   bool isTruncateFree(Type *Src, Type *Dest) const override;
    154 
    155   bool isZExtFree(Type *Src, Type *Dest) const override;
    156   bool isZExtFree(EVT Src, EVT Dest) const override;
    157   bool isZExtFree(SDValue Val, EVT VT2) const override;
    158 
    159   bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
    160 
    161   MVT getVectorIdxTy(const DataLayout &) const override;
    162   bool isSelectSupported(SelectSupportKind) const override;
    163 
    164   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
    165   bool ShouldShrinkFPConstant(EVT VT) const override;
    166   bool shouldReduceLoadWidth(SDNode *Load,
    167                              ISD::LoadExtType ExtType,
    168                              EVT ExtVT) const override;
    169 
    170   bool isLoadBitCastBeneficial(EVT, EVT) const final;
    171 
    172   bool storeOfVectorConstantIsCheap(EVT MemVT,
    173                                     unsigned NumElem,
    174                                     unsigned AS) const override;
    175   bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
    176   bool isCheapToSpeculateCttz() const override;
    177   bool isCheapToSpeculateCtlz() const override;
    178 
    179   bool isSDNodeAlwaysUniform(const SDNode *N) const override;
    180   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
    181   static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
    182 
    183   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    184                       const SmallVectorImpl<ISD::OutputArg> &Outs,
    185                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
    186                       SelectionDAG &DAG) const override;
    187 
    188   SDValue addTokenForArgument(SDValue Chain,
    189                               SelectionDAG &DAG,
    190                               MachineFrameInfo &MFI,
    191                               int ClobberedFI) const;
    192 
    193   SDValue lowerUnhandledCall(CallLoweringInfo &CLI,
    194                              SmallVectorImpl<SDValue> &InVals,
    195                              StringRef Reason) const;
    196   SDValue LowerCall(CallLoweringInfo &CLI,
    197                     SmallVectorImpl<SDValue> &InVals) const override;
    198 
    199   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
    200                                   SelectionDAG &DAG) const;
    201 
    202   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
    203   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
    204   void ReplaceNodeResults(SDNode * N,
    205                           SmallVectorImpl<SDValue> &Results,
    206                           SelectionDAG &DAG) const override;
    207 
    208   SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
    209                                SDValue RHS, SDValue True, SDValue False,
    210                                SDValue CC, DAGCombinerInfo &DCI) const;
    211 
    212   const char* getTargetNodeName(unsigned Opcode) const override;
    213 
    214   // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection
    215   // for AMDGPU.
    216   // A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036
    217   // 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on
    218   // MergeConsecutiveStores() before Instruction Selection for all targets.
    219   // Enough AMDGPU compiles go into an infinite loop ( MergeConsecutiveStores()
    220   // merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores()
    221   // re-merges, etc. ) to warrant turning it off for now.
    222   bool mergeStoresAfterLegalization() const override { return false; }
    223 
    224   bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
    225     return true;
    226   }
    227   SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
    228                            int &RefinementSteps, bool &UseOneConstNR,
    229                            bool Reciprocal) const override;
    230   SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
    231                            int &RefinementSteps) const override;
    232 
    233   virtual SDNode *PostISelFolding(MachineSDNode *N,
    234                                   SelectionDAG &DAG) const = 0;
    235 
    236   /// Determine which of the bits specified in \p Mask are known to be
    237   /// either zero or one and return them in the \p KnownZero and \p KnownOne
    238   /// bitsets.
    239   void computeKnownBitsForTargetNode(const SDValue Op,
    240                                      KnownBits &Known,
    241                                      const APInt &DemandedElts,
    242                                      const SelectionDAG &DAG,
    243                                      unsigned Depth = 0) const override;
    244 
    245   unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts,
    246                                            const SelectionDAG &DAG,
    247                                            unsigned Depth = 0) const override;
    248 
    249   /// Helper function that adds Reg to the LiveIn list of the DAG's
    250   /// MachineFunction.
    251   ///
    252   /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise
    253   /// a copy from the register.
    254   SDValue CreateLiveInRegister(SelectionDAG &DAG,
    255                                const TargetRegisterClass *RC,
    256                                unsigned Reg, EVT VT,
    257                                const SDLoc &SL,
    258                                bool RawReg = false) const;
    259   SDValue CreateLiveInRegister(SelectionDAG &DAG,
    260                                const TargetRegisterClass *RC,
    261                                unsigned Reg, EVT VT) const {
    262     return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
    263   }
    264 
    265   // Returns the raw live in register rather than a copy from it.
    266   SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
    267                                   const TargetRegisterClass *RC,
    268                                   unsigned Reg, EVT VT) const {
    269     return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
    270   }
    271 
    272   /// Similar to CreateLiveInRegister, except value maybe loaded from a stack
    273   /// slot rather than passed in a register.
    274   SDValue loadStackInputValue(SelectionDAG &DAG,
    275                               EVT VT,
    276                               const SDLoc &SL,
    277                               int64_t Offset) const;
    278 
    279   SDValue storeStackInputValue(SelectionDAG &DAG,
    280                                const SDLoc &SL,
    281                                SDValue Chain,
    282                                SDValue StackPtr,
    283                                SDValue ArgVal,
    284                                int64_t Offset) const;
    285 
    286   SDValue loadInputValue(SelectionDAG &DAG,
    287                          const TargetRegisterClass *RC,
    288                          EVT VT, const SDLoc &SL,
    289                          const ArgDescriptor &Arg) const;
    290 
    291   enum ImplicitParameter {
    292     FIRST_IMPLICIT,
    293     GRID_DIM = FIRST_IMPLICIT,
    294     GRID_OFFSET,
    295   };
    296 
    297   /// Helper function that returns the byte offset of the given
    298   /// type of implicit parameter.
    299   uint32_t getImplicitParameterOffset(const MachineFunction &MF,
    300                                       const ImplicitParameter Param) const;
    301 
    302   AMDGPUAS getAMDGPUAS() const {
    303     return AMDGPUASI;
    304   }
    305 
    306   MVT getFenceOperandTy(const DataLayout &DL) const override {
    307     return MVT::i32;
    308   }
    309 };
    310 
namespace AMDGPUISD {

/// AMDGPU-specific SelectionDAG node opcodes. Target opcodes are numbered
/// starting at ISD::BUILTIN_OP_END; enumerators at or above
/// FIRST_MEM_OPCODE_NUMBER occupy the target memory-opcode range
/// (ISD::FIRST_TARGET_MEMORY_OPCODE and up). Do not reorder entries whose
/// comments say their order matters.
enum NodeType : unsigned {
  // AMDIL ISD Opcodes
  FIRST_NUMBER = ISD::BUILTIN_OP_END,
  UMUL,        // 32bit unsigned multiplication
  BRANCH_COND,
  // End AMDIL ISD Opcodes

  // Function call.
  CALL,
  TC_RETURN,
  TRAP,

  // Masked control flow nodes.
  IF,
  ELSE,
  LOOP,

  // A uniform kernel return that terminates the wavefront.
  ENDPGM,

  // Return to a shader part's epilog code.
  RETURN_TO_EPILOG,

  // Return with values from a non-entry function.
  RET_FLAG,

  DWORDADDR,
  FRACT,

  /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
  /// modifier behavior with dx10_enable.
  CLAMP,

  // This is SETCC with the full mask result which is used for a compare with a
  // result bit per item in the wavefront.
  SETCC,
  SETREG,
  // FP ops with input and output chain.
  FMA_W_CHAIN,
  FMUL_W_CHAIN,

  // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
  // Denormals handled on some parts.
  COS_HW,
  SIN_HW,
  FMAX_LEGACY,
  FMIN_LEGACY,
  FMAX3,
  SMAX3,
  UMAX3,
  FMIN3,
  SMIN3,
  UMIN3,
  FMED3,
  SMED3,
  UMED3,
  FDOT2,
  URECIP,
  DIV_SCALE,
  DIV_FMAS,
  DIV_FIXUP,
  // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
  // treated as an illegal operation.
  FMAD_FTZ,
  TRIG_PREOP, // 1 ULP max error for f64

  // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
  //            For f64, max error 2^29 ULP, handles denormals.
  RCP,
  RSQ,
  RCP_LEGACY,
  RSQ_LEGACY,
  RCP_IFLAG,
  FMUL_LEGACY,
  RSQ_CLAMP,
  LDEXP,
  FP_CLASS,
  DOT4,
  CARRY,
  BORROW,
  BFE_U32, // Extract range of bits with zero extension to 32-bits.
  BFE_I32, // Extract range of bits with sign extension to 32-bits.
  BFI, // (src0 & src1) | (~src0 & src2)
  BFM, // Insert a range of bits into a 32-bit word.
  FFBH_U32, // ctlz with -1 if input is zero.
  FFBH_I32,
  FFBL_B32, // cttz with -1 if input is zero.
  MUL_U24,
  MUL_I24,
  MULHI_U24,
  MULHI_I24,
  MAD_U24,
  MAD_I24,
  MAD_U64_U32,
  MAD_I64_I32,
  MUL_LOHI_I24,
  MUL_LOHI_U24,
  PERM,
  TEXTURE_FETCH,
  EXPORT, // exp on SI+
  EXPORT_DONE, // exp on SI+ with done bit set
  R600_EXPORT,
  CONST_ADDRESS,
  REGISTER_LOAD,
  REGISTER_STORE,
  SAMPLE,
  SAMPLEB,
  SAMPLED,
  SAMPLEL,

  // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
  CVT_F32_UBYTE0,
  CVT_F32_UBYTE1,
  CVT_F32_UBYTE2,
  CVT_F32_UBYTE3,

  // Convert two float 32 numbers into a single register holding two packed f16
  // with round to zero.
  CVT_PKRTZ_F16_F32,
  CVT_PKNORM_I16_F32,
  CVT_PKNORM_U16_F32,
  CVT_PK_I16_I32,
  CVT_PK_U16_U32,

  // Same as the standard node, except the high bits of the resulting integer
  // are known 0.
  FP_TO_FP16,

  // Wrapper around fp16 results that are known to zero the high bits.
  FP16_ZEXT,

  /// This node is for VLIW targets and it is used to represent a vector
  /// that is stored in consecutive registers with the same channel.
  /// For example:
  ///   |X  |Y|Z|W|
  /// T0|v.x| | | |
  /// T1|v.y| | | |
  /// T2|v.z| | | |
  /// T3|v.w| | | |
  BUILD_VERTICAL_VECTOR,
  /// Pointer to the start of the shader's constant data.
  CONST_DATA_PTR,
  INIT_EXEC,
  INIT_EXEC_FROM_INPUT,
  SENDMSG,
  SENDMSGHALT,
  INTERP_MOV,
  INTERP_P1,
  INTERP_P2,
  PC_ADD_REL_OFFSET,
  KILL,
  DUMMY_CHAIN,
  // Everything from here on is in the target memory-opcode range.
  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
  STORE_MSKOR,
  LOAD_CONSTANT,
  TBUFFER_STORE_FORMAT,
  TBUFFER_STORE_FORMAT_X3,
  TBUFFER_STORE_FORMAT_D16,
  TBUFFER_LOAD_FORMAT,
  TBUFFER_LOAD_FORMAT_D16,
  ATOMIC_CMP_SWAP,
  ATOMIC_INC,
  ATOMIC_DEC,
  ATOMIC_LOAD_FADD,
  ATOMIC_LOAD_FMIN,
  ATOMIC_LOAD_FMAX,
  BUFFER_LOAD,
  BUFFER_LOAD_FORMAT,
  BUFFER_LOAD_FORMAT_D16,
  BUFFER_STORE,
  BUFFER_STORE_FORMAT,
  BUFFER_STORE_FORMAT_D16,
  BUFFER_ATOMIC_SWAP,
  BUFFER_ATOMIC_ADD,
  BUFFER_ATOMIC_SUB,
  BUFFER_ATOMIC_SMIN,
  BUFFER_ATOMIC_UMIN,
  BUFFER_ATOMIC_SMAX,
  BUFFER_ATOMIC_UMAX,
  BUFFER_ATOMIC_AND,
  BUFFER_ATOMIC_OR,
  BUFFER_ATOMIC_XOR,
  BUFFER_ATOMIC_CMPSWAP,

  // Sentinel marking the end of the AMDGPU ISD opcode space; keep last.
  LAST_AMDGPU_ISD_NUMBER
};


} // End namespace AMDGPUISD
    502 
    503 } // End namespace llvm
    504 
    505 #endif
    506