      1 //===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the interfaces that X86 uses to lower LLVM code into a
     11 // selection DAG.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
     16 #define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
     17 
     18 #include "llvm/CodeGen/CallingConvLower.h"
     19 #include "llvm/CodeGen/SelectionDAG.h"
     20 #include "llvm/CodeGen/TargetLowering.h"
     21 #include "llvm/Target/TargetOptions.h"
     22 
     23 namespace llvm {
     24   class X86Subtarget;
     25   class X86TargetMachine;
     26 
     27   namespace X86ISD {
     28     // X86 Specific DAG Nodes
     29     enum NodeType : unsigned {
     30       // Start the numbering where the builtin ops leave off.
     31       FIRST_NUMBER = ISD::BUILTIN_OP_END,
     32 
     33       /// Bit scan forward.
     34       BSF,
     35       /// Bit scan reverse.
     36       BSR,
     37 
     38       /// Double shift instructions. These correspond to
     39       /// X86::SHLDxx and X86::SHRDxx instructions.
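              /// For a nonzero shift count C, these instructions compute
              /// (Dst << C) | (Src >> (BitWidth - C)) for SHLD and
              /// (Dst >> C) | (Src << (BitWidth - C)) for SHRD.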
     40       SHLD,
     41       SHRD,
     42 
     43       /// Bitwise logical AND of floating point values. This corresponds
     44       /// to X86::ANDPS or X86::ANDPD.
     45       FAND,
     46 
     47       /// Bitwise logical OR of floating point values. This corresponds
     48       /// to X86::ORPS or X86::ORPD.
     49       FOR,
     50 
     51       /// Bitwise logical XOR of floating point values. This corresponds
     52       /// to X86::XORPS or X86::XORPD.
     53       FXOR,
     54 
     55       ///  Bitwise logical ANDNOT of floating point values. This
     56       /// corresponds to X86::ANDNPS or X86::ANDNPD.
     57       FANDN,
     58 
     59       /// These operations represent an abstract X86 call
     60       /// instruction, which includes a bunch of information.  In particular the
     61       /// operands of these nodes are:
     62       ///
     63       ///     #0 - The incoming token chain
     64       ///     #1 - The callee
     65       ///     #2 - The number of arg bytes the caller pushes on the stack.
     66       ///     #3 - The number of arg bytes the callee pops off the stack.
     67       ///     #4 - The value to pass in AL/AX/EAX (optional)
     68       ///     #5 - The value to pass in DL/DX/EDX (optional)
     69       ///
     70       /// The result values of these nodes are:
     71       ///
     72       ///     #0 - The outgoing token chain
     73       ///     #1 - The first register result value (optional)
     74       ///     #2 - The second register result value (optional)
     75       ///
     76       CALL,
     77 
     78       /// Same as call except it adds the NoTrack prefix.
     79       NT_CALL,
     80 
     81       /// This operation implements the lowering for readcyclecounter.
     82       RDTSC_DAG,
     83 
     84       /// X86 Read Time-Stamp Counter and Processor ID.
     85       RDTSCP_DAG,
     86 
     87       /// X86 Read Performance Monitoring Counters.
     88       RDPMC_DAG,
     89 
     90       /// X86 compare and logical compare instructions.
     91       CMP, COMI, UCOMI,
     92 
     93       /// X86 bit-test instructions.
     94       BT,
     95 
     96       /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
     97       /// operand, usually produced by a CMP instruction.
     98       SETCC,
     99 
    100       /// X86 Select
    101       SELECT, SELECTS,
    102 
    103       // Same as SETCC except it's materialized with an SBB and the value is all
    104       // ones or all zeros.
    105       SETCC_CARRY,  // R = carry_bit ? ~0 : 0
    106 
    107       /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
    108       /// Operands are two FP values to compare; result is a mask of
    109       /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
    110       FSETCC,
    111 
    112       /// X86 FP SETCC, similar to above, but with output as an i1 mask and
    113       /// with optional rounding mode.
    114       FSETCCM, FSETCCM_RND,
    115 
    116       /// X86 conditional moves. Operand 0 and operand 1 are the two values
    117       /// to select from. Operand 2 is the condition code, and operand 3 is the
    118       /// flag operand produced by a CMP or TEST instruction. It also writes a
    119       /// flag result.
    120       CMOV,
    121 
    122       /// X86 conditional branches. Operand 0 is the chain operand, operand 1
    123       /// is the block to branch to if the condition is true, operand 2 is the
    124       /// condition code, and operand 3 is the flag operand produced by a CMP
    125       /// or TEST instruction.
    126       BRCOND,
    127 
    128       /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
    129       /// operand 1 is the target address.
    130       NT_BRIND,
    131 
    132       /// Return with a flag operand. Operand 0 is the chain operand, operand
    133       /// 1 is the number of bytes of stack to pop.
    134       RET_FLAG,
    135 
    136       /// Return from interrupt. Operand 0 is the number of bytes to pop.
    137       IRET,
    138 
    139       /// Repeat fill, corresponds to X86::REP_STOSx.
    140       REP_STOS,
    141 
    142       /// Repeat move, corresponds to X86::REP_MOVSx.
    143       REP_MOVS,
    144 
    145       /// On Darwin, this node represents the result of the popl
    146       /// at function entry, used for PIC code.
    147       GlobalBaseReg,
    148 
    149       /// A wrapper node for TargetConstantPool, TargetJumpTable,
    150       /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
    151       /// MCSymbol and TargetBlockAddress.
    152       Wrapper,
    153 
    154       /// Special wrapper used under X86-64 PIC mode for RIP
    155       /// relative displacements.
    156       WrapperRIP,
    157 
    158       /// Copies a 64-bit value from the low word of an XMM vector
    159       /// to an MMX vector.
    160       MOVDQ2Q,
    161 
    162       /// Copies a 32-bit value from the low word of an MMX
    163       /// vector to a GPR.
    164       MMX_MOVD2W,
    165 
    166       /// Copies a GPR into the low 32-bit word of an MMX vector
    167       /// and zeroes out the high word.
    168       MMX_MOVW2D,
    169 
    170       /// Extract an 8-bit value from a vector and zero extend it to
    171       /// i32, corresponds to X86::PEXTRB.
    172       PEXTRB,
    173 
    174       /// Extract a 16-bit value from a vector and zero extend it to
    175       /// i32, corresponds to X86::PEXTRW.
    176       PEXTRW,
    177 
    178       /// Insert any element of a 4 x float vector into any element
    179       /// of a destination 4 x float vector.
    180       INSERTPS,
    181 
    182       /// Insert the lower 8 bits of a 32-bit value into a vector,
    183       /// corresponds to X86::PINSRB.
    184       PINSRB,
    185 
    186       /// Insert the lower 16 bits of a 32-bit value into a vector,
    187       /// corresponds to X86::PINSRW.
    188       PINSRW,
    189 
    190       /// Shuffle 16 8-bit values within a vector.
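              /// Each destination byte selects a source byte using the low 4 bits of the
              /// corresponding control byte, or is zeroed if the control byte's most
              /// significant bit is set.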
    191       PSHUFB,
    192 
    193       /// Compute Sum of Absolute Differences.
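              /// e.g. for v16i8 inputs, each group of 8 bytes produces one 16-bit sum of
              /// absolute byte differences, placed in the low word of the corresponding
              /// 64-bit result element.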
    194       PSADBW,
    195       /// Compute Double Block Packed Sum-Absolute-Differences
    196       DBPSADBW,
    197 
    198       /// Bitwise Logical AND NOT of Packed FP values.
    199       ANDNP,
    200 
    201       /// Blend where the selector is an immediate.
    202       BLENDI,
    203 
    204       /// Dynamic (non-constant condition) vector blend where only the sign bits
    205       /// of the condition elements are used. This is used to enforce that the
    206       /// condition mask is not valid for generic VSELECT optimizations.
    207       SHRUNKBLEND,
    208 
    209       /// Combined add and sub on an FP vector.
    210       ADDSUB,
    211 
    212       //  FP vector ops with rounding mode.
    213       FADD_RND, FADDS_RND,
    214       FSUB_RND, FSUBS_RND,
    215       FMUL_RND, FMULS_RND,
    216       FDIV_RND, FDIVS_RND,
    217       FMAX_RND, FMAXS_RND,
    218       FMIN_RND, FMINS_RND,
    219       FSQRT_RND, FSQRTS_RND,
    220 
    221       // FP vector get exponent.
    222       FGETEXP_RND, FGETEXPS_RND,
    223       // Extract Normalized Mantissas.
    224       VGETMANT, VGETMANT_RND, VGETMANTS, VGETMANTS_RND,
    225       // FP Scale.
    226       SCALEF,
    227       SCALEFS,
    228 
    229       // Integer add/sub with unsigned saturation.
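              // e.g. for i8 elements, ADDUS(250, 10) = 255 and SUBUS(5, 10) = 0; results
              // clamp to the type bounds instead of wrapping.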
    230       ADDUS,
    231       SUBUS,
    232 
    233       // Integer add/sub with signed saturation.
    234       ADDS,
    235       SUBS,
    236 
    237       // Unsigned Integer average.
    238       AVG,
    239 
    240       /// Integer horizontal add/sub.
    241       HADD,
    242       HSUB,
    243 
    244       /// Floating point horizontal add/sub.
    245       FHADD,
    246       FHSUB,
    247 
    248       // Detect Conflicts Within a Vector
    249       CONFLICT,
    250 
    251       /// Floating point max and min.
    252       FMAX, FMIN,
    253 
    254       /// Commutative FMIN and FMAX.
    255       FMAXC, FMINC,
    256 
    257       /// Scalar intrinsic floating point max and min.
    258       FMAXS, FMINS,
    259 
    260       /// Floating point reciprocal-sqrt and reciprocal approximation.
    261       /// Note that these typically require refinement
    262       /// in order to obtain suitable precision.
    263       FRSQRT, FRCP,
    264 
    265       // AVX-512 reciprocal approximations with a little more precision.
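              // The relative error is at most 2^-14, versus roughly 2^-12 for the legacy
              // RSQRT/RCP approximations.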
    266       RSQRT14, RSQRT14S, RCP14, RCP14S,
    267 
    268       // Thread Local Storage.
    269       TLSADDR,
    270 
    271       // Thread Local Storage. A call to get the start address
    272       // of the TLS block for the current module.
    273       TLSBASEADDR,
    274 
    275       // Thread Local Storage.  When calling to an OS provided
    276       // thunk at the address from an earlier relocation.
    277       TLSCALL,
    278 
    279       // Exception Handling helpers.
    280       EH_RETURN,
    281 
    282       // SjLj exception handling setjmp.
    283       EH_SJLJ_SETJMP,
    284 
    285       // SjLj exception handling longjmp.
    286       EH_SJLJ_LONGJMP,
    287 
    288       // SjLj exception handling dispatch.
    289       EH_SJLJ_SETUP_DISPATCH,
    290 
    291       /// Tail call return. See X86TargetLowering::LowerCall for
    292       /// the list of operands.
    293       TC_RETURN,
    294 
    295       // Vector move to low scalar and zero higher vector elements.
    296       VZEXT_MOVL,
    297 
    298       // Vector integer zero-extend.
    299       VZEXT,
    300       // Vector integer signed-extend.
    301       VSEXT,
    302 
    303       // Vector integer truncate.
    304       VTRUNC,
    305       // Vector integer truncate with unsigned/signed saturation.
    306       VTRUNCUS, VTRUNCS,
    307 
    308       // Vector FP extend.
    309       VFPEXT, VFPEXT_RND, VFPEXTS_RND,
    310 
    311       // Vector FP round.
    312       VFPROUND, VFPROUND_RND, VFPROUNDS_RND,
    313 
    314       // 128-bit vector logical left / right shift
    315       VSHLDQ, VSRLDQ,
    316 
    317       // Vector shift elements
    318       VSHL, VSRL, VSRA,
    319 
    320       // Vector variable shift right arithmetic.
    321       // Unlike ISD::SRA, if the shift count is greater than the element size,
    322       // the sign bit is used to fill the destination data element.
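              // e.g. shifting an i32 element by 40 yields 0 or -1 depending on the
              // element's sign, rather than producing an undefined result.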
    323       VSRAV,
    324 
    325       // Vector shift elements by immediate
    326       VSHLI, VSRLI, VSRAI,
    327 
    328       // Shifts of mask registers.
    329       KSHIFTL, KSHIFTR,
    330 
    331       // Bit rotate by immediate
    332       VROTLI, VROTRI,
    333 
    334       // Vector packed double/float comparison.
    335       CMPP,
    336 
    337       // Vector integer comparisons.
    338       PCMPEQ, PCMPGT,
    339 
    340       // v8i16 Horizontal minimum and position.
    341       PHMINPOS,
    342 
    343       MULTISHIFT,
    344 
    345       /// Vector comparison generating mask bits for fp and
    346       /// integer signed and unsigned data types.
    347       CMPM,
    348       // Vector comparison with rounding mode for FP values
    349       CMPM_RND,
    350 
    351       // Arithmetic operations with FLAGS results.
    352       ADD, SUB, ADC, SBB, SMUL,
    353       INC, DEC, OR, XOR, AND,
    354 
    355       // Bit field extract.
    356       BEXTR,
    357 
    358       // LOW, HI, FLAGS = umul LHS, RHS.
    359       UMUL,
    360 
    361       // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS.
    362       SMUL8, UMUL8,
    363 
    364       // 8-bit divrem that zero- or sign-extends the high result (AH).
    365       UDIVREM8_ZEXT_HREG,
    366       SDIVREM8_SEXT_HREG,
    367 
    368       // X86-specific multiply by immediate.
    369       MUL_IMM,
    370 
    371       // Vector sign bit extraction.
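              // e.g. for a v4f32 input this yields a 4-bit integer whose bit i is the
              // sign bit of element i (as in MOVMSKPS).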
    372       MOVMSK,
    373 
    374       // Vector bitwise comparisons.
    375       PTEST,
    376 
    377       // Vector packed fp sign bitwise comparisons.
    378       TESTP,
    379 
    380       // OR/AND test for masks.
    381       KORTEST,
    382       KTEST,
    383 
    384       // ADD for masks.
    385       KADD,
    386 
    387       // Several flavors of instructions with vector shuffle behaviors.
    388       // Saturated signed/unsigned packing.
    389       PACKSS,
    390       PACKUS,
    391       // Intra-lane alignr.
    392       PALIGNR,
    393       // AVX512 inter-lane alignr.
    394       VALIGN,
    395       PSHUFD,
    396       PSHUFHW,
    397       PSHUFLW,
    398       SHUFP,
    399       // VBMI2 Concat & Shift.
    400       VSHLD,
    401       VSHRD,
    402       VSHLDV,
    403       VSHRDV,
    404       // Shuffle Packed Values at 128-bit granularity.
    405       SHUF128,
    406       MOVDDUP,
    407       MOVSHDUP,
    408       MOVSLDUP,
    409       MOVLHPS,
    410       MOVHLPS,
    411       MOVSD,
    412       MOVSS,
    413       UNPCKL,
    414       UNPCKH,
    415       VPERMILPV,
    416       VPERMILPI,
    417       VPERMI,
    418       VPERM2X128,
    419 
    420       // Variable Permute (VPERM).
    421       // Res = VPERMV MaskV, V0
    422       VPERMV,
    423 
    424       // 3-op Variable Permute (VPERMT2).
    425       // Res = VPERMV3 V0, MaskV, V1
    426       VPERMV3,
    427 
    428       // Bitwise ternary logic.
    429       VPTERNLOG,
    430       // Fix Up Special Packed Float32/64 values.
    431       VFIXUPIMM,
    432       VFIXUPIMMS,
    433       // Range Restriction Calculation For Packed Pairs of Float32/64 values.
    434       VRANGE, VRANGE_RND, VRANGES, VRANGES_RND,
    435       // Reduce - Perform Reduction Transformation on scalar/packed FP.
    436       VREDUCE, VREDUCE_RND, VREDUCES, VREDUCES_RND,
    437       // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
    438       // Also used by the legacy (V)ROUND intrinsics where we mask out the
    439       // scaling part of the immediate.
    440       VRNDSCALE, VRNDSCALE_RND, VRNDSCALES, VRNDSCALES_RND,
    441       // Tests types of FP values for packed types.
    442       VFPCLASS,
    443       // Tests types of FP values for scalar types.
    444       VFPCLASSS,
    445 
    446       // Broadcast scalar to vector.
    447       VBROADCAST,
    448       // Broadcast mask to vector.
    449       VBROADCASTM,
    450       // Broadcast subvector to vector.
    451       SUBV_BROADCAST,
    452 
    453       /// SSE4A Extraction and Insertion.
    454       EXTRQI, INSERTQI,
    455 
    456       // XOP arithmetic/logical shifts.
    457       VPSHA, VPSHL,
    458       // XOP signed/unsigned integer comparisons.
    459       VPCOM, VPCOMU,
    460       // XOP packed permute bytes.
    461       VPPERM,
    462       // XOP two source permutation.
    463       VPERMIL2,
    464 
    465       // Vector multiply packed unsigned doubleword integers.
    466       PMULUDQ,
    467       // Vector multiply packed signed doubleword integers.
    468       PMULDQ,
    469       // Vector Multiply Packed Signed Integers with Round and Scale.
    470       MULHRS,
    471 
    472       // Multiply and Add Packed Integers.
    473       VPMADDUBSW, VPMADDWD,
    474 
    475       // AVX512IFMA multiply and add.
    476       // NOTE: These are different from the instruction and perform
    477       // op0 x op1 + op2.
    478       VPMADD52L, VPMADD52H,
    479 
    480       // VNNI
    481       VPDPBUSD,
    482       VPDPBUSDS,
    483       VPDPWSSD,
    484       VPDPWSSDS,
    485 
    486       // FMA nodes.
    487       // We use the target independent ISD::FMA for the non-inverted case.
    488       FNMADD,
    489       FMSUB,
    490       FNMSUB,
    491       FMADDSUB,
    492       FMSUBADD,
    493 
    494       // FMA with rounding mode.
    495       FMADD_RND,
    496       FNMADD_RND,
    497       FMSUB_RND,
    498       FNMSUB_RND,
    499       FMADDSUB_RND,
    500       FMSUBADD_RND,
    501 
    502       // Compress and expand.
    503       COMPRESS,
    504       EXPAND,
    505 
    506       // Bits shuffle
    507       VPSHUFBITQMB,
    508 
    509       // Convert Signed/Unsigned Integer to Floating-Point Value with rounding mode.
    510       SINT_TO_FP_RND, UINT_TO_FP_RND,
    511       SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
    512 
    513       // Vector float/double to signed/unsigned integer.
    514       CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
    515       // Scalar float/double to signed/unsigned integer.
    516       CVTS2SI_RND, CVTS2UI_RND,
    517 
    518       // Vector float/double to signed/unsigned integer with truncation.
    519       CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND,
    520       // Scalar float/double to signed/unsigned integer with truncation.
    521       CVTTS2SI_RND, CVTTS2UI_RND,
    522 
    523       // Vector signed/unsigned integer to float/double.
    524       CVTSI2P, CVTUI2P,
    525 
    526       // Save xmm argument registers to the stack, according to %al. An operator
    527       // is needed so that this can be expanded with control flow.
    528       VASTART_SAVE_XMM_REGS,
    529 
    530       // Windows's _chkstk call to do stack probing.
    531       WIN_ALLOCA,
    532 
    533       // For allocating variable amounts of stack space when using
    534       // segmented stacks. Checks if the current stacklet has enough space, and
    535       // falls back to heap allocation if not.
    536       SEG_ALLOCA,
    537 
    538       // Memory barriers.
    539       MEMBARRIER,
    540       MFENCE,
    541 
    542       // Store FP status word into i16 register.
    543       FNSTSW16r,
    544 
    545       // Store contents of %ah into %eflags.
    546       SAHF,
    547 
    548       // Get a random integer and indicate whether it is valid in CF.
    549       RDRAND,
    550 
    551       // Get a NIST SP800-90B & C compliant random integer and
    552       // indicate whether it is valid in CF.
    553       RDSEED,
    554 
    555       // SSE42 string comparisons.
    556       // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
    557       // will emit one or two instructions based on which results are used. If
    558       // both flags and index/mask are used, this allows us to use a single
    559       // instruction since we won't have to pick an opcode for flags. Instead we
    560       // can rely on the DAG to CSE everything and decide at isel.
    561       PCMPISTR,
    562       PCMPESTR,
    563 
    564       // Test if in transactional execution.
    565       XTEST,
    566 
    567       // ERI instructions.
    568       RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
    569 
    570       // Conversions between float and half-float.
    571       CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,
    572 
    573       // Galois Field Arithmetic Instructions
    574       GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,
    575 
    576       // LWP insert record.
    577       LWPINS,
    578 
    579       // User level wait
    580       UMWAIT, TPAUSE,
    581 
    582       // Compare and swap.
    583       LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
    584       LCMPXCHG8_DAG,
    585       LCMPXCHG16_DAG,
    586       LCMPXCHG8_SAVE_EBX_DAG,
    587       LCMPXCHG16_SAVE_RBX_DAG,
    588 
    589       /// LOCK-prefixed arithmetic read-modify-write instructions.
    590       /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
    591       LADD, LSUB, LOR, LXOR, LAND, LINC, LDEC,
    592 
    593       // Load, scalar_to_vector, and zero extend.
    594       VZEXT_LOAD,
    595 
    596       // Store FP control word into i16 memory.
    597       FNSTCW16m,
    598 
    599       /// This instruction implements FP_TO_SINT with the
    600       /// integer destination in memory and a FP reg source.  This corresponds
    601       /// to the X86::FIST*m instructions and the rounding mode change stuff. It
    602       /// has two inputs (token chain and address) and two outputs (int value
    603       /// and token chain).
    604       FP_TO_INT16_IN_MEM,
    605       FP_TO_INT32_IN_MEM,
    606       FP_TO_INT64_IN_MEM,
    607 
    608       /// This instruction implements SINT_TO_FP with the
    609       /// integer source in memory and FP reg result.  This corresponds to the
    610       /// X86::FILD*m instructions. It has three inputs (token chain, address,
    611       /// and source type) and two outputs (FP value and token chain). FILD_FLAG
    612       /// also produces a flag.
    613       FILD,
    614       FILD_FLAG,
    615 
    616       /// This instruction implements an extending load to FP stack slots.
    617       /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
    618       /// operand, ptr to load from, and a ValueType node indicating the type
    619       /// to load to.
    620       FLD,
    621 
    622       /// This instruction implements a truncating store to FP stack
    623       /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
    624       /// chain operand, value to store, address, and a ValueType to store it
    625       /// as.
    626       FST,
    627 
    628       /// This instruction grabs the address of the next argument
    629       /// from a va_list. (reads and modifies the va_list in memory)
    630       VAARG_64,
    631 
    632       // Vector truncating store with unsigned/signed saturation
    633       VTRUNCSTOREUS, VTRUNCSTORES,
    634       // Vector truncating masked store with unsigned/signed saturation
    635       VMTRUNCSTOREUS, VMTRUNCSTORES,
    636 
    637       // X86 specific gather and scatter
    638       MGATHER, MSCATTER,
    639 
    640       // WARNING: Do not add anything at the end unless you want the node to
    641       // have a memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE, all
    642       // opcodes will be treated as target memory ops!
    643     };
    644   } // end namespace X86ISD
    645 
    646   /// Define some predicates that are used for node matching.
    647   namespace X86 {
    648     /// Returns true if Elt is a constant zero or floating point constant +0.0.
    649     bool isZeroNode(SDValue Elt);
    650 
    651     /// Returns true if the given offset can
    652     /// fit into the displacement field of the instruction.
    653     bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
    654                                       bool hasSymbolicDisplacement = true);
    655 
    656     /// Determines whether the callee is required to pop its
    657     /// own arguments. Callee pop is necessary to support tail calls.
    658     bool isCalleePop(CallingConv::ID CallingConv,
    659                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
    660 
    661   } // end namespace X86
    662 
    663   //===--------------------------------------------------------------------===//
    664   //  X86 Implementation of the TargetLowering interface
    665   class X86TargetLowering final : public TargetLowering {
    666   public:
    667     explicit X86TargetLowering(const X86TargetMachine &TM,
    668                                const X86Subtarget &STI);
    669 
    670     unsigned getJumpTableEncoding() const override;
    671     bool useSoftFloat() const override;
    672 
    673     void markLibCallAttributes(MachineFunction *MF, unsigned CC,
    674                                ArgListTy &Args) const override;
    675 
    676     MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
    677       return MVT::i8;
    678     }
    679 
    680     const MCExpr *
    681     LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
    682                               const MachineBasicBlock *MBB, unsigned uid,
    683                               MCContext &Ctx) const override;
    684 
    685     /// Returns relocation base for the given PIC jumptable.
    686     SDValue getPICJumpTableRelocBase(SDValue Table,
    687                                      SelectionDAG &DAG) const override;
    688     const MCExpr *
    689     getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
    690                                  unsigned JTI, MCContext &Ctx) const override;
    691 
    692     /// Return the desired alignment for ByVal aggregate
    693     /// function arguments in the caller parameter area. For X86, aggregates
    694     /// that contain SSE vectors are placed at 16-byte boundaries while the rest
    695     /// are at 4-byte boundaries.
    696     unsigned getByValTypeAlignment(Type *Ty,
    697                                    const DataLayout &DL) const override;
    698 
    699     /// Returns the target specific optimal type for load
    700     /// and store operations as a result of memset, memcpy, and memmove
    701     /// lowering. If DstAlign is zero that means it's safe to destination
    702     /// lowering. If DstAlign is zero that means the destination
    703     /// means there isn't a need to check it against alignment requirement,
    704     /// probably because the source does not need to be loaded. If 'IsMemset' is
    705     /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
    706     /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
    707     /// source is constant so it does not need to be loaded.
    708     /// It returns EVT::Other if the type should be determined using generic
    709     /// target-independent logic.
    710     EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
    711                             bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
    712                             MachineFunction &MF) const override;
    713 
    714     /// Returns true if it's safe to use load / store of the
    715     /// specified type to expand memcpy / memset inline. This is mostly true
    716     /// for all types except for some special cases. For example, on X86
    717     /// targets without SSE2 f64 load / store are done with fldl / fstpl which
    718     /// also does type conversion. Note the specified type doesn't have to be
    719     /// legal as the hook is used before type legalization.
    720     bool isSafeMemOpType(MVT VT) const override;
    721 
    722     /// Returns true if the target allows unaligned memory accesses of the
    723     /// specified type. Returns whether it is "fast" in the last argument.
    724     bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
    725                                        bool *Fast) const override;
    726 
    727     /// Provide custom lowering hooks for some operations.
    728     ///
    729     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
    730 
    731     /// Places new result values for the node in Results (their number
    732     /// and types must exactly match those of the original return values of
    733     /// the node), or leaves Results empty, which indicates that the node is not
    734     /// to be custom lowered after all.
    735     void LowerOperationWrapper(SDNode *N,
    736                                SmallVectorImpl<SDValue> &Results,
    737                                SelectionDAG &DAG) const override;
    738 
    739     /// Replace the results of a node with an illegal result
    740     /// type with new values built out of custom code.
    741     ///
    742     void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
    743                             SelectionDAG &DAG) const override;
    744 
    745     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
    746 
    747     // Return true if it is profitable to combine a BUILD_VECTOR with a
    748     // stride-pattern to a shuffle and a truncate.
    749     // Example of such a combine:
    750     // v4i32 build_vector((extract_elt V, 1),
    751     //                    (extract_elt V, 3),
    752     //                    (extract_elt V, 5),
    753     //                    (extract_elt V, 7))
    754     //  -->
    755     // v4i32 truncate (bitcast (shuffle<1,u,3,u,5,u,7,u> V, u) to
    756     // v4i64)
    757     bool isDesirableToCombineBuildVectorToShuffleTruncate(
    758         ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;
    759 
    760     /// Return true if the target has native support for
    761     /// the specified value type and it is 'desirable' to use the type for the
    762     /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
    763     /// instruction encodings are longer and some i16 instructions are slow.
    764     bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
    765 
    766     /// Return true if the target has native support for the
    767     /// specified value type and it is 'desirable' to use the type. e.g. On x86
    768     /// i16 is legal, but undesirable since i16 instruction encodings are longer
    769     /// and some i16 instructions are slow.
    770     bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
    771 
    772     MachineBasicBlock *
    773     EmitInstrWithCustomInserter(MachineInstr &MI,
    774                                 MachineBasicBlock *MBB) const override;
    775 
    776     /// This method returns the name of a target specific DAG node.
    777     const char *getTargetNodeName(unsigned Opcode) const override;
    778 
    779     bool mergeStoresAfterLegalization() const override { return true; }
    780 
    781     bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
    782                           const SelectionDAG &DAG) const override;
    783 
    784     bool isCheapToSpeculateCttz() const override;
    785 
    786     bool isCheapToSpeculateCtlz() const override;
    787 
    788     bool isCtlzFast() const override;
    789 
    790     bool hasBitPreservingFPLogic(EVT VT) const override {
    791       return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
    792     }
    793 
    794     bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
    795       // If the pair to store is a mixture of float and int values, we will
    796       // save two bitwise instructions and one float-to-int instruction and
    797       // increase one store instruction. There is potentially a more
    798       // significant benefit because it avoids the float->int domain switch
    799       // for the input value, so it is more likely a win.
    800       if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
    801           (LTy.isInteger() && HTy.isFloatingPoint()))
    802         return true;
    803       // If the pair only contains int values, we will save two bitwise
    804       // instructions and increase one store instruction (costing one more
    805       // store buffer). Since the benefit is less clear, we leave such
    806       // pairs out until we have a test case to prove it is a win.
    807       return false;
    808     }
    809 
    810     bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
    811 
    812     bool hasAndNotCompare(SDValue Y) const override;
    813 
    814     bool hasAndNot(SDValue Y) const override;
    815 
    816     bool preferShiftsToClearExtremeBits(SDValue Y) const override;
    817 
    818     bool
    819     shouldTransformSignedTruncationCheck(EVT XVT,
    820                                          unsigned KeptBits) const override {
    821       // For vectors, we don't have a preference.
    822       if (XVT.isVector())
    823         return false;
    824 
    825       auto VTIsOk = [](EVT VT) -> bool {
    826         return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
    827                VT == MVT::i64;
    828       };
    829 
    830       // We are ok with KeptBitsVT being byte/word/dword, which is what MOVS supports.
    831       // XVT will be larger than KeptBitsVT.
    832       MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
    833       return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
    834     }
    835 
    836     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
    837       return VT.isScalarInteger();
    838     }
    839 
    840     /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
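            /// e.g. a 16-byte equality test can be lowered to PCMPEQB + PMOVMSKB plus a
            /// single compare of the resulting mask against 0xFFFF, instead of a chain of
            /// scalar loads and compares.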
    841     MVT hasFastEqualityCompare(unsigned NumBits) const override;
    842 
    843     /// Allow multiple load pairs per block for smaller and faster code.
    844     unsigned getMemcmpEqZeroLoadsPerBlock() const override {
    845       return 2;
    846     }
    847 
    848     /// Return the value type to use for ISD::SETCC.
    849     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
    850                            EVT VT) const override;
    851 
    852     bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
    853                                       TargetLoweringOpt &TLO) const override;
    854 
    855     /// Determine which of the bits specified in Mask are known to be either
    856     /// zero or one and return them in the KnownZero/KnownOne bitsets.
    857     void computeKnownBitsForTargetNode(const SDValue Op,
    858                                        KnownBits &Known,
    859                                        const APInt &DemandedElts,
    860                                        const SelectionDAG &DAG,
    861                                        unsigned Depth = 0) const override;
    862 
    863     /// Determine the number of bits in the operation that are sign bits.
    864     unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
    865                                              const APInt &DemandedElts,
    866                                              const SelectionDAG &DAG,
    867                                              unsigned Depth) const override;
    868 
    869     SDValue unwrapAddress(SDValue N) const override;
    870 
    871     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
    872                         int64_t &Offset) const override;
    873 
    874     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
    875 
    876     bool ExpandInlineAsm(CallInst *CI) const override;
    877 
    878     ConstraintType getConstraintType(StringRef Constraint) const override;
    879 
    880     /// Examine constraint string and operand type and determine a weight value.
    881     /// The operand object must already have been set up with the operand type.
    882     ConstraintWeight
    883       getSingleConstraintMatchWeight(AsmOperandInfo &info,
    884                                      const char *constraint) const override;
    885 
    886     const char *LowerXConstraint(EVT ConstraintVT) const override;
    887 
    888     /// Lower the specified operand into the Ops vector. If it is invalid, don't
    889     /// add anything to Ops. If hasMemory is true it means one of the asm
    890     /// constraints of the inline asm instruction being processed is 'm'.
    891     void LowerAsmOperandForConstraint(SDValue Op,
    892                                       std::string &Constraint,
    893                                       std::vector<SDValue> &Ops,
    894                                       SelectionDAG &DAG) const override;
    895 
    896     unsigned
    897     getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
    898       if (ConstraintCode == "i")
    899         return InlineAsm::Constraint_i;
    900       else if (ConstraintCode == "o")
    901         return InlineAsm::Constraint_o;
    902       else if (ConstraintCode == "v")
    903         return InlineAsm::Constraint_v;
    904       else if (ConstraintCode == "X")
    905         return InlineAsm::Constraint_X;
    906       return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
    907     }
    908 
    909     /// Given a physical register constraint
    910     /// (e.g. {edx}), return the register number and the register class for the
    911     /// register.  This should only be used for C_Register constraints.  On
    912     /// error, this returns a register number of 0.
    913     std::pair<unsigned, const TargetRegisterClass *>
    914     getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
    915                                  StringRef Constraint, MVT VT) const override;
    916 
    917     /// Return true if the addressing mode represented
    918     /// by AM is legal for this target, for a load/store of the specified type.
    919     bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
    920                                Type *Ty, unsigned AS,
    921                                Instruction *I = nullptr) const override;
    922 
    923     /// Return true if the specified immediate is legal
    924     /// as an icmp immediate; that is, the target has icmp instructions which can
    925     /// compare a register against the immediate without having to materialize
    926     /// the immediate into a register.
    927     bool isLegalICmpImmediate(int64_t Imm) const override;
    928 
    929     /// Return true if the specified immediate is legal
    930     /// as an add immediate; that is, the target has add instructions which can
    931     /// add a register and the immediate without having to materialize
    932     /// the immediate into a register.
    933     bool isLegalAddImmediate(int64_t Imm) const override;
    934 
    935     /// Return the cost of the scaling factor used in the addressing
    936     /// mode represented by AM for this target, for a load/store
    937     /// of the specified type.
    938     /// If the AM is supported, the return value must be >= 0.
    939     /// If the AM is not supported, it returns a negative value.
    940     int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
    941                              unsigned AS) const override;
    942 
    943     bool isVectorShiftByScalarCheap(Type *Ty) const override;
    944 
    945     /// Return true if it's free to truncate a value of
    946     /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
    947     /// register EAX to i16 by referencing its sub-register AX.
    948     bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
    949     bool isTruncateFree(EVT VT1, EVT VT2) const override;
    950 
    951     bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
    952 
    953     /// Return true if any actual instruction that defines a
    954     /// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
    955     /// register. This does not necessarily include registers defined in
    956     /// unknown ways, such as incoming arguments, or copies from unknown
    957     /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
    958     /// does not necessarily apply to truncate instructions. e.g. on x86-64,
    959     /// all instructions that define 32-bit values implicitly zero-extend the
    960     /// result out to 64 bits.
    961     bool isZExtFree(Type *Ty1, Type *Ty2) const override;
    962     bool isZExtFree(EVT VT1, EVT VT2) const override;
    963     bool isZExtFree(SDValue Val, EVT VT2) const override;
    964 
    965     /// Return true if folding a vector load into ExtVal (a sign, zero, or any
    966     /// extend node) is profitable.
    967     bool isVectorLoadExtDesirable(SDValue) const override;
    968 
    969     /// Return true if an FMA operation is faster than a pair of fmul and fadd
    970     /// instructions. fmuladd intrinsics will be expanded to FMAs when this
    971     /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
    972     bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
    973 
    974     /// Return true if it's profitable to narrow
    975     /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
    976     /// from i32 to i8 but not from i32 to i16.
    977     bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
    978 
    979     /// Given an intrinsic, checks if on the target the intrinsic will need to map
    980     /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
    981     /// true and stores the intrinsic information into the IntrinsicInfo that was
    982     /// passed to the function.
    983     bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
    984                             MachineFunction &MF,
    985                             unsigned Intrinsic) const override;
    986 
    987     /// Returns true if the target can instruction select the
    988     /// specified FP immediate natively. If false, the legalizer will
    989     /// materialize the FP immediate as a load from a constant pool.
    990     bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
    991 
    992     /// Targets can use this to indicate that they only support *some*
    993     /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
    994     /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
    995     /// be legal.
    996     bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
    997 
    998     /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
    999     /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
   1000     /// constant pool entry.
   1001     bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
   1002 
   1003     /// Returns true if lowering to a jump table is allowed.
   1004     bool areJTsAllowed(const Function *Fn) const override;
   1005 
   1006     /// If true, then instruction selection should
   1007     /// seek to shrink the FP constant of the specified type to a smaller type
   1008     /// in order to save space and / or reduce runtime.
   1009     bool ShouldShrinkFPConstant(EVT VT) const override {
   1010       // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
   1011       // expensive than a straight movsd. On the other hand, it's important to
   1012       // shrink long double fp constant since fldt is very slow.
   1013       return !X86ScalarSSEf64 || VT == MVT::f80;
   1014     }
   1015 
   1016     /// Return true if we believe it is correct and profitable to reduce the
   1017     /// load node to a smaller type.
   1018     bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
   1019                                EVT NewVT) const override;
   1020 
   1021     /// Return true if the specified scalar FP type is computed in an SSE
   1022     /// register, not on the X87 floating point stack.
   1023     bool isScalarFPTypeInSSEReg(EVT VT) const {
   1024       return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
   1025              (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
   1026     }
   1027 
   1028     /// Returns true if it is beneficial to convert a load of a constant
   1029     /// to just the constant itself.
   1030     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
   1031                                            Type *Ty) const override;
   1032 
   1033     bool convertSelectOfConstantsToMath(EVT VT) const override;
   1034 
   1035     /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
   1036     /// with this index.
   1037     bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
   1038                                  unsigned Index) const override;
   1039 
   1040     bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
   1041                                       unsigned AddrSpace) const override {
   1042       // If we can replace more than 2 scalar stores, there will be a reduction
   1043       // in instructions even after we add a vector constant load.
   1044       return NumElem > 2;
   1045     }
   1046 
   1047     bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;
   1048 
   1049     /// Intel processors have a unified instruction and data cache
   1050     const char * getClearCacheBuiltinName() const override {
   1051       return nullptr; // nothing to do, move along.
   1052     }
   1053 
   1054     unsigned getRegisterByName(const char* RegName, EVT VT,
   1055                                SelectionDAG &DAG) const override;
   1056 
   1057     /// If a physical register, this returns the register that receives the
   1058     /// exception address on entry to an EH pad.
   1059     unsigned
   1060     getExceptionPointerRegister(const Constant *PersonalityFn) const override;
   1061 
   1062     /// If a physical register, this returns the register that receives the
   1063     /// exception typeid on entry to a landing pad.
   1064     unsigned
   1065     getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
   1066 
   1067     virtual bool needsFixedCatchObjects() const override;
   1068 
   1069     /// This method returns a target specific FastISel object,
   1070     /// or null if the target does not support "fast" ISel.
   1071     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
   1072                              const TargetLibraryInfo *libInfo) const override;
   1073 
   1074     /// If the target has a standard location for the stack protector cookie,
   1075     /// returns the address of that location. Otherwise, returns nullptr.
   1076     Value *getIRStackGuard(IRBuilder<> &IRB) const override;
   1077 
   1078     bool useLoadStackGuardNode() const override;
   1079     bool useStackGuardXorFP() const override;
   1080     void insertSSPDeclarations(Module &M) const override;
   1081     Value *getSDagStackGuard(const Module &M) const override;
   1082     Value *getSSPStackGuardCheck(const Module &M) const override;
   1083     SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
   1084                                 const SDLoc &DL) const override;
   1085 
   1086 
   1087     /// Return true if the target stores SafeStack pointer at a fixed offset in
   1088     /// some non-standard address space, and populates the address space and
   1089     /// offset as appropriate.
   1090     Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
   1091 
   1092     SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
   1093                       SelectionDAG &DAG) const;
   1094 
   1095     bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
   1096 
   1097     /// Customize the preferred legalization strategy for certain types.
   1098     LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
   1099 
   1100     MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
   1101                                       EVT VT) const override;
   1102 
   1103     unsigned getNumRegistersForCallingConv(LLVMContext &Context,
   1104                                            CallingConv::ID CC,
   1105                                            EVT VT) const override;
   1106 
   1107     bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
   1108 
   1109     bool supportSwiftError() const override;
   1110 
   1111     StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
   1112 
   1113     bool hasVectorBlend() const override { return true; }
   1114 
   1115     unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
   1116 
   1117     /// Lower interleaved load(s) into target specific
   1118     /// instructions/intrinsics.
   1119     bool lowerInterleavedLoad(LoadInst *LI,
   1120                               ArrayRef<ShuffleVectorInst *> Shuffles,
   1121                               ArrayRef<unsigned> Indices,
   1122                               unsigned Factor) const override;
   1123 
   1124     /// Lower interleaved store(s) into target specific
   1125     /// instructions/intrinsics.
   1126     bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
   1127                                unsigned Factor) const override;
   1128 
   1129     SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
   1130                                    SDValue Addr, SelectionDAG &DAG)
   1131                                    const override;
   1132 
   1133   protected:
   1134     std::pair<const TargetRegisterClass *, uint8_t>
   1135     findRepresentativeClass(const TargetRegisterInfo *TRI,
   1136                             MVT VT) const override;
   1137 
   1138   private:
   1139     /// Keep a reference to the X86Subtarget around so that we can
   1140     /// make the right decision when generating code for different targets.
   1141     const X86Subtarget &Subtarget;
   1142 
   1143     /// Select between SSE or x87 floating point ops.
   1144     /// When SSE is available, use it for f32 operations.
   1145     /// When SSE2 is available, use it for f64 operations.
   1146     bool X86ScalarSSEf32;
   1147     bool X86ScalarSSEf64;
   1148 
   1149     /// A list of legal FP immediates.
   1150     std::vector<APFloat> LegalFPImmediates;
   1151 
   1152     /// Indicate that this x86 target can instruction
   1153     /// select the specified FP immediate natively.
   1154     void addLegalFPImmediate(const APFloat& Imm) {
   1155       LegalFPImmediates.push_back(Imm);
   1156     }
   1157 
   1158     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
   1159                             CallingConv::ID CallConv, bool isVarArg,
   1160                             const SmallVectorImpl<ISD::InputArg> &Ins,
   1161                             const SDLoc &dl, SelectionDAG &DAG,
   1162                             SmallVectorImpl<SDValue> &InVals,
   1163                             uint32_t *RegMask) const;
   1164     SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
   1165                              const SmallVectorImpl<ISD::InputArg> &ArgInfo,
   1166                              const SDLoc &dl, SelectionDAG &DAG,
   1167                              const CCValAssign &VA, MachineFrameInfo &MFI,
   1168                              unsigned i) const;
   1169     SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
   1170                              const SDLoc &dl, SelectionDAG &DAG,
   1171                              const CCValAssign &VA,
   1172                              ISD::ArgFlagsTy Flags) const;
   1173 
   1174     // Call lowering helpers.
   1175 
   1176     /// Check whether the call is eligible for tail call optimization. Targets
   1177     /// that want to do tail call optimization should implement this function.
   1178     bool IsEligibleForTailCallOptimization(SDValue Callee,
   1179                                            CallingConv::ID CalleeCC,
   1180                                            bool isVarArg,
   1181                                            bool isCalleeStructRet,
   1182                                            bool isCallerStructRet,
   1183                                            Type *RetTy,
   1184                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
   1185                                     const SmallVectorImpl<SDValue> &OutVals,
   1186                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   1187                                            SelectionDAG& DAG) const;
   1188     SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
   1189                                     SDValue Chain, bool IsTailCall,
   1190                                     bool Is64Bit, int FPDiff,
   1191                                     const SDLoc &dl) const;
   1192 
   1193     unsigned GetAlignedArgumentStackSize(unsigned StackSize,
   1194                                          SelectionDAG &DAG) const;
   1195 
   1196     unsigned getAddressSpace(void) const;
   1197 
   1198     std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   1199                                                bool isSigned,
   1200                                                bool isReplace) const;
   1201 
   1202     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   1203     SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
   1204     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   1205     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   1206 
    unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
                                  const unsigned char OpFlags = 0) const;
    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
                               int64_t Offset, SelectionDAG &DAG) const;
    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;

    SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
    SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;

    SDValue
    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                         const SmallVectorImpl<ISD::InputArg> &Ins,
                         const SDLoc &dl, SelectionDAG &DAG,
                         SmallVectorImpl<SDValue> &InVals) const override;
    SDValue LowerCall(CallLoweringInfo &CLI,
                      SmallVectorImpl<SDValue> &InVals) const override;

    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        const SmallVectorImpl<SDValue> &OutVals,
                        const SDLoc &dl, SelectionDAG &DAG) const override;

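    /// Split CSR saving/restoring is only supported for functions using the
    /// CXX_FAST_TLS calling convention that are also marked nounwind.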
    bool supportSplitCSR(MachineFunction *MF) const override {
      return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
          MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
    }
    void initializeSplitCSR(MachineBasicBlock *Entry) const override;
    void insertCopiesSplitCSR(
      MachineBasicBlock *Entry,
      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;

    bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;

    bool mayBeEmittedAsTailCall(const CallInst *CI) const override;

    EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
                            ISD::NodeType ExtendKind) const override;

    bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
                        bool isVarArg,
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        LLVMContext &Context) const override;

    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;

    TargetLoweringBase::AtomicExpansionKind
    shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
    bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
    TargetLoweringBase::AtomicExpansionKind
    shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;

    LoadInst *
    lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

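    // Presumably true when MemType is too wide for a plain CMPXCHG and the
    // double-width CMPXCHG8B/CMPXCHG16B instruction is needed instead.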
    bool needsCmpXchgNb(Type *MemType) const;

    void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
                                MachineBasicBlock *DispatchBB, int FI) const;

    /// Utility function to emit the low-level va_arg code for X86-64.
    MachineBasicBlock *
    EmitVAARG64WithCustomInserter(MachineInstr &MI,
                                  MachineBasicBlock *MBB) const;

    /// Utility function to emit the xmm reg save portion of va_start.
    MachineBasicBlock *
    EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
                                             MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
                                                 MachineInstr &MI2,
                                                 MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
                                         MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
                                           MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                           MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
                                           MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;

    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
                                        MachineBasicBlock *MBB) const;

    void emitSetJmpShadowStackFix(MachineInstr &MI,
                                  MachineBasicBlock *MBB) const;

    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
                                         MachineBasicBlock *MBB) const;

    MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
                                                 MachineBasicBlock *MBB) const;

    MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const;

    MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
                                             MachineBasicBlock *MBB) const;

    /// Emit nodes that will be selected as "test Op0,Op0", or something
    /// equivalent, for use with the given x86 condition code.
    SDValue EmitTest(SDValue Op0, unsigned X86CC, const SDLoc &dl,
                     SelectionDAG &DAG) const;

    /// Emit nodes that will be selected as "cmp Op0,Op1", or something
    /// equivalent, for use with the given x86 condition code.
    SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl,
                    SelectionDAG &DAG) const;

    /// Convert a comparison if required by the subtarget.
    SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;

    /// Check if replacement of SQRT with RSQRT should be disabled.
    bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;

    /// Use rsqrt* to speed up sqrt calculations.
    SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                            int &RefinementSteps, bool &UseOneConstNR,
                            bool Reciprocal) const override;

    /// Use rcp* to speed up fdiv calculations.
    SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                             int &RefinementSteps) const override;

    /// Reassociate floating point divisions into multiply by reciprocal.
    unsigned combineRepeatedFPDivisors() const override;
  };

  namespace X86 {
    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                             const TargetLibraryInfo *libInfo);
  } // end namespace X86

  // Base class for all X86 non-masked store operations.
  class X86StoreSDNode : public MemSDNode {
  public:
    X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
                   SDVTList VTs, EVT MemVT,
                   MachineMemOperand *MMO)
      : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
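    // Operand 0 is the chain; the stored value and base pointer follow.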
    const SDValue &getValue() const { return getOperand(1); }
    const SDValue &getBasePtr() const { return getOperand(2); }

    static bool classof(const SDNode *N) {
      return N->getOpcode() == X86ISD::VTRUNCSTORES ||
             N->getOpcode() == X86ISD::VTRUNCSTOREUS;
    }
  };

  // Base class for all X86 masked store operations.
  // The class has the same order of operands as MaskedStoreSDNode for
  // convenience.
  class X86MaskedStoreSDNode : public MemSDNode {
  public:
    X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
                         const DebugLoc &dl, SDVTList VTs, EVT MemVT,
                         MachineMemOperand *MMO)
      : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}

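    // Mirrors MaskedStoreSDNode: operand 0 is the chain, followed by the base
    // pointer, the mask and the value being stored.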
    const SDValue &getBasePtr() const { return getOperand(1); }
    const SDValue &getMask()    const { return getOperand(2); }
    const SDValue &getValue()   const { return getOperand(3); }

    static bool classof(const SDNode *N) {
      return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
             N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
    }
  };

  // X86 Truncating Store with Signed saturation.
  class TruncSStoreSDNode : public X86StoreSDNode {
  public:
    TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
                      SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
      : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}

    static bool classof(const SDNode *N) {
      return N->getOpcode() == X86ISD::VTRUNCSTORES;
    }
  };

  // X86 Truncating Store with Unsigned saturation.
  class TruncUSStoreSDNode : public X86StoreSDNode {
  public:
    TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
                       SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
      : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}

    static bool classof(const SDNode *N) {
      return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
    }
  };

  // X86 Truncating Masked Store with Signed saturation.
  class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
  public:
    MaskedTruncSStoreSDNode(unsigned Order,
                            const DebugLoc &dl, SDVTList VTs, EVT MemVT,
                            MachineMemOperand *MMO)
      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}

    static bool classof(const SDNode *N) {
      return N->getOpcode() == X86ISD::VMTRUNCSTORES;
    }
  };

  // X86 Truncating Masked Store with Unsigned saturation.
  class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
  public:
    MaskedTruncUSStoreSDNode(unsigned Order,
                             const DebugLoc &dl, SDVTList VTs, EVT MemVT,
                             MachineMemOperand *MMO)
      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}

    static bool classof(const SDNode *N) {
      return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
    }
  };

  // X86 specific Gather/Scatter nodes.
  // The class has the same order of operands as MaskedGatherScatterSDNode for
  // convenience.
  class X86MaskedGatherScatterSDNode : public MemSDNode {
  public:
    X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
                                 const DebugLoc &dl, SDVTList VTs, EVT MemVT,
                                 MachineMemOperand *MMO)
        : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}

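    // Mirrors MaskedGatherScatterSDNode: operand 0 is the chain, followed by
    // the source/passthru value, mask, base pointer, index and scale.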
    const SDValue &getBasePtr() const { return getOperand(3); }
    const SDValue &getIndex()   const { return getOperand(4); }
    const SDValue &getMask()    const { return getOperand(2); }
    const SDValue &getValue()   const { return getOperand(1); }
    const SDValue &getScale()   const { return getOperand(5); }

    static bool classof(const SDNode *N) {
      return N->getOpcode() == X86ISD::MGATHER ||
             N->getOpcode() == X86ISD::MSCATTER;
    }
  };

  class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
  public:
    X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
                          EVT MemVT, MachineMemOperand *MMO)
        : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
                                       MMO) {}

    static bool classof(const SDNode *N) {
      return N->getOpcode() == X86ISD::MGATHER;
    }
  };

  class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
  public:
    X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
                           EVT MemVT, MachineMemOperand *MMO)
        : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
                                       MMO) {}

    static bool classof(const SDNode *N) {
      return N->getOpcode() == X86ISD::MSCATTER;
    }
  };

  /// Generate unpacklo/unpackhi shuffle mask.
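  /// For example, for VT = MVT::v4i32 this produces <0,4,1,5> (Lo, !Unary),
  /// <2,6,3,7> (!Lo, !Unary), <0,0,1,1> (Lo, Unary) and <2,2,3,3> (!Lo, Unary).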
  template <typename T = int>
  void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo,
                               bool Unary) {
    assert(Mask.empty() && "Expected an empty shuffle mask vector");
    int NumElts = VT.getVectorNumElements();
    int NumEltsInLane = 128 / VT.getScalarSizeInBits();
    for (int i = 0; i < NumElts; ++i) {
      unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
      int Pos = (i % NumEltsInLane) / 2 + LaneStart;
      Pos += (Unary ? 0 : NumElts * (i % 2));
      Pos += (Lo ? 0 : NumEltsInLane / 2);
      Mask.push_back(Pos);
    }
  }

  /// Helper function to scale a shuffle or target shuffle mask, replacing each
  /// mask index with the scaled sequential indices for an equivalent narrowed
  /// mask. This is the reverse process to canWidenShuffleElements, but can
  /// always succeed.
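  /// For example, with Scale = 2 the mask <0, -1, 3> becomes
  /// <0, 1, -1, -1, 6, 7>.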
  template <typename T>
  void scaleShuffleMask(int Scale, ArrayRef<T> Mask,
                        SmallVectorImpl<T> &ScaledMask) {
    assert(0 < Scale && "Unexpected scaling factor");
    int NumElts = Mask.size();
    ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);

    for (int i = 0; i != NumElts; ++i) {
      int M = Mask[i];

      // Repeat sentinel values in every mask element.
      if (M < 0) {
        for (int s = 0; s != Scale; ++s)
          ScaledMask[(Scale * i) + s] = M;
        continue;
      }

      // Scale mask element and increment across each mask element.
      for (int s = 0; s != Scale; ++s)
        ScaledMask[(Scale * i) + s] = (Scale * M) + s;
    }
  }
} // end namespace llvm

#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H