      1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the interfaces that X86 uses to lower LLVM code into a
     11 // selection DAG.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "X86ISelLowering.h"
     16 #include "Utils/X86ShuffleDecode.h"
     17 #include "X86CallingConv.h"
     18 #include "X86FrameLowering.h"
     19 #include "X86InstrBuilder.h"
     20 #include "X86MachineFunctionInfo.h"
     21 #include "X86TargetMachine.h"
     22 #include "X86TargetObjectFile.h"
     23 #include "llvm/ADT/SmallBitVector.h"
     24 #include "llvm/ADT/SmallSet.h"
     25 #include "llvm/ADT/Statistic.h"
     26 #include "llvm/ADT/StringExtras.h"
     27 #include "llvm/ADT/StringSwitch.h"
     28 #include "llvm/Analysis/EHPersonalities.h"
     29 #include "llvm/CodeGen/IntrinsicLowering.h"
     30 #include "llvm/CodeGen/MachineFrameInfo.h"
     31 #include "llvm/CodeGen/MachineFunction.h"
     32 #include "llvm/CodeGen/MachineInstrBuilder.h"
     33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     34 #include "llvm/CodeGen/MachineModuleInfo.h"
     35 #include "llvm/CodeGen/MachineRegisterInfo.h"
     36 #include "llvm/CodeGen/WinEHFuncInfo.h"
     37 #include "llvm/IR/CallSite.h"
     38 #include "llvm/IR/CallingConv.h"
     39 #include "llvm/IR/Constants.h"
     40 #include "llvm/IR/DerivedTypes.h"
     41 #include "llvm/IR/Function.h"
     42 #include "llvm/IR/GlobalAlias.h"
     43 #include "llvm/IR/GlobalVariable.h"
     44 #include "llvm/IR/Instructions.h"
     45 #include "llvm/IR/Intrinsics.h"
     46 #include "llvm/MC/MCAsmInfo.h"
     47 #include "llvm/MC/MCContext.h"
     48 #include "llvm/MC/MCExpr.h"
     49 #include "llvm/MC/MCSymbol.h"
     50 #include "llvm/Support/CommandLine.h"
     51 #include "llvm/Support/Debug.h"
     52 #include "llvm/Support/ErrorHandling.h"
     53 #include "llvm/Support/MathExtras.h"
     54 #include "llvm/Target/TargetOptions.h"
     55 #include "X86IntrinsicsInfo.h"
     56 #include <bitset>
     57 #include <numeric>
     58 #include <cctype>
     59 using namespace llvm;
     60 
     61 #define DEBUG_TYPE "x86-isel"
     62 
     63 STATISTIC(NumTailCalls, "Number of tail calls");
     64 
     65 static cl::opt<bool> ExperimentalVectorWideningLegalization(
     66     "x86-experimental-vector-widening-legalization", cl::init(false),
     67     cl::desc("Enable an experimental vector type legalization through widening "
     68              "rather than promotion."),
     69     cl::Hidden);
     70 
     71 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     72                                      const X86Subtarget &STI)
     73     : TargetLowering(TM), Subtarget(&STI) {
     74   X86ScalarSSEf64 = Subtarget->hasSSE2();
     75   X86ScalarSSEf32 = Subtarget->hasSSE1();
     76   MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
     77 
     78   // Set up the TargetLowering object.
     79 
     80   // X86 is weird. It always uses i8 for shift amounts and setcc results.
     81   setBooleanContents(ZeroOrOneBooleanContent);
     82   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
     83   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
     84 
     85   // For 64-bit, since we have so many registers, use the ILP scheduler.
     86   // For 32-bit, use the register pressure specific scheduling.
     87   // For Atom, always use ILP scheduling.
     88   if (Subtarget->isAtom())
     89     setSchedulingPreference(Sched::ILP);
     90   else if (Subtarget->is64Bit())
     91     setSchedulingPreference(Sched::ILP);
     92   else
     93     setSchedulingPreference(Sched::RegPressure);
     94   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
     95   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
     96 
     97   // Bypass expensive divides on Atom when compiling with O2.
     98   if (TM.getOptLevel() >= CodeGenOpt::Default) {
     99     if (Subtarget->hasSlowDivide32())
    100       addBypassSlowDiv(32, 8);
    101     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
    102       addBypassSlowDiv(64, 16);
    103   }
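           // addBypassSlowDiv(32, 8) arranges a run-time check: when both operands
           // of a 32-bit divide also fit in 8 bits, the much cheaper 8-bit divide
           // instruction is used instead; the (64, 16) pair does the same for
           // 64-bit divides.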
    104 
    105   if (Subtarget->isTargetKnownWindowsMSVC()) {
    106     // Setup Windows compiler runtime calls.
    107     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    108     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    109     setLibcallName(RTLIB::SREM_I64, "_allrem");
    110     setLibcallName(RTLIB::UREM_I64, "_aullrem");
    111     setLibcallName(RTLIB::MUL_I64, "_allmul");
    112     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    113     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    114     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    115     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    116     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
    117   }
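           // With these overrides a 64-bit signed divide on 32-bit Windows, for
           // example, becomes a call to the CRT helper _alldiv (stdcall) instead of
           // the default __divdi3 libcall.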
    118 
    119   if (Subtarget->isTargetDarwin()) {
    120     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    121     setUseUnderscoreSetJmp(false);
    122     setUseUnderscoreLongJmp(false);
    123   } else if (Subtarget->isTargetWindowsGNU()) {
     124     // MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
    125     setUseUnderscoreSetJmp(true);
    126     setUseUnderscoreLongJmp(false);
    127   } else {
    128     setUseUnderscoreSetJmp(true);
    129     setUseUnderscoreLongJmp(true);
    130   }
    131 
    132   // Set up the register classes.
    133   addRegisterClass(MVT::i8, &X86::GR8RegClass);
    134   addRegisterClass(MVT::i16, &X86::GR16RegClass);
    135   addRegisterClass(MVT::i32, &X86::GR32RegClass);
    136   if (Subtarget->is64Bit())
    137     addRegisterClass(MVT::i64, &X86::GR64RegClass);
    138 
    139   for (MVT VT : MVT::integer_valuetypes())
    140     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    141 
    142   // We don't accept any truncstore of integer registers.
    143   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    144   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    145   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
    146   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
    147   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
    148   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
    149 
    150   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    151 
    152   // SETOEQ and SETUNE require checking two conditions.
    153   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
    154   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
    155   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
    156   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
    157   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
    158   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
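           // After UCOMISS/UCOMISD, "ordered equal" is ZF set with PF clear, so
           // SETOEQ (and likewise SETUNE) is expanded into two setcc results
           // combined with a logical op rather than a single condition code.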
    159 
    160   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    161   // operation.
    162   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
    163   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
    164   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
    165 
    166   if (Subtarget->is64Bit()) {
    167     if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512())
    168       // f32/f64 are legal, f80 is custom.
    169       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
    170     else
    171       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
    172     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    173   } else if (!Subtarget->useSoftFloat()) {
    174     // We have an algorithm for SSE2->double, and we turn this into a
    175     // 64-bit FILD followed by conditional FADD for other targets.
    176     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    177     // We have an algorithm for SSE2, and we turn this into a 64-bit
    178     // FILD or VCVTUSI2SS/SD for other targets.
    179     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
    180   }
    181 
    182   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    183   // this operation.
    184   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
    185   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
    186 
    187   if (!Subtarget->useSoftFloat()) {
    188     // SSE has no i16 to fp conversion, only i32
    189     if (X86ScalarSSEf32) {
    190       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    191       // f32 and f64 cases are Legal, f80 case is not
    192       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    193     } else {
    194       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
    195       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    196     }
    197   } else {
    198     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    199     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
    200   }
    201 
     202   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
    203   // this operation.
    204   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
    205   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
    206 
    207   if (!Subtarget->useSoftFloat()) {
    208     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
    209     // are Legal, f80 is custom lowered.
    210     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
    211     setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
    212 
    213     if (X86ScalarSSEf32) {
    214       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    215       // f32 and f64 cases are Legal, f80 case is not
    216       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    217     } else {
    218       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    219       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    220     }
    221   } else {
    222     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    223     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
    224     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
    225   }
    226 
    227   // Handle FP_TO_UINT by promoting the destination to a larger signed
    228   // conversion.
    229   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
    230   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
    231   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
    232 
    233   if (Subtarget->is64Bit()) {
    234     if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
    235       // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
    236       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    237       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
    238     } else {
    239       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
    240       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
    241     }
    242   } else if (!Subtarget->useSoftFloat()) {
    243     // Since AVX is a superset of SSE3, only check for SSE here.
    244     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
    245       // Expand FP_TO_UINT into a select.
    246       // FIXME: We would like to use a Custom expander here eventually to do
    247       // the optimal thing for SSE vs. the default expansion in the legalizer.
    248       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    249     else
    250       // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
    251       // With SSE3 we can use fisttpll to convert to a signed i64; without
    252       // SSE, we're stuck with a fistpll.
    253       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    254 
    255     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
    256   }
    257 
    258   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
    259   if (!X86ScalarSSEf64) {
    260     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    261     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    262     if (Subtarget->is64Bit()) {
    263       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
    264       // Without SSE, i64->f64 goes through memory.
    265       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    266     }
    267   }
    268 
    269   // Scalar integer divide and remainder are lowered to use operations that
    270   // produce two results, to match the available instructions. This exposes
    271   // the two-result form to trivial CSE, which is able to combine x/y and x%y
    272   // into a single instruction.
    273   //
    274   // Scalar integer multiply-high is also lowered to use two-result
    275   // operations, to match the available instructions. However, plain multiply
    276   // (low) operations are left as Legal, as there are single-result
    277   // instructions for this in x86. Using the two-result multiply instructions
    278   // when both high and low results are needed must be arranged by dagcombine.
    279   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    280     setOperationAction(ISD::MULHS, VT, Expand);
    281     setOperationAction(ISD::MULHU, VT, Expand);
    282     setOperationAction(ISD::SDIV, VT, Expand);
    283     setOperationAction(ISD::UDIV, VT, Expand);
    284     setOperationAction(ISD::SREM, VT, Expand);
    285     setOperationAction(ISD::UREM, VT, Expand);
    286 
     287     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    288     setOperationAction(ISD::ADDC, VT, Custom);
    289     setOperationAction(ISD::ADDE, VT, Custom);
    290     setOperationAction(ISD::SUBC, VT, Custom);
    291     setOperationAction(ISD::SUBE, VT, Custom);
    292   }
    293 
    294   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
    295   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
    296   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
    297   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
    298   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
    299   setOperationAction(ISD::BR_CC            , MVT::f128,  Expand);
    300   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
    301   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
    302   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
    303   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
    304   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
    305   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
    306   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
    307   setOperationAction(ISD::SELECT_CC        , MVT::f128,  Expand);
    308   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
    309   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
    310   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
    311   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
    312   if (Subtarget->is64Bit())
    313     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    314   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
    315   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
    316   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
    317   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
    318 
    319   if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) {
     320     // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
    321     // is. We should promote the value to 64-bits to solve this.
    322     // This is what the CRT headers do - `fmodf` is an inline header
    323     // function casting to f64 and calling `fmod`.
    324     setOperationAction(ISD::FREM           , MVT::f32  , Promote);
    325   } else {
    326     setOperationAction(ISD::FREM           , MVT::f32  , Expand);
    327   }
    328 
    329   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
    330   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    331   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
    332 
    333   // Promote the i8 variants and force them on up to i32 which has a shorter
    334   // encoding.
    335   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
    336   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
    337   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
    338   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
    339   if (Subtarget->hasBMI()) {
    340     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    341     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    342     if (Subtarget->is64Bit())
    343       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
    344   } else {
    345     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    346     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    347     if (Subtarget->is64Bit())
    348       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
    349   }
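           // With BMI, TZCNT gives a well-defined result (the bit width) for a zero
           // input, so plain CTTZ stays Legal and only the ZERO_UNDEF forms are
           // expanded back to it. Without BMI we only have BSF, whose result is
           // undefined for zero, so CTTZ is custom lowered to add a zero check.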
    350 
    351   if (Subtarget->hasLZCNT()) {
    352     // When promoting the i8 variants, force them to i32 for a shorter
    353     // encoding.
    354     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    355     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    356     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    357     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    358     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    359     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    360     if (Subtarget->is64Bit())
    361       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
    362   } else {
    363     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    364     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    365     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    366     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    367     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    368     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    369     if (Subtarget->is64Bit()) {
    370       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
    371       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    372     }
    373   }
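           // Same idea as CTTZ above: LZCNT handles a zero input directly, while
           // the BSR-based fallback needs custom lowering to cope with zero and to
           // turn the bit index into a leading-zero count.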
    374 
    375   // Special handling for half-precision floating point conversions.
    376   // If we don't have F16C support, then lower half float conversions
    377   // into library calls.
    378   if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) {
    379     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    380     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    381   }
    382 
    383   // There's never any support for operations beyond MVT::f32.
    384   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
    385   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
    386   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    387   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
    388 
    389   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
    390   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
    391   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
    392   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
    393   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
    394   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
    395 
    396   if (Subtarget->hasPOPCNT()) {
    397     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    398   } else {
    399     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    400     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    401     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    402     if (Subtarget->is64Bit())
    403       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    404   }
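           // With POPCNT, i8 is promoted so it can reuse the 32-bit POPCNT
           // instruction; without it, all widths are expanded into the generic
           // shift-and-mask bit-counting sequence.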
    405 
    406   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
    407 
    408   if (!Subtarget->hasMOVBE())
    409     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
    410 
    411   // These should be promoted to a larger select which is supported.
    412   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
    413   // X86 wants to expand cmov itself.
    414   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
    415   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
    416   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
    417   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
    418   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
    419   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
    420   setOperationAction(ISD::SELECT          , MVT::f128 , Custom);
    421   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
    422   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
    423   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
    424   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
    425   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
    426   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
    427   setOperationAction(ISD::SETCC           , MVT::f128 , Custom);
    428   setOperationAction(ISD::SETCCE          , MVT::i8   , Custom);
    429   setOperationAction(ISD::SETCCE          , MVT::i16  , Custom);
    430   setOperationAction(ISD::SETCCE          , MVT::i32  , Custom);
    431   if (Subtarget->is64Bit()) {
    432     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    433     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
    434     setOperationAction(ISD::SETCCE        , MVT::i64  , Custom);
    435   }
    436   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
     437   // NOTE: EH_SJLJ_SETJMP/_LONGJMP support here is NOT intended to implement
     438   // SjLj exception handling but rather a light-weight setjmp/longjmp
     439   // replacement to support continuations, user-level threading, and so on.
     440   // As a result, no other SjLj exception interfaces are implemented, so
     441   // please don't build your own exception handling on top of them.
    442   // LLVM/Clang supports zero-cost DWARF exception handling.
    443   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
    444   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
    445 
    446   // Darwin ABI issue.
    447   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
    448   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
    449   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
    450   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
    451   if (Subtarget->is64Bit())
    452     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
    453   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
    454   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
    455   if (Subtarget->is64Bit()) {
    456     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    457     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    458     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    459     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    460     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
    461   }
     462   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
    463   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
    464   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
    465   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
    466   if (Subtarget->is64Bit()) {
    467     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    468     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    469     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
    470   }
    471 
    472   if (Subtarget->hasSSE1())
    473     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
    474 
    475   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
    476 
    477   // Expand certain atomics
    478   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    479     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    480     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    481     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    482   }
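           // ATOMIC_LOAD_SUB is custom lowered so it can be rewritten as an add of
           // the negated operand, which lets it use the LOCK-prefixed add forms
           // (x86's XADD has no subtract variant).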
    483 
    484   if (Subtarget->hasCmpxchg16b()) {
    485     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
    486   }
    487 
    488   // FIXME - use subtarget debug flags
    489   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
    490       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
    491     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
    492   }
    493 
    494   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
    495   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
    496 
    497   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
    498   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
    499 
    500   setOperationAction(ISD::TRAP, MVT::Other, Legal);
    501   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
    502 
    503   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    504   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
    505   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
    506   if (Subtarget->is64Bit()) {
    507     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    508     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
    509   } else {
    510     // TargetInfo::CharPtrBuiltinVaList
    511     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    512     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
    513   }
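           // The 64-bit SysV va_list is a struct (register save areas plus an
           // overflow area), so VAARG and VACOPY need custom lowering; the 32-bit
           // va_list is just a char pointer, so the default expansions are fine.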
    514 
    515   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    516   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
    517 
    518   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
    519 
    520   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
    521   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
    522   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
    523 
    524   if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) {
    525     // f32 and f64 use SSE.
    526     // Set up the FP register classes.
    527     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    528     addRegisterClass(MVT::f64, &X86::FR64RegClass);
    529 
    530     // Use ANDPD to simulate FABS.
    531     setOperationAction(ISD::FABS , MVT::f64, Custom);
    532     setOperationAction(ISD::FABS , MVT::f32, Custom);
    533 
    534     // Use XORP to simulate FNEG.
    535     setOperationAction(ISD::FNEG , MVT::f64, Custom);
    536     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    537 
    538     // Use ANDPD and ORPD to simulate FCOPYSIGN.
    539     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    540     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    541 
    542     // Lower this to FGETSIGNx86 plus an AND.
    543     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    544     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    545 
    546     // We don't support sin/cos/fmod
    547     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    548     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    549     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    550     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    551     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    552     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    553 
    554     // Expand FP immediates into loads from the stack, except for the special
    555     // cases we handle.
    556     addLegalFPImmediate(APFloat(+0.0)); // xorpd
    557     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    558   } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) {
    559     // Use SSE for f32, x87 for f64.
    560     // Set up the FP register classes.
    561     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    562     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    563 
    564     // Use ANDPS to simulate FABS.
    565     setOperationAction(ISD::FABS , MVT::f32, Custom);
    566 
    567     // Use XORP to simulate FNEG.
    568     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    569 
    570     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    571 
    572     // Use ANDPS and ORPS to simulate FCOPYSIGN.
    573     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    574     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    575 
    576     // We don't support sin/cos/fmod
    577     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    578     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    579     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    580 
    581     // Special cases we handle for FP constants.
    582     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    583     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    584     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    585     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    586     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    587 
    588     if (!TM.Options.UnsafeFPMath) {
    589       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    590       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    591       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    592     }
    593   } else if (!Subtarget->useSoftFloat()) {
    594     // f32 and f64 in x87.
    595     // Set up the FP register classes.
    596     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    597     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
    598 
    599     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    600     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    601     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    602     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    603 
    604     if (!TM.Options.UnsafeFPMath) {
    605       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    606       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    607       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    608       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    609       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    610       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    611     }
    612     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    613     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    614     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    615     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    616     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    617     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    618     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    619     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    620   }
    621 
    622   // We don't support FMA.
    623   setOperationAction(ISD::FMA, MVT::f64, Expand);
    624   setOperationAction(ISD::FMA, MVT::f32, Expand);
    625 
    626   // Long double always uses X87, except f128 in MMX.
    627   if (!Subtarget->useSoftFloat()) {
    628     if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
    629       addRegisterClass(MVT::f128, &X86::FR128RegClass);
    630       ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
    631       setOperationAction(ISD::FABS , MVT::f128, Custom);
    632       setOperationAction(ISD::FNEG , MVT::f128, Custom);
    633       setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    634     }
    635 
    636     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    637     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    638     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    639     {
    640       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
    641       addLegalFPImmediate(TmpFlt);  // FLD0
    642       TmpFlt.changeSign();
    643       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
    644 
    645       bool ignored;
    646       APFloat TmpFlt2(+1.0);
    647       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
    648                       &ignored);
    649       addLegalFPImmediate(TmpFlt2);  // FLD1
    650       TmpFlt2.changeSign();
    651       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    652     }
    653 
    654     if (!TM.Options.UnsafeFPMath) {
    655       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
    656       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
    657       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    658     }
    659 
    660     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    661     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    662     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    663     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    664     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    665     setOperationAction(ISD::FMA, MVT::f80, Expand);
    666   }
    667 
    668   // Always use a library call for pow.
    669   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
    670   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    671   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
    672 
    673   setOperationAction(ISD::FLOG, MVT::f80, Expand);
    674   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
    675   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
    676   setOperationAction(ISD::FEXP, MVT::f80, Expand);
    677   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
    678   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
    679   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
    680 
    681   // First set operation action for all vector types to either promote
    682   // (for widening) or expand (for scalarization). Then we will selectively
    683   // turn on ones that can be effectively codegen'd.
    684   for (MVT VT : MVT::vector_valuetypes()) {
    685     setOperationAction(ISD::ADD , VT, Expand);
    686     setOperationAction(ISD::SUB , VT, Expand);
    687     setOperationAction(ISD::FADD, VT, Expand);
    688     setOperationAction(ISD::FNEG, VT, Expand);
    689     setOperationAction(ISD::FSUB, VT, Expand);
    690     setOperationAction(ISD::MUL , VT, Expand);
    691     setOperationAction(ISD::FMUL, VT, Expand);
    692     setOperationAction(ISD::SDIV, VT, Expand);
    693     setOperationAction(ISD::UDIV, VT, Expand);
    694     setOperationAction(ISD::FDIV, VT, Expand);
    695     setOperationAction(ISD::SREM, VT, Expand);
    696     setOperationAction(ISD::UREM, VT, Expand);
    697     setOperationAction(ISD::LOAD, VT, Expand);
    698     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    699     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
    700     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    701     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
    702     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
    703     setOperationAction(ISD::FABS, VT, Expand);
    704     setOperationAction(ISD::FSIN, VT, Expand);
    705     setOperationAction(ISD::FSINCOS, VT, Expand);
    706     setOperationAction(ISD::FCOS, VT, Expand);
    707     setOperationAction(ISD::FSINCOS, VT, Expand);
    708     setOperationAction(ISD::FREM, VT, Expand);
    709     setOperationAction(ISD::FMA,  VT, Expand);
    710     setOperationAction(ISD::FPOWI, VT, Expand);
    711     setOperationAction(ISD::FSQRT, VT, Expand);
    712     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    713     setOperationAction(ISD::FFLOOR, VT, Expand);
    714     setOperationAction(ISD::FCEIL, VT, Expand);
    715     setOperationAction(ISD::FTRUNC, VT, Expand);
    716     setOperationAction(ISD::FRINT, VT, Expand);
    717     setOperationAction(ISD::FNEARBYINT, VT, Expand);
    718     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    719     setOperationAction(ISD::MULHS, VT, Expand);
    720     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    721     setOperationAction(ISD::MULHU, VT, Expand);
    722     setOperationAction(ISD::SDIVREM, VT, Expand);
    723     setOperationAction(ISD::UDIVREM, VT, Expand);
    724     setOperationAction(ISD::FPOW, VT, Expand);
    725     setOperationAction(ISD::CTPOP, VT, Expand);
    726     setOperationAction(ISD::CTTZ, VT, Expand);
    727     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    728     setOperationAction(ISD::CTLZ, VT, Expand);
    729     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    730     setOperationAction(ISD::SHL, VT, Expand);
    731     setOperationAction(ISD::SRA, VT, Expand);
    732     setOperationAction(ISD::SRL, VT, Expand);
    733     setOperationAction(ISD::ROTL, VT, Expand);
    734     setOperationAction(ISD::ROTR, VT, Expand);
    735     setOperationAction(ISD::BSWAP, VT, Expand);
    736     setOperationAction(ISD::SETCC, VT, Expand);
    737     setOperationAction(ISD::FLOG, VT, Expand);
    738     setOperationAction(ISD::FLOG2, VT, Expand);
    739     setOperationAction(ISD::FLOG10, VT, Expand);
    740     setOperationAction(ISD::FEXP, VT, Expand);
    741     setOperationAction(ISD::FEXP2, VT, Expand);
    742     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    743     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    744     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    745     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    746     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
    747     setOperationAction(ISD::TRUNCATE, VT, Expand);
    748     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    749     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    750     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    751     setOperationAction(ISD::VSELECT, VT, Expand);
    752     setOperationAction(ISD::SELECT_CC, VT, Expand);
    753     for (MVT InnerVT : MVT::vector_valuetypes()) {
    754       setTruncStoreAction(InnerVT, VT, Expand);
    755 
    756       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
    757       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
    758 
    759       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
    760       // types, we have to deal with them whether we ask for Expansion or not.
    761       // Setting Expand causes its own optimisation problems though, so leave
    762       // them legal.
    763       if (VT.getVectorElementType() == MVT::i1)
    764         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    765 
    766       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
    767       // split/scalarized right now.
    768       if (VT.getVectorElementType() == MVT::f16)
    769         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    770     }
    771   }
    772 
    773   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
    774   // with -msoft-float, disable use of MMX as well.
    775   if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) {
    776     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    777     // No operations on x86mmx supported, everything uses intrinsics.
    778   }
    779 
    780   // MMX-sized vectors (other than x86mmx) are expected to be expanded
    781   // into smaller operations.
    782   for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
    783     setOperationAction(ISD::MULHS,              MMXTy,      Expand);
    784     setOperationAction(ISD::AND,                MMXTy,      Expand);
    785     setOperationAction(ISD::OR,                 MMXTy,      Expand);
    786     setOperationAction(ISD::XOR,                MMXTy,      Expand);
    787     setOperationAction(ISD::SCALAR_TO_VECTOR,   MMXTy,      Expand);
    788     setOperationAction(ISD::SELECT,             MMXTy,      Expand);
    789     setOperationAction(ISD::BITCAST,            MMXTy,      Expand);
    790   }
    791   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
    792 
    793   if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) {
    794     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
    795 
    796     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    797     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    798     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    799     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    800     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    801     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    802     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    803     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    804     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    805     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    806     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    807     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    808     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    809     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
    810   }
    811 
    812   if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) {
    813     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
    814 
    815     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    816     // registers cannot be used even for integer operations.
    817     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    818     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    819     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    820     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
    821 
    822     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    823     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    824     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    825     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    826     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
    827     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    828     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    829     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
    830     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
    831     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
    832     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
    833     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    834     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    835     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    836     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    837     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    838     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    839     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    840     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    841     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    842     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    843     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    844     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
    845 
    846     setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
    847     setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
    848     setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
    849     setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
    850 
    851     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    852     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    853     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    854     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
    855 
    856     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    857     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    858     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    859     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    860     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    861 
    862     setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
    863     setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
    864     setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
    865     setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
    866 
    867     setOperationAction(ISD::CTTZ,               MVT::v16i8, Custom);
    868     setOperationAction(ISD::CTTZ,               MVT::v8i16, Custom);
    869     setOperationAction(ISD::CTTZ,               MVT::v4i32, Custom);
    870     // ISD::CTTZ v2i64 - scalarization is faster.
    871     setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::v16i8, Custom);
    872     setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::v8i16, Custom);
    873     setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::v4i32, Custom);
    874     // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster.
    875 
    876     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    877     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
    878       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
    879       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
    880       setOperationAction(ISD::VSELECT,            VT, Custom);
    881       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    882     }
    883 
    884     // We support custom legalizing of sext and anyext loads for specific
    885     // memory vector types which we can load as a scalar (or sequence of
    886     // scalars) and extend in-register to a legal 128-bit vector type. For sext
    887     // loads these must work with a single scalar load.
    888     for (MVT VT : MVT::integer_vector_valuetypes()) {
    889       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
    890       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
    891       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
    892       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
    893       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
    894       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
    895       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
    896       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
    897       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    898     }
    899 
    900     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    901     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    902     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    903     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    904     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
    905     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
    906     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    907     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
    908 
    909     if (Subtarget->is64Bit()) {
    910       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
    911       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    912     }
    913 
    914     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    915     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
    916       setOperationAction(ISD::AND,    VT, Promote);
    917       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
    918       setOperationAction(ISD::OR,     VT, Promote);
    919       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
    920       setOperationAction(ISD::XOR,    VT, Promote);
    921       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
    922       setOperationAction(ISD::LOAD,   VT, Promote);
    923       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
    924       setOperationAction(ISD::SELECT, VT, Promote);
    925       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
    926     }
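             // Promoting these to v2i64 just bitcasts the operands, so the bitwise
             // ops, loads and selects all reuse the single set of v2i64 patterns
             // (e.g. PAND/POR/PXOR) instead of needing per-type patterns.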
    927 
    928     // Custom lower v2i64 and v2f64 selects.
    929     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    930     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    931     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    932     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
    933 
    934     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    935     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    936 
    937     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
    938 
    939     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
    940     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
     941     // As there is no 64-bit GPR available, we need to build a special custom
    942     // sequence to convert from v2i32 to v2f32.
    943     if (!Subtarget->is64Bit())
    944       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
    945 
    946     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
    947     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
    948 
    949     for (MVT VT : MVT::fp_vector_valuetypes())
    950       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
    951 
    952     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
    953     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
    954     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
    955   }
    956 
    957   if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) {
    958     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
    959       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
    960       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
    961       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
    962       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
    963       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
    964     }
    965 
    966     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
    967     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
    968     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
    969     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
    970     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
    971     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
    972     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
    973     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
    974 
    975     // FIXME: Do we need to handle scalar-to-vector here?
    976     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
    977 
    978     // We directly match byte blends in the backend as they match the VSELECT
    979     // condition form.
    980     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
    981 
    982     // SSE41 brings specific instructions for doing vector sign extend even in
    983     // cases where we don't have SRA.
    984     for (MVT VT : MVT::integer_vector_valuetypes()) {
    985       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
    986       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
    987       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    988     }
    989 
    990     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    991     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
    992     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
    993     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
    994     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    995     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    996     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    997 
    998     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
    999     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
   1000     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
   1001     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
   1002     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
   1003     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
   1004 
    1005     // i8 and i16 vectors are custom because the source register and
    1006     // source memory operand types are not the same width.  f32 vectors are
   1007     // custom since the immediate controlling the insert encodes additional
   1008     // information.
   1009     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
   1010     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
   1011     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
   1012     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
   1013 
   1014     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
   1015     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
   1016     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
   1017     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
   1018 
   1019     // FIXME: these should be Legal, but that's only for the case where
   1020     // the index is constant.  For now custom expand to deal with that.
   1021     if (Subtarget->is64Bit()) {
   1022       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
   1023       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
   1024     }
   1025   }
   1026 
   1027   if (Subtarget->hasSSE2()) {
   1028     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
   1029     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
   1030     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
   1031 
   1032     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
   1033     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
   1034 
   1035     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
   1036     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
   1037 
   1038     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
   1039     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
   1040 
    1041     // In the custom shift lowering, the cases that are legal with AVX2 will be
    1042     // recognized.
   1043     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
   1044     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
   1045 
   1046     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
   1047     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
   1048 
   1049     setOperationAction(ISD::SRA,               MVT::v2i64, Custom);
   1050     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
   1051   }
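  // For illustration (hypothetical query, not part of the lowering setup):
  // after the block above, getOperationAction(ISD::SRL, MVT::v4i32) reports
  // Custom on any SSE2 target; the custom shift lowering then chooses between
  // a single instruction (e.g. when AVX2 variable shifts are available) and
  // an expanded sequence.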
   1052 
   1053   if (Subtarget->hasXOP()) {
   1054     setOperationAction(ISD::ROTL,              MVT::v16i8, Custom);
   1055     setOperationAction(ISD::ROTL,              MVT::v8i16, Custom);
   1056     setOperationAction(ISD::ROTL,              MVT::v4i32, Custom);
   1057     setOperationAction(ISD::ROTL,              MVT::v2i64, Custom);
   1058     setOperationAction(ISD::ROTL,              MVT::v32i8, Custom);
   1059     setOperationAction(ISD::ROTL,              MVT::v16i16, Custom);
   1060     setOperationAction(ISD::ROTL,              MVT::v8i32, Custom);
   1061     setOperationAction(ISD::ROTL,              MVT::v4i64, Custom);
   1062   }
   1063 
   1064   if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
   1065     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
   1066     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
   1067     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
   1068     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
   1069     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
   1070     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
   1071 
   1072     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
   1073     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
   1074     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
   1075 
   1076     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
   1077     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
   1078     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
   1079     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
   1080     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
   1081     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
   1082     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
   1083     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
   1084     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
   1085     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
   1086     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
   1087     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
   1088 
   1089     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
   1090     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
   1091     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
   1092     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
   1093     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
   1094     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
   1095     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
   1096     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
   1097     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
   1098     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
   1099     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
   1100     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
   1101 
   1102     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
   1103     // even though v8i16 is a legal type.
   1104     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
   1105     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
   1106     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
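    // For illustration (hypothetical query, not part of the lowering setup):
    // getOperationAction(ISD::FP_TO_SINT, MVT::v8i16) now reports Promote
    // while the v8i32 form reports Legal, so a v8f32 -> v8i16 conversion is
    // legalized by converting to a wider integer vector and truncating.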
   1107 
   1108     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
   1109     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
   1110     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
   1111 
   1112     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
   1113     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
   1114 
   1115     for (MVT VT : MVT::fp_vector_valuetypes())
   1116       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
   1117 
   1118     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
   1119     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
   1120 
   1121     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
   1122     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
   1123 
   1124     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
   1125     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
   1126 
   1127     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
   1128     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
   1129     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
   1130     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
   1131 
   1132     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
   1133     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
   1134     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
   1135 
   1136     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
   1137     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
   1138     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
   1139     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
   1140     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
   1141     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
   1142     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
   1143     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
   1144     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
   1145     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
   1146     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
   1147     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
   1148 
   1149     setOperationAction(ISD::CTPOP,             MVT::v32i8, Custom);
   1150     setOperationAction(ISD::CTPOP,             MVT::v16i16, Custom);
   1151     setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
   1152     setOperationAction(ISD::CTPOP,             MVT::v4i64, Custom);
   1153 
   1154     setOperationAction(ISD::CTTZ,              MVT::v32i8, Custom);
   1155     setOperationAction(ISD::CTTZ,              MVT::v16i16, Custom);
   1156     setOperationAction(ISD::CTTZ,              MVT::v8i32, Custom);
   1157     setOperationAction(ISD::CTTZ,              MVT::v4i64, Custom);
   1158     setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v32i8, Custom);
   1159     setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v16i16, Custom);
   1160     setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v8i32, Custom);
   1161     setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v4i64, Custom);
   1162 
   1163     if (Subtarget->hasAnyFMA()) {
   1164       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
   1165       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
   1166       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
   1167       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
   1168       setOperationAction(ISD::FMA,             MVT::f32, Legal);
   1169       setOperationAction(ISD::FMA,             MVT::f64, Legal);
   1170     }
   1171 
   1172     if (Subtarget->hasInt256()) {
   1173       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
   1174       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
   1175       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
   1176       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
   1177 
   1178       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
   1179       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
   1180       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
   1181       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
   1182 
   1183       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1184       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
   1185       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
   1186       setOperationAction(ISD::MUL,             MVT::v32i8, Custom);
   1187 
   1188       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
   1189       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
   1190       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
   1191       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
   1192 
   1193       setOperationAction(ISD::SMAX,            MVT::v32i8,  Legal);
   1194       setOperationAction(ISD::SMAX,            MVT::v16i16, Legal);
   1195       setOperationAction(ISD::SMAX,            MVT::v8i32,  Legal);
   1196       setOperationAction(ISD::UMAX,            MVT::v32i8,  Legal);
   1197       setOperationAction(ISD::UMAX,            MVT::v16i16, Legal);
   1198       setOperationAction(ISD::UMAX,            MVT::v8i32,  Legal);
   1199       setOperationAction(ISD::SMIN,            MVT::v32i8,  Legal);
   1200       setOperationAction(ISD::SMIN,            MVT::v16i16, Legal);
   1201       setOperationAction(ISD::SMIN,            MVT::v8i32,  Legal);
   1202       setOperationAction(ISD::UMIN,            MVT::v32i8,  Legal);
   1203       setOperationAction(ISD::UMIN,            MVT::v16i16, Legal);
   1204       setOperationAction(ISD::UMIN,            MVT::v8i32,  Legal);
   1205 
   1206       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
    1207       // when we have a 256-bit-wide blend with an immediate.
   1208       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
   1209 
   1210       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
   1211       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
   1212       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
   1213       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
   1214       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
   1215       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
   1216       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
   1217 
   1218       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
   1219       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
   1220       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
   1221       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
   1222       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
   1223       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
   1224     } else {
   1225       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
   1226       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
   1227       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
   1228       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
   1229 
   1230       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
   1231       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
   1232       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
   1233       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
   1234 
   1235       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1236       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
   1237       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
   1238       setOperationAction(ISD::MUL,             MVT::v32i8, Custom);
   1239 
   1240       setOperationAction(ISD::SMAX,            MVT::v32i8,  Custom);
   1241       setOperationAction(ISD::SMAX,            MVT::v16i16, Custom);
   1242       setOperationAction(ISD::SMAX,            MVT::v8i32,  Custom);
   1243       setOperationAction(ISD::UMAX,            MVT::v32i8,  Custom);
   1244       setOperationAction(ISD::UMAX,            MVT::v16i16, Custom);
   1245       setOperationAction(ISD::UMAX,            MVT::v8i32,  Custom);
   1246       setOperationAction(ISD::SMIN,            MVT::v32i8,  Custom);
   1247       setOperationAction(ISD::SMIN,            MVT::v16i16, Custom);
   1248       setOperationAction(ISD::SMIN,            MVT::v8i32,  Custom);
   1249       setOperationAction(ISD::UMIN,            MVT::v32i8,  Custom);
   1250       setOperationAction(ISD::UMIN,            MVT::v16i16, Custom);
   1251       setOperationAction(ISD::UMIN,            MVT::v8i32,  Custom);
   1252     }
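    // For illustration (hypothetical query, not part of the lowering setup):
    // on an AVX2 (hasInt256) target the table above makes the wide extending
    // loads directly selectable, e.g.
    //
    //   isLoadExtLegal(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8)  // true
    //
    // whereas without AVX2 the 256-bit integer arithmetic stays Custom and is
    // typically split into two 128-bit halves by the X86 lowering code.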
    1253     // In the custom shift lowering, the cases that are legal with AVX2 will be
    1254     // recognized.
   1255     // recognized.
   1256     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
   1257     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
   1258 
   1259     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
   1260     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
   1261 
   1262     setOperationAction(ISD::SRA,               MVT::v4i64, Custom);
   1263     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
   1264 
   1265     // Custom lower several nodes for 256-bit types.
   1266     for (MVT VT : MVT::vector_valuetypes()) {
   1267       if (VT.getScalarSizeInBits() >= 32) {
   1268         setOperationAction(ISD::MLOAD,  VT, Legal);
   1269         setOperationAction(ISD::MSTORE, VT, Legal);
   1270       }
   1271       // Extract subvector is special because the value type
   1272       // (result) is 128-bit but the source is 256-bit wide.
   1273       if (VT.is128BitVector()) {
   1274         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1275       }
   1276       // Do not attempt to custom lower other non-256-bit vectors
   1277       if (!VT.is256BitVector())
   1278         continue;
   1279 
   1280       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
   1281       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
   1282       setOperationAction(ISD::VSELECT,            VT, Custom);
   1283       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
   1284       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1285       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
   1286       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
   1287       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
   1288     }
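    // For illustration (hypothetical query, not part of the lowering setup):
    // the loop above leaves 128-bit result types Custom for EXTRACT_SUBVECTOR,
    // e.g. getOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32) == Custom,
    // so pulling a 128-bit half out of a 256-bit value is typically lowered to
    // a subregister copy (low half) or a VEXTRACTF128/VEXTRACTI128 (high half).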
   1289 
   1290     if (Subtarget->hasInt256())
   1291       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1292 
   1293     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
   1294     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
   1295       setOperationAction(ISD::AND,    VT, Promote);
   1296       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
   1297       setOperationAction(ISD::OR,     VT, Promote);
   1298       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
   1299       setOperationAction(ISD::XOR,    VT, Promote);
   1300       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
   1301       setOperationAction(ISD::LOAD,   VT, Promote);
   1302       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
   1303       setOperationAction(ISD::SELECT, VT, Promote);
   1304       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
   1305     }
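    // For illustration (hypothetical query, not part of the lowering setup):
    // because of the AddPromotedToType calls above,
    // getTypeToPromoteTo(ISD::AND, MVT::v8i32) returns MVT::v4i64, and the
    // legalizer implements these operations by bitcasting to v4i64, operating
    // there, and bitcasting the result back.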
   1306   }
   1307 
   1308   if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
   1309     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
   1310     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
   1311     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
   1312     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
   1313 
   1314     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
   1315     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
   1316     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
   1317 
   1318     for (MVT VT : MVT::fp_vector_valuetypes())
   1319       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
   1320 
   1321     setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
   1322     setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
   1323     setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
   1324     setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
   1325     setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
   1326     setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
   1327     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64,  MVT::v8i8,  Legal);
   1328     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i8,  Legal);
   1329     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64,  MVT::v8i16,  Legal);
   1330     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i16,  Legal);
   1331     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64,  MVT::v8i32,  Legal);
   1332     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i32,  Legal);
   1333 
   1334     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
   1335     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
   1336     setOperationAction(ISD::SELECT_CC,          MVT::i1,    Expand);
   1337     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
   1338     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
   1339     setOperationAction(ISD::AND,                MVT::i1,    Legal);
   1340     setOperationAction(ISD::SUB,                MVT::i1,    Custom);
   1341     setOperationAction(ISD::ADD,                MVT::i1,    Custom);
   1342     setOperationAction(ISD::MUL,                MVT::i1,    Custom);
   1343     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
   1344     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
   1345     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
   1346     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
   1347     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
   1348 
   1349     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
   1350     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
   1351     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
   1352     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
   1353     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
   1354     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
   1355     setOperationAction(ISD::FABS,               MVT::v16f32, Custom);
   1356 
   1357     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
   1358     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
   1359     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
   1360     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
   1361     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
   1362     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
   1363     setOperationAction(ISD::FABS,               MVT::v8f64, Custom);
   1364     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
   1365     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
   1366 
   1367     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
   1368     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
   1369     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
   1370     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
   1371     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
   1372     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
   1373     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
   1374     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
   1375     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
   1376     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
   1377     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
   1378     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
   1379     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
   1380     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
   1381     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
   1382     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
   1383 
   1384     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
   1385     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
   1386     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
   1387     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
   1388     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
    1389     if (Subtarget->hasVLX()) {
   1390       setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
   1391       setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
   1392       setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
   1393       setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
   1394       setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
   1395 
   1396       setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
   1397       setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
   1398       setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
   1399       setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
   1400       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
   1401     } else {
   1402       setOperationAction(ISD::MLOAD,    MVT::v8i32, Custom);
   1403       setOperationAction(ISD::MLOAD,    MVT::v8f32, Custom);
   1404       setOperationAction(ISD::MSTORE,   MVT::v8i32, Custom);
   1405       setOperationAction(ISD::MSTORE,   MVT::v8f32, Custom);
   1406     }
   1407     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
   1408     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
   1409     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
   1410     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i1,  Custom);
   1411     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v16i1, Custom);
   1412     if (Subtarget->hasDQI()) {
   1413       setOperationAction(ISD::TRUNCATE,         MVT::v2i1, Custom);
   1414       setOperationAction(ISD::TRUNCATE,         MVT::v4i1, Custom);
   1415 
   1416       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i64, Legal);
   1417       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i64, Legal);
   1418       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i64, Legal);
   1419       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i64, Legal);
   1420       if (Subtarget->hasVLX()) {
   1421         setOperationAction(ISD::SINT_TO_FP,    MVT::v4i64, Legal);
   1422         setOperationAction(ISD::SINT_TO_FP,    MVT::v2i64, Legal);
   1423         setOperationAction(ISD::UINT_TO_FP,    MVT::v4i64, Legal);
   1424         setOperationAction(ISD::UINT_TO_FP,    MVT::v2i64, Legal);
   1425         setOperationAction(ISD::FP_TO_SINT,    MVT::v4i64, Legal);
   1426         setOperationAction(ISD::FP_TO_SINT,    MVT::v2i64, Legal);
   1427         setOperationAction(ISD::FP_TO_UINT,    MVT::v4i64, Legal);
   1428         setOperationAction(ISD::FP_TO_UINT,    MVT::v2i64, Legal);
   1429       }
   1430     }
   1431     if (Subtarget->hasVLX()) {
   1432       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
   1433       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
   1434       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
   1435       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
   1436       setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
   1437       setOperationAction(ISD::UINT_TO_FP,       MVT::v4i32, Legal);
   1438       setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
   1439       setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
   1440     }
   1441     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
   1442     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
   1443     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
   1444     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
   1445     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
   1446     setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
   1447     setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
   1448     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
   1449     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
   1450     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
   1451     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
   1452     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
   1453     if (Subtarget->hasDQI()) {
   1454       setOperationAction(ISD::SIGN_EXTEND,        MVT::v4i32, Custom);
   1455       setOperationAction(ISD::SIGN_EXTEND,        MVT::v2i64, Custom);
   1456     }
   1457     setOperationAction(ISD::FFLOOR,             MVT::v16f32, Legal);
   1458     setOperationAction(ISD::FFLOOR,             MVT::v8f64, Legal);
   1459     setOperationAction(ISD::FCEIL,              MVT::v16f32, Legal);
   1460     setOperationAction(ISD::FCEIL,              MVT::v8f64, Legal);
   1461     setOperationAction(ISD::FTRUNC,             MVT::v16f32, Legal);
   1462     setOperationAction(ISD::FTRUNC,             MVT::v8f64, Legal);
   1463     setOperationAction(ISD::FRINT,              MVT::v16f32, Legal);
   1464     setOperationAction(ISD::FRINT,              MVT::v8f64, Legal);
   1465     setOperationAction(ISD::FNEARBYINT,         MVT::v16f32, Legal);
   1466     setOperationAction(ISD::FNEARBYINT,         MVT::v8f64, Legal);
   1467 
   1468     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
   1469     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
   1470     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
   1471     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
   1472     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1,   Custom);
   1473 
   1474     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
   1475     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
   1476 
   1477     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
   1478 
   1479     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
   1480     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
   1481     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
   1482     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
   1483     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
   1484     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
   1485     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
   1486     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
   1487     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
   1488     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
   1489     setOperationAction(ISD::SELECT,             MVT::v16i1, Custom);
   1490     setOperationAction(ISD::SELECT,             MVT::v8i1,  Custom);
   1491 
   1492     setOperationAction(ISD::SMAX,               MVT::v16i32, Legal);
   1493     setOperationAction(ISD::SMAX,               MVT::v8i64, Legal);
   1494     setOperationAction(ISD::UMAX,               MVT::v16i32, Legal);
   1495     setOperationAction(ISD::UMAX,               MVT::v8i64, Legal);
   1496     setOperationAction(ISD::SMIN,               MVT::v16i32, Legal);
   1497     setOperationAction(ISD::SMIN,               MVT::v8i64, Legal);
   1498     setOperationAction(ISD::UMIN,               MVT::v16i32, Legal);
   1499     setOperationAction(ISD::UMIN,               MVT::v8i64, Legal);
   1500 
   1501     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
   1502     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
   1503 
   1504     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
   1505     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
   1506 
   1507     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
   1508 
   1509     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
   1510     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
   1511 
   1512     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
   1513     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
   1514 
   1515     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
   1516     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
   1517 
   1518     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
   1519     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
   1520     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
   1521     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
   1522     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
   1523     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
   1524 
   1525     if (Subtarget->hasCDI()) {
   1526       setOperationAction(ISD::CTLZ,             MVT::v8i64,  Legal);
   1527       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
   1528       setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v8i64,  Legal);
   1529       setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v16i32, Legal);
   1530 
   1531       setOperationAction(ISD::CTLZ,             MVT::v8i16,  Custom);
   1532       setOperationAction(ISD::CTLZ,             MVT::v16i8,  Custom);
   1533       setOperationAction(ISD::CTLZ,             MVT::v16i16, Custom);
   1534       setOperationAction(ISD::CTLZ,             MVT::v32i8,  Custom);
   1535       setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v8i16,  Custom);
   1536       setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v16i8,  Custom);
   1537       setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v16i16, Custom);
   1538       setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v32i8,  Custom);
   1539 
   1540       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i64,  Custom);
   1541       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v16i32, Custom);
   1542 
   1543       if (Subtarget->hasVLX()) {
   1544         setOperationAction(ISD::CTLZ,             MVT::v4i64, Legal);
   1545         setOperationAction(ISD::CTLZ,             MVT::v8i32, Legal);
   1546         setOperationAction(ISD::CTLZ,             MVT::v2i64, Legal);
   1547         setOperationAction(ISD::CTLZ,             MVT::v4i32, Legal);
   1548         setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v4i64, Legal);
   1549         setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v8i32, Legal);
   1550         setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v2i64, Legal);
   1551         setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v4i32, Legal);
   1552 
   1553         setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i64, Custom);
   1554         setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i32, Custom);
   1555         setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v2i64, Custom);
   1556         setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i32, Custom);
   1557       } else {
   1558         setOperationAction(ISD::CTLZ,             MVT::v4i64, Custom);
   1559         setOperationAction(ISD::CTLZ,             MVT::v8i32, Custom);
   1560         setOperationAction(ISD::CTLZ,             MVT::v2i64, Custom);
   1561         setOperationAction(ISD::CTLZ,             MVT::v4i32, Custom);
   1562         setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v4i64, Custom);
   1563         setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v8i32, Custom);
   1564         setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v2i64, Custom);
   1565         setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v4i32, Custom);
   1566       }
   1567     } // Subtarget->hasCDI()
   1568 
   1569     if (Subtarget->hasDQI()) {
   1570       setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
   1571       setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
   1572       setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
   1573     }
   1574     // Custom lower several nodes.
   1575     for (MVT VT : MVT::vector_valuetypes()) {
   1576       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   1577       if (EltSize == 1) {
   1578         setOperationAction(ISD::AND, VT, Legal);
   1579         setOperationAction(ISD::OR,  VT, Legal);
   1580         setOperationAction(ISD::XOR,  VT, Legal);
   1581       }
   1582       if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
   1583         setOperationAction(ISD::MGATHER,  VT, Custom);
   1584         setOperationAction(ISD::MSCATTER, VT, Custom);
   1585       }
   1586       // Extract subvector is special because the value type
   1587       // (result) is 256/128-bit but the source is 512-bit wide.
   1588       if (VT.is128BitVector() || VT.is256BitVector()) {
   1589         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1590       }
   1591       if (VT.getVectorElementType() == MVT::i1)
   1592         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
   1593 
   1594       // Do not attempt to custom lower other non-512-bit vectors
   1595       if (!VT.is512BitVector())
   1596         continue;
   1597 
   1598       if (EltSize >= 32) {
   1599         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
   1600         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
   1601         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
   1602         setOperationAction(ISD::VSELECT,             VT, Legal);
   1603         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
   1604         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
   1605         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
   1606         setOperationAction(ISD::MLOAD,               VT, Legal);
   1607         setOperationAction(ISD::MSTORE,              VT, Legal);
   1608         setOperationAction(ISD::MGATHER,  VT, Legal);
   1609         setOperationAction(ISD::MSCATTER, VT, Custom);
   1610       }
   1611     }
   1612     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
   1613       setOperationAction(ISD::SELECT, VT, Promote);
   1614       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
   1615     }
    1616   } // has AVX-512
   1617 
   1618   if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) {
   1619     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
   1620     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
   1621 
   1622     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
   1623     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
   1624 
   1625     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
   1626     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
   1627     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
   1628     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
   1629     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
   1630     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
   1631     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
   1632     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
   1633     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
   1634     setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
   1635     setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
   1636     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
   1637     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
   1638     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
   1639     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
   1640     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
   1641     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
   1642     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Custom);
   1643     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Custom);
   1644     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
   1645     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
   1646     setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
   1647     setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
   1648     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
   1649     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
   1650     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
   1651     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
   1652     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
   1653     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
   1654     setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
   1655     setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
   1656     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
   1657     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
   1658     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
   1659     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
   1660     setOperationAction(ISD::VSELECT,            MVT::v32i16, Legal);
   1661     setOperationAction(ISD::VSELECT,            MVT::v64i8, Legal);
   1662     setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
   1663     setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
   1664     setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
   1665     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
   1666     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
   1667 
   1668     setOperationAction(ISD::SMAX,               MVT::v64i8, Legal);
   1669     setOperationAction(ISD::SMAX,               MVT::v32i16, Legal);
   1670     setOperationAction(ISD::UMAX,               MVT::v64i8, Legal);
   1671     setOperationAction(ISD::UMAX,               MVT::v32i16, Legal);
   1672     setOperationAction(ISD::SMIN,               MVT::v64i8, Legal);
   1673     setOperationAction(ISD::SMIN,               MVT::v32i16, Legal);
   1674     setOperationAction(ISD::UMIN,               MVT::v64i8, Legal);
   1675     setOperationAction(ISD::UMIN,               MVT::v32i16, Legal);
   1676 
   1677     setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
   1678     setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
   1679     if (Subtarget->hasVLX())
   1680       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
   1681 
   1682     if (Subtarget->hasCDI()) {
   1683       setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
   1684       setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
   1685       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Custom);
   1686       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8,  Custom);
   1687     }
   1688 
   1689     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
   1690       setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
   1691       setOperationAction(ISD::VSELECT,             VT, Legal);
   1692 
   1693       setOperationAction(ISD::AND,    VT, Promote);
   1694       AddPromotedToType (ISD::AND,    VT, MVT::v8i64);
   1695       setOperationAction(ISD::OR,     VT, Promote);
   1696       AddPromotedToType (ISD::OR,     VT, MVT::v8i64);
   1697       setOperationAction(ISD::XOR,    VT, Promote);
   1698       AddPromotedToType (ISD::XOR,    VT, MVT::v8i64);
   1699     }
   1700   }
   1701 
   1702   if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) {
   1703     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
   1704     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
   1705 
   1706     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
   1707     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
   1708     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
   1709     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
   1710     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
   1711     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
   1712     setOperationAction(ISD::SELECT,             MVT::v4i1, Custom);
   1713     setOperationAction(ISD::SELECT,             MVT::v2i1, Custom);
   1714     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i1, Custom);
   1715     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i1, Custom);
   1716     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i1, Custom);
   1717     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i1, Custom);
   1718 
   1719     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
   1720     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
   1721     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
   1722     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
   1723     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
   1724     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
   1725     setOperationAction(ISD::SRA,                MVT::v2i64, Custom);
   1726     setOperationAction(ISD::SRA,                MVT::v4i64, Custom);
   1727 
   1728     setOperationAction(ISD::SMAX,               MVT::v2i64, Legal);
   1729     setOperationAction(ISD::SMAX,               MVT::v4i64, Legal);
   1730     setOperationAction(ISD::UMAX,               MVT::v2i64, Legal);
   1731     setOperationAction(ISD::UMAX,               MVT::v4i64, Legal);
   1732     setOperationAction(ISD::SMIN,               MVT::v2i64, Legal);
   1733     setOperationAction(ISD::SMIN,               MVT::v4i64, Legal);
   1734     setOperationAction(ISD::UMIN,               MVT::v2i64, Legal);
   1735     setOperationAction(ISD::UMIN,               MVT::v4i64, Legal);
   1736   }
   1737 
   1738   // We want to custom lower some of our intrinsics.
   1739   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1740   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   1741   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   1742   if (!Subtarget->is64Bit()) {
   1743     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
   1744     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   1745   }
   1746 
   1747   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1748   // handle type legalization for these operations here.
   1749   //
   1750   // FIXME: We really should do custom legalization for addition and
   1751   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1752   // than generic legalization for 64-bit multiplication-with-overflow, though.
   1753   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
   1754     if (VT == MVT::i64 && !Subtarget->is64Bit())
   1755       continue;
   1756     // Add/Sub/Mul with overflow operations are custom lowered.
   1757     setOperationAction(ISD::SADDO, VT, Custom);
   1758     setOperationAction(ISD::UADDO, VT, Custom);
   1759     setOperationAction(ISD::SSUBO, VT, Custom);
   1760     setOperationAction(ISD::USUBO, VT, Custom);
   1761     setOperationAction(ISD::SMULO, VT, Custom);
   1762     setOperationAction(ISD::UMULO, VT, Custom);
   1763   }
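  // For illustration (hypothetical query, not part of the lowering setup):
  // getOperationAction(ISD::SADDO, MVT::i32) now reports Custom, so a call to
  // @llvm.sadd.with.overflow.i32 becomes an ISD::SADDO node whose custom
  // lowering emits a flag-producing ADD plus a SETCC of the overflow bit.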
   1764 
   1765   if (!Subtarget->is64Bit()) {
    1766     // These libcalls are not available in 32-bit mode.
   1767     setLibcallName(RTLIB::SHL_I128, nullptr);
   1768     setLibcallName(RTLIB::SRL_I128, nullptr);
   1769     setLibcallName(RTLIB::SRA_I128, nullptr);
   1770   }
   1771 
   1772   // Combine sin / cos into one node or libcall if possible.
   1773   if (Subtarget->hasSinCos()) {
   1774     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
   1775     setLibcallName(RTLIB::SINCOS_F64, "sincos");
   1776     if (Subtarget->isTargetDarwin()) {
   1777       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
   1778       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
   1779       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
   1780       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   1781     }
   1782   }
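  // For illustration (hypothetical query, not part of the lowering setup): on
  // a Darwin target with hasSinCos(),
  // isOperationLegalOrCustom(ISD::FSINCOS, MVT::f64) is true, and the custom
  // lowering emits a single __sincos_stret call that returns both results
  // instead of separate sin and cos libcalls.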
   1783 
   1784   if (Subtarget->isTargetWin64()) {
   1785     setOperationAction(ISD::SDIV, MVT::i128, Custom);
   1786     setOperationAction(ISD::UDIV, MVT::i128, Custom);
   1787     setOperationAction(ISD::SREM, MVT::i128, Custom);
   1788     setOperationAction(ISD::UREM, MVT::i128, Custom);
   1789     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
   1790     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
   1791   }
   1792 
   1793   // We have target-specific dag combine patterns for the following nodes:
   1794   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1795   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1796   setTargetDAGCombine(ISD::BITCAST);
   1797   setTargetDAGCombine(ISD::VSELECT);
   1798   setTargetDAGCombine(ISD::SELECT);
   1799   setTargetDAGCombine(ISD::SHL);
   1800   setTargetDAGCombine(ISD::SRA);
   1801   setTargetDAGCombine(ISD::SRL);
   1802   setTargetDAGCombine(ISD::OR);
   1803   setTargetDAGCombine(ISD::AND);
   1804   setTargetDAGCombine(ISD::ADD);
   1805   setTargetDAGCombine(ISD::FADD);
   1806   setTargetDAGCombine(ISD::FSUB);
   1807   setTargetDAGCombine(ISD::FNEG);
   1808   setTargetDAGCombine(ISD::FMA);
   1809   setTargetDAGCombine(ISD::FMAXNUM);
   1810   setTargetDAGCombine(ISD::SUB);
   1811   setTargetDAGCombine(ISD::LOAD);
   1812   setTargetDAGCombine(ISD::MLOAD);
   1813   setTargetDAGCombine(ISD::STORE);
   1814   setTargetDAGCombine(ISD::MSTORE);
   1815   setTargetDAGCombine(ISD::TRUNCATE);
   1816   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1817   setTargetDAGCombine(ISD::ANY_EXTEND);
   1818   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1819   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   1820   setTargetDAGCombine(ISD::SINT_TO_FP);
   1821   setTargetDAGCombine(ISD::UINT_TO_FP);
   1822   setTargetDAGCombine(ISD::SETCC);
   1823   setTargetDAGCombine(ISD::BUILD_VECTOR);
   1824   setTargetDAGCombine(ISD::MUL);
   1825   setTargetDAGCombine(ISD::XOR);
   1826   setTargetDAGCombine(ISD::MSCATTER);
   1827   setTargetDAGCombine(ISD::MGATHER);
   1828 
   1829   computeRegisterProperties(Subtarget->getRegisterInfo());
   1830 
   1831   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1832   MaxStoresPerMemsetOptSize = 8;
   1833   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1834   MaxStoresPerMemcpyOptSize = 4;
   1835   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1836   MaxStoresPerMemmoveOptSize = 4;
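  // As a rough illustration of the limits above: with SSE a zeroing memset is
  // expanded with 16-byte vector stores, so up to about 16 * 16 = 256 bytes
  // (8 * 16 = 128 bytes when optimizing for size) can be expanded inline
  // before falling back to a library call.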
   1837   setPrefLoopAlignment(4); // 2^4 bytes.
   1838 
   1839   // A predictable cmov does not hurt on an in-order CPU.
   1840   // FIXME: Use a CPU attribute to trigger this, not a CPU model.
   1841   PredictableSelectIsExpensive = !Subtarget->isAtom();
   1842   EnableExtLdPromotion = true;
   1843   setPrefFunctionAlignment(4); // 2^4 bytes.
   1844 
   1845   verifyIntrinsicTables();
   1846 }
   1847 
   1848 // This has so far only been implemented for 64-bit MachO.
   1849 bool X86TargetLowering::useLoadStackGuardNode() const {
   1850   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
   1851 }
   1852 
   1853 TargetLoweringBase::LegalizeTypeAction
   1854 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
   1855   if (ExperimentalVectorWideningLegalization &&
   1856       VT.getVectorNumElements() != 1 &&
   1857       VT.getVectorElementType().getSimpleVT() != MVT::i1)
   1858     return TypeWidenVector;
   1859 
   1860   return TargetLoweringBase::getPreferredVectorAction(VT);
   1861 }
   1862 
   1863 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
   1864                                           EVT VT) const {
   1865   if (!VT.isVector())
   1866     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
   1867 
   1868   if (VT.isSimple()) {
   1869     MVT VVT = VT.getSimpleVT();
   1870     const unsigned NumElts = VVT.getVectorNumElements();
   1871     const MVT EltVT = VVT.getVectorElementType();
   1872     if (VVT.is512BitVector()) {
   1873       if (Subtarget->hasAVX512())
   1874         if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
   1875             EltVT == MVT::f32 || EltVT == MVT::f64)
   1876           switch(NumElts) {
   1877           case  8: return MVT::v8i1;
   1878           case 16: return MVT::v16i1;
   1879         }
   1880       if (Subtarget->hasBWI())
   1881         if (EltVT == MVT::i8 || EltVT == MVT::i16)
   1882           switch(NumElts) {
   1883           case 32: return MVT::v32i1;
   1884           case 64: return MVT::v64i1;
   1885         }
   1886     }
   1887 
   1888     if (VVT.is256BitVector() || VVT.is128BitVector()) {
   1889       if (Subtarget->hasVLX())
   1890         if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
   1891             EltVT == MVT::f32 || EltVT == MVT::f64)
   1892           switch(NumElts) {
   1893           case 2: return MVT::v2i1;
   1894           case 4: return MVT::v4i1;
   1895           case 8: return MVT::v8i1;
   1896         }
   1897       if (Subtarget->hasBWI() && Subtarget->hasVLX())
   1898         if (EltVT == MVT::i8 || EltVT == MVT::i16)
   1899           switch(NumElts) {
   1900           case  8: return MVT::v8i1;
   1901           case 16: return MVT::v16i1;
   1902           case 32: return MVT::v32i1;
   1903         }
   1904     }
   1905   }
   1906 
   1907   return VT.changeVectorElementTypeToInteger();
   1908 }
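// For illustration (hypothetical queries, not from the original source; Ctx
// and DL stand for an LLVMContext and a DataLayout), getSetCCResultType above
// gives:
//
//   getSetCCResultType(DL, Ctx, MVT::v8f64) -> v8i1 with AVX-512
//   getSetCCResultType(DL, Ctx, MVT::v4i32) -> v4i1 with VLX, v4i32 otherwise
//   getSetCCResultType(DL, Ctx, MVT::i32)   -> i1 with AVX-512, i8 otherwise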
   1909 
   1910 /// Helper for getByValTypeAlignment to determine
   1911 /// the desired ByVal argument alignment.
   1912 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1913   if (MaxAlign == 16)
   1914     return;
   1915   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1916     if (VTy->getBitWidth() == 128)
   1917       MaxAlign = 16;
   1918   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1919     unsigned EltAlign = 0;
   1920     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1921     if (EltAlign > MaxAlign)
   1922       MaxAlign = EltAlign;
   1923   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1924     for (auto *EltTy : STy->elements()) {
   1925       unsigned EltAlign = 0;
   1926       getMaxByValAlign(EltTy, EltAlign);
   1927       if (EltAlign > MaxAlign)
   1928         MaxAlign = EltAlign;
   1929       if (MaxAlign == 16)
   1930         break;
   1931     }
   1932   }
   1933 }
   1934 
   1935 /// Return the desired alignment for ByVal aggregate
   1936 /// function arguments in the caller parameter area. For X86, aggregates
   1937 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1938 /// are at 4-byte boundaries.
   1939 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
   1940                                                   const DataLayout &DL) const {
   1941   if (Subtarget->is64Bit()) {
   1942     // Max of 8 and alignment of type.
   1943     unsigned TyAlign = DL.getABITypeAlignment(Ty);
   1944     if (TyAlign > 8)
   1945       return TyAlign;
   1946     return 8;
   1947   }
   1948 
   1949   unsigned Align = 4;
   1950   if (Subtarget->hasSSE1())
   1951     getMaxByValAlign(Ty, Align);
   1952   return Align;
   1953 }
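// For illustration (hypothetical example, not from the original source): a
// byval struct such as { <4 x float>, i32 } on 32-bit x86 with SSE contains a
// 128-bit vector, so getByValTypeAlignment returns 16; a plain { i32, i32 }
// struct stays at 4. On x86-64 the result is simply max(8, ABI alignment).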
   1954 
    1955 /// Returns the target-specific optimal type for the load and store
    1956 /// operations that result from memset, memcpy, and memmove lowering.
    1957 /// If DstAlign is zero, the destination can be assumed to satisfy any
    1958 /// alignment constraint. Similarly, if SrcAlign is zero there is no need to
    1959 /// check it against an alignment requirement, probably because the source
    1960 /// does not need to be loaded. If 'IsMemset' is true, a memset is being
    1961 /// expanded. If 'ZeroMemset' is true, it is a memset of zero. 'MemcpyStrSrc'
    1962 /// indicates that the memcpy source is constant and therefore does not need
    1963 /// to be loaded.
    1964 /// It returns EVT::Other if the type should be determined using generic
    1965 /// target-independent logic.
   1966 EVT
   1967 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1968                                        unsigned DstAlign, unsigned SrcAlign,
   1969                                        bool IsMemset, bool ZeroMemset,
   1970                                        bool MemcpyStrSrc,
   1971                                        MachineFunction &MF) const {
   1972   const Function *F = MF.getFunction();
   1973   if ((!IsMemset || ZeroMemset) &&
   1974       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
   1975     if (Size >= 16 &&
   1976         (!Subtarget->isUnalignedMem16Slow() ||
   1977          ((DstAlign == 0 || DstAlign >= 16) &&
   1978           (SrcAlign == 0 || SrcAlign >= 16)))) {
   1979       if (Size >= 32) {
   1980         // FIXME: Check if unaligned 32-byte accesses are slow.
   1981         if (Subtarget->hasInt256())
   1982           return MVT::v8i32;
   1983         if (Subtarget->hasFp256())
   1984           return MVT::v8f32;
   1985       }
   1986       if (Subtarget->hasSSE2())
   1987         return MVT::v4i32;
   1988       if (Subtarget->hasSSE1())
   1989         return MVT::v4f32;
   1990     } else if (!MemcpyStrSrc && Size >= 8 &&
   1991                !Subtarget->is64Bit() &&
   1992                Subtarget->hasSSE2()) {
    1993       // Do not use f64 to lower memcpy if the source is a string constant.
    1994       // It's better to use i32 to avoid the loads.
   1995       return MVT::f64;
   1996     }
   1997   }
   1998   // This is a compromise. If we reach here, unaligned accesses may be slow on
   1999   // this target. However, creating smaller, aligned accesses could be even
   2000   // slower and would certainly be a lot more code.
   2001   if (Subtarget->is64Bit() && Size >= 8)
   2002     return MVT::i64;
   2003   return MVT::i32;
   2004 }
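// For illustration (hypothetical examples, not from the original source): a
// 32-byte memcpy on an AVX2 target with fast (or sufficiently aligned)
// 16-byte accesses gets MVT::v8i32 from getOptimalMemOpType above; with only
// SSE2 it gets MVT::v4i32; and in a function marked noimplicitfloat the vector
// paths are skipped, leaving MVT::i64 on x86-64 (or MVT::i32 on 32-bit
// targets).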
   2005 
   2006 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   2007   if (VT == MVT::f32)
   2008     return X86ScalarSSEf32;
   2009   else if (VT == MVT::f64)
   2010     return X86ScalarSSEf64;
   2011   return true;
   2012 }
   2013 
   2014 bool
   2015 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   2016                                                   unsigned,
   2017                                                   unsigned,
   2018                                                   bool *Fast) const {
   2019   if (Fast) {
   2020     switch (VT.getSizeInBits()) {
   2021     default:
   2022       // 8-byte and under are always assumed to be fast.
   2023       *Fast = true;
   2024       break;
   2025     case 128:
   2026       *Fast = !Subtarget->isUnalignedMem16Slow();
   2027       break;
   2028     case 256:
   2029       *Fast = !Subtarget->isUnalignedMem32Slow();
   2030       break;
   2031     // TODO: What about AVX-512 (512-bit) accesses?
   2032     }
   2033   }
   2034   // Misaligned accesses of any size are always allowed.
   2035   return true;
   2036 }
   2037 
   2038 /// Return the entry encoding for a jump table in the
   2039 /// current function.  The returned value is a member of the
   2040 /// MachineJumpTableInfo::JTEntryKind enum.
   2041 unsigned X86TargetLowering::getJumpTableEncoding() const {
   2042   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   2043   // symbol.
   2044   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   2045       Subtarget->isPICStyleGOT())
   2046     return MachineJumpTableInfo::EK_Custom32;
   2047 
   2048   // Otherwise, use the normal jump table encoding heuristics.
   2049   return TargetLowering::getJumpTableEncoding();
   2050 }
   2051 
   2052 bool X86TargetLowering::useSoftFloat() const {
   2053   return Subtarget->useSoftFloat();
   2054 }
   2055 
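        /// Emit a custom (EK_Custom32) jump table entry for GOT-style PIC: a
        /// @GOTOFF reference to the basic block's symbol.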
   2056 const MCExpr *
   2057 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   2058                                              const MachineBasicBlock *MBB,
   2059                                              unsigned uid,MCContext &Ctx) const{
   2060   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
   2061          Subtarget->isPICStyleGOT());
   2062   // On 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   2063   // references to the basic block's symbol.
   2064   return MCSymbolRefExpr::create(MBB->getSymbol(),
   2065                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   2066 }
   2067 
   2068 /// Returns relocation base for the given PIC jumptable.
   2069 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   2070                                                     SelectionDAG &DAG) const {
   2071   if (!Subtarget->is64Bit())
   2072     // This doesn't have an SDLoc associated with it, but it is not really
   2073     // the same as a Register.
   2074     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
   2075                        getPointerTy(DAG.getDataLayout()));
   2076   return Table;
   2077 }
   2078 
   2079 /// This returns the relocation base for the given PIC jumptable,
   2080 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
   2081 const MCExpr *X86TargetLowering::
   2082 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   2083                              MCContext &Ctx) const {
   2084   // X86-64 uses RIP relative addressing based on the jump table label.
   2085   if (Subtarget->isPICStyleRIPRel())
   2086     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   2087 
   2088   // Otherwise, the reference is relative to the PIC base.
   2089   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
   2090 }
   2091 
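        /// Return a representative register class (and cost) for the given type:
        /// GR32/GR64 for integers, VR64 for MMX, and VR128 for scalar FP and the
        /// 128-bit and 256-bit vector types; everything else is delegated to the
        /// generic implementation.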
   2092 std::pair<const TargetRegisterClass *, uint8_t>
   2093 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
   2094                                            MVT VT) const {
   2095   const TargetRegisterClass *RRC = nullptr;
   2096   uint8_t Cost = 1;
   2097   switch (VT.SimpleTy) {
   2098   default:
   2099     return TargetLowering::findRepresentativeClass(TRI, VT);
   2100   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   2101     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
   2102     break;
   2103   case MVT::x86mmx:
   2104     RRC = &X86::VR64RegClass;
   2105     break;
   2106   case MVT::f32: case MVT::f64:
   2107   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   2108   case MVT::v4f32: case MVT::v2f64:
   2109   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
   2110   case MVT::v4f64:
   2111     RRC = &X86::VR128RegClass;
   2112     break;
   2113   }
   2114   return std::make_pair(RRC, Cost);
   2115 }
   2116 
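        /// Return, via AddressSpace and Offset, the fixed TLS location of the stack
        /// protector cookie on Linux targets: %fs:0x28 on x86-64 (%gs:0x28 for the
        /// kernel code model) and %gs:0x14 on i386.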
   2117 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   2118                                                unsigned &Offset) const {
   2119   if (!Subtarget->isTargetLinux())
   2120     return false;
   2121 
   2122   if (Subtarget->is64Bit()) {
   2123     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
   2124     Offset = 0x28;
   2125     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
   2126       AddressSpace = 256;
   2127     else
   2128       AddressSpace = 257;
   2129   } else {
   2130     // %gs:0x14 on i386
   2131     Offset = 0x14;
   2132     AddressSpace = 256;
   2133   }
   2134   return true;
   2135 }
   2136 
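        /// Return the address of the thread-local slot holding the SafeStack
        /// pointer. Android provides a fixed TLS slot for it; all other targets use
        /// the default lowering.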
   2137 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   2138   if (!Subtarget->isTargetAndroid())
   2139     return TargetLowering::getSafeStackPointerLocation(IRB);
   2140 
   2141   // Android provides a fixed TLS slot for the SafeStack pointer. See the
   2142   // definition of TLS_SLOT_SAFESTACK in
   2143   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
   2144   unsigned AddressSpace, Offset;
   2145   if (Subtarget->is64Bit()) {
   2146     // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:0x48.
   2147     Offset = 0x48;
   2148     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
   2149       AddressSpace = 256;
   2150     else
   2151       AddressSpace = 257;
   2152   } else {
   2153     // %gs:0x24 on i386
   2154     Offset = 0x24;
   2155     AddressSpace = 256;
   2156   }
   2157 
   2158   return ConstantExpr::getIntToPtr(
   2159       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
   2160       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
   2161 }
   2162 
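        /// An address space cast is a no-op only if neither side refers to one of
        /// the x86 segment-register address spaces (256 and above).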
   2163 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
   2164                                             unsigned DestAS) const {
   2165   assert(SrcAS != DestAS && "Expected different address spaces!");
   2166 
   2167   return SrcAS < 256 && DestAS < 256;
   2168 }
   2169 
   2170 //===----------------------------------------------------------------------===//
   2171 //               Return Value Calling Convention Implementation
   2172 //===----------------------------------------------------------------------===//
   2173 
   2174 #include "X86GenCallingConv.inc"
   2175 
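        /// Check whether the given set of return values can be lowered under this
        /// calling convention by running them through RetCC_X86 without emitting
        /// any code.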
   2176 bool X86TargetLowering::CanLowerReturn(
   2177     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
   2178     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
   2179   SmallVector<CCValAssign, 16> RVLocs;
   2180   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   2181   return CCInfo.CheckReturn(Outs, RetCC_X86);
   2182 }
   2183 
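        /// Return a zero-terminated list of registers that can safely be used as
        /// scratch registers; only R11 is listed here.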
   2184 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
   2185   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
   2186   return ScratchRegs;
   2187 }
   2188 
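        /// Lower an outgoing return: analyze the return values with RetCC_X86, copy
        /// them into their assigned return registers (with special handling for the
        /// FP stack and MMX values), copy any saved sret pointer into RAX/EAX, and
        /// emit an X86ISD::RET_FLAG (or IRET for x86 interrupt handlers).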
   2189 SDValue
   2190 X86TargetLowering::LowerReturn(SDValue Chain,
   2191                                CallingConv::ID CallConv, bool isVarArg,
   2192                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   2193                                const SmallVectorImpl<SDValue> &OutVals,
   2194                                SDLoc dl, SelectionDAG &DAG) const {
   2195   MachineFunction &MF = DAG.getMachineFunction();
   2196   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2197 
   2198   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
   2199     report_fatal_error("X86 interrupts may not return any value");
   2200 
   2201   SmallVector<CCValAssign, 16> RVLocs;
   2202   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
   2203   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   2204 
   2205   SDValue Flag;
   2206   SmallVector<SDValue, 6> RetOps;
   2207   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   2208   // Operand #1 = Bytes To Pop
   2209   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
   2210                    MVT::i16));
   2211 
   2212   // Copy the result values into the output registers.
   2213   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   2214     CCValAssign &VA = RVLocs[i];
   2215     assert(VA.isRegLoc() && "Can only return in registers!");
   2216     SDValue ValToCopy = OutVals[i];
   2217     EVT ValVT = ValToCopy.getValueType();
   2218 
   2219     // Promote values to the appropriate types.
   2220     if (VA.getLocInfo() == CCValAssign::SExt)
   2221       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2222     else if (VA.getLocInfo() == CCValAssign::ZExt)
   2223       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2224     else if (VA.getLocInfo() == CCValAssign::AExt) {
   2225       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
   2226         ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2227       else
   2228         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2229     }
   2230     else if (VA.getLocInfo() == CCValAssign::BCvt)
   2231       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
   2232 
   2233     assert(VA.getLocInfo() != CCValAssign::FPExt &&
   2234            "Unexpected FP-extend for return value.");
   2235 
   2236     // If this is x86-64, and we disabled SSE, we can't return FP values,
   2237     // or SSE or MMX vectors.
   2238     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   2239          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   2240           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
   2241       report_fatal_error("SSE register return with SSE disabled");
   2242     }
   2243     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   2244     // llvm-gcc has never done it right and no one has noticed, so this
   2245     // should be OK for now.
   2246     if (ValVT == MVT::f64 &&
   2247         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
   2248       report_fatal_error("SSE2 register return with SSE2 disabled");
   2249 
   2250     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   2251     // the RET instruction and handled by the FP Stackifier.
   2252     if (VA.getLocReg() == X86::FP0 ||
   2253         VA.getLocReg() == X86::FP1) {
   2254       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   2255       // change the value to the FP stack register class.
   2256       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   2257         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   2258       RetOps.push_back(ValToCopy);
   2259       // Don't emit a copytoreg.
   2260       continue;
   2261     }
   2262 
   2263     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   2264     // which is returned in RAX / RDX.
   2265     if (Subtarget->is64Bit()) {
   2266       if (ValVT == MVT::x86mmx) {
   2267         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   2268           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
   2269           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   2270                                   ValToCopy);
   2271           // If we don't have SSE2 available, convert to v4f32 so the generated
   2272           // register is legal.
   2273           if (!Subtarget->hasSSE2())
   2274             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
   2275         }
   2276       }
   2277     }
   2278 
   2279     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
   2280     Flag = Chain.getValue(1);
   2281     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   2282   }
   2283 
   2284   // All x86 ABIs require that for returning structs by value we copy
   2285   // the sret argument into %rax/%eax (depending on ABI) for the return.
   2286   // We saved the argument into a virtual register in the entry block,
   2287   // so now we copy the value out and into %rax/%eax.
   2288   //
   2289   // Checking Function.hasStructRetAttr() here is insufficient because the IR
   2290   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
   2291   // false, then an sret argument may be implicitly inserted in the SelDAG. In
   2292   // either case FuncInfo->setSRetReturnReg() will have been called.
   2293   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
   2294     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg,
   2295                                      getPointerTy(MF.getDataLayout()));
   2296 
   2297     unsigned RetValReg
   2298         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
   2299           X86::RAX : X86::EAX;
   2300     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
   2301     Flag = Chain.getValue(1);
   2302 
   2303     // RAX/EAX now acts like a return value.
   2304     RetOps.push_back(
   2305         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
   2306   }
   2307 
   2308   RetOps[0] = Chain;  // Update chain.
   2309 
   2310   // Add the flag if we have it.
   2311   if (Flag.getNode())
   2312     RetOps.push_back(Flag);
   2313 
   2314   X86ISD::NodeType opcode = X86ISD::RET_FLAG;
   2315   if (CallConv == CallingConv::X86_INTR)
   2316     opcode = X86ISD::IRET;
   2317   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
   2318 }
   2319 
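        /// Return true if the value produced by N is used only by a return
        /// (possibly through a single CopyToReg or FP_EXTEND), so a call producing
        /// it may be turned into a tail call; on success, Chain is updated to the
        /// chain that the tail call should use.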
   2320 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   2321   if (N->getNumValues() != 1)
   2322     return false;
   2323   if (!N->hasNUsesOfValue(1, 0))
   2324     return false;
   2325 
   2326   SDValue TCChain = Chain;
   2327   SDNode *Copy = *N->use_begin();
   2328   if (Copy->getOpcode() == ISD::CopyToReg) {
   2329     // If the copy has a glue operand, we conservatively assume it isn't safe to
   2330     // perform a tail call.
   2331     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   2332       return false;
   2333     TCChain = Copy->getOperand(0);
   2334   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   2335     return false;
   2336 
   2337   bool HasRet = false;
   2338   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   2339        UI != UE; ++UI) {
   2340     if (UI->getOpcode() != X86ISD::RET_FLAG)
   2341       return false;
   2342     // If we are returning more than one value, we can definitely
   2343     // not make a tail call; see PR19530.
   2344     if (UI->getNumOperands() > 4)
   2345       return false;
   2346     if (UI->getNumOperands() == 4 &&
   2347         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
   2348       return false;
   2349     HasRet = true;
   2350   }
   2351 
   2352   if (!HasRet)
   2353     return false;
   2354 
   2355   Chain = TCChain;
   2356   return true;
   2357 }
   2358 
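        /// Return the type a small argument or return value of type VT should be
        /// extended to: an i1 that is zero-extended only needs i8 on x86-64;
        /// everything else is extended to at least i32.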
   2359 EVT
   2360 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
   2361                                             ISD::NodeType ExtendKind) const {
   2362   MVT ReturnMVT;
   2363   // TODO: Is this also valid on 32-bit?
   2364   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
   2365     ReturnMVT = MVT::i8;
   2366   else
   2367     ReturnMVT = MVT::i32;
   2368 
   2369   EVT MinVT = getRegisterType(Context, ReturnMVT);
   2370   return VT.bitsLT(MinVT) ? MinVT : VT;
   2371 }
   2372 
   2373 /// Lower the result values of a call into the
   2374 /// appropriate copies out of appropriate physical registers.
   2375 ///
   2376 SDValue
   2377 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   2378                                    CallingConv::ID CallConv, bool isVarArg,
   2379                                    const SmallVectorImpl<ISD::InputArg> &Ins,
   2380                                    SDLoc dl, SelectionDAG &DAG,
   2381                                    SmallVectorImpl<SDValue> &InVals) const {
   2382 
   2383   // Assign locations to each value returned by this call.
   2384   SmallVector<CCValAssign, 16> RVLocs;
   2385   bool Is64Bit = Subtarget->is64Bit();
   2386   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
   2387                  *DAG.getContext());
   2388   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2389 
   2390   // Copy all of the result registers out of their specified physreg.
   2391   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2392     CCValAssign &VA = RVLocs[i];
   2393     EVT CopyVT = VA.getLocVT();
   2394 
   2395     // If this is x86-64, and we disabled SSE, we can't return FP values
   2396     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
   2397         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
   2398       report_fatal_error("SSE register return with SSE disabled");
   2399     }
   2400 
   2401     // If we prefer to use the value in xmm registers, copy it out as f80 and
   2402     // use a truncate to move it from fp stack reg to xmm reg.
   2403     bool RoundAfterCopy = false;
   2404     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
   2405         isScalarFPTypeInSSEReg(VA.getValVT())) {
   2406       CopyVT = MVT::f80;
   2407       RoundAfterCopy = (CopyVT != VA.getLocVT());
   2408     }
   2409 
   2410     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
   2411                                CopyVT, InFlag).getValue(1);
   2412     SDValue Val = Chain.getValue(0);
   2413 
   2414     if (RoundAfterCopy)
   2415       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   2416                         // This truncation won't change the value.
   2417                         DAG.getIntPtrConstant(1, dl));
   2418 
   2419     if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
   2420       Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
   2421 
   2422     InFlag = Chain.getValue(2);
   2423     InVals.push_back(Val);
   2424   }
   2425 
   2426   return Chain;
   2427 }
   2428 
   2429 //===----------------------------------------------------------------------===//
   2430 //                C & StdCall & Fast Calling Convention implementation
   2431 //===----------------------------------------------------------------------===//
   2432 //  The StdCall calling convention is the standard for many Windows API
   2433 //  routines. It differs from the C calling convention only slightly: the
   2434 //  callee cleans up the stack rather than the caller, and symbols are
   2435 //  decorated. It doesn't support any vector arguments.
   2436 //  For info on fast calling convention see Fast Calling Convention (tail call)
   2437 //  implementation LowerX86_32FastCCCallTo.
   2438 
   2439 /// callIsStructReturn - Determines whether a call uses struct return
   2440 /// semantics.
   2441 enum StructReturnType {
   2442   NotStructReturn,
   2443   RegStructReturn,
   2444   StackStructReturn
   2445 };
   2446 static StructReturnType
   2447 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
   2448   if (Outs.empty())
   2449     return NotStructReturn;
   2450 
   2451   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
   2452   if (!Flags.isSRet())
   2453     return NotStructReturn;
   2454   if (Flags.isInReg())
   2455     return RegStructReturn;
   2456   return StackStructReturn;
   2457 }
   2458 
   2459 /// Determines whether a function uses struct return semantics.
   2460 static StructReturnType
   2461 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   2462   if (Ins.empty())
   2463     return NotStructReturn;
   2464 
   2465   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
   2466   if (!Flags.isSRet())
   2467     return NotStructReturn;
   2468   if (Flags.isInReg())
   2469     return RegStructReturn;
   2470   return StackStructReturn;
   2471 }
   2472 
   2473 /// Make a copy of an aggregate from the address specified by "Src" to the
   2474 /// address "Dst", with size and alignment information specified by the
   2475 /// parameter attribute. The copy will be passed as a byval function parameter.
   2476 static SDValue
   2477 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
   2478                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
   2479                           SDLoc dl) {
   2480   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
   2481 
   2482   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   2483                        /*isVolatile*/false, /*AlwaysInline=*/true,
   2484                        /*isTailCall*/false,
   2485                        MachinePointerInfo(), MachinePointerInfo());
   2486 }
   2487 
   2488 /// Return true if the calling convention is one that we can guarantee TCO for.
   2489 static bool canGuaranteeTCO(CallingConv::ID CC) {
   2490   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
   2491           CC == CallingConv::HiPE || CC == CallingConv::HHVM);
   2492 }
   2493 
   2494 /// Return true if we might ever do TCO for calls with this calling convention.
   2495 static bool mayTailCallThisCC(CallingConv::ID CC) {
   2496   switch (CC) {
   2497   // C calling conventions:
   2498   case CallingConv::C:
   2499   case CallingConv::X86_64_Win64:
   2500   case CallingConv::X86_64_SysV:
   2501   // Callee pop conventions:
   2502   case CallingConv::X86_ThisCall:
   2503   case CallingConv::X86_StdCall:
   2504   case CallingConv::X86_VectorCall:
   2505   case CallingConv::X86_FastCall:
   2506     return true;
   2507   default:
   2508     return canGuaranteeTCO(CC);
   2509   }
   2510 }
   2511 
   2512 /// Return true if the function is being made into a tailcall target by
   2513 /// changing its ABI.
   2514 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
   2515   return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
   2516 }
   2517 
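        /// Return true if this call is a candidate for tail call emission: it must
        /// be marked as a tail call, tail calls must not be disabled on the caller,
        /// and the callee's calling convention must be one we may tail call.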
   2518 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   2519   auto Attr =
   2520       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
   2521   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
   2522     return false;
   2523 
   2524   CallSite CS(CI);
   2525   CallingConv::ID CalleeCC = CS.getCallingConv();
   2526   if (!mayTailCallThisCC(CalleeCC))
   2527     return false;
   2528 
   2529   return true;
   2530 }
   2531 
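        /// Lower an incoming argument that was assigned to a stack slot: create a
        /// fixed frame object at its offset (adjusted for x86 interrupt handlers,
        /// which have no return address slot) and return either the frame address
        /// for byval arguments or a load of the value.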
   2532 SDValue
   2533 X86TargetLowering::LowerMemArgument(SDValue Chain,
   2534                                     CallingConv::ID CallConv,
   2535                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2536                                     SDLoc dl, SelectionDAG &DAG,
   2537                                     const CCValAssign &VA,
   2538                                     MachineFrameInfo *MFI,
   2539                                     unsigned i) const {
   2540   // Create the nodes corresponding to a load from this parameter slot.
   2541   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   2542   bool AlwaysUseMutable = shouldGuaranteeTCO(
   2543       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
   2544   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   2545   EVT ValVT;
   2546 
   2547   // If the value is passed by pointer, we receive the address instead of the
   2548   // value itself.
   2549   bool ExtendedInMem = VA.isExtInLoc() &&
   2550     VA.getValVT().getScalarType() == MVT::i1;
   2551 
   2552   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
   2553     ValVT = VA.getLocVT();
   2554   else
   2555     ValVT = VA.getValVT();
   2556 
   2557   // Calculate the SP offset of an interrupt parameter, which occupies the
   2558   // slot normally taken by a return address.
   2559   int Offset = 0;
   2560   if (CallConv == CallingConv::X86_INTR) {
   2561     const X86Subtarget& Subtarget =
   2562         static_cast<const X86Subtarget&>(DAG.getSubtarget());
   2563     // X86 interrupts may take one or two arguments.
   2564     // Unlike a regular call, there is no return address on the stack.
   2565     // The offset of the last argument needs to be set to -4/-8 bytes, while
   2566     // the offset of the first argument (when there are two) should be 0 bytes.
   2567     Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
   2568   }
   2569 
   2570   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   2571   // changed with more analysis.
   2572   // In case of tail call optimization, mark all arguments mutable, since they
   2573   // could be overwritten by the lowering of arguments for a tail call.
   2574   if (Flags.isByVal()) {
   2575     unsigned Bytes = Flags.getByValSize();
   2576     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   2577     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
   2578     // Adjust SP offset of interrupt parameter.
   2579     if (CallConv == CallingConv::X86_INTR) {
   2580       MFI->setObjectOffset(FI, Offset);
   2581     }
   2582     return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
   2583   } else {
   2584     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
   2585                                     VA.getLocMemOffset(), isImmutable);
   2586     // Adjust SP offset of interrupt parameter.
   2587     if (CallConv == CallingConv::X86_INTR) {
   2588       MFI->setObjectOffset(FI, Offset);
   2589     }
   2590 
   2591     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
   2592     SDValue Val = DAG.getLoad(
   2593         ValVT, dl, Chain, FIN,
   2594         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
   2595         false, false, 0);
   2596     return ExtendedInMem ?
   2597       DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
   2598   }
   2599 }
   2600 
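        // Return the integer registers used for argument passing by the given
        // 64-bit calling convention: RCX/RDX/R8/R9 for Win64, otherwise
        // RDI/RSI/RDX/RCX/R8/R9.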
   2601 // FIXME: Get this from tablegen.
   2602 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
   2603                                                 const X86Subtarget *Subtarget) {
   2604   assert(Subtarget->is64Bit());
   2605 
   2606   if (Subtarget->isCallingConvWin64(CallConv)) {
   2607     static const MCPhysReg GPR64ArgRegsWin64[] = {
   2608       X86::RCX, X86::RDX, X86::R8,  X86::R9
   2609     };
   2610     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
   2611   }
   2612 
   2613   static const MCPhysReg GPR64ArgRegs64Bit[] = {
   2614     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   2615   };
   2616   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
   2617 }
   2618 
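        // Return the XMM registers that may hold variadic FP/vector arguments for
        // the given 64-bit calling convention: none on Win64 (the GPR home slots
        // are used instead) or when SSE is unavailable or implicitly disabled,
        // otherwise XMM0-XMM7.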
   2619 // FIXME: Get this from tablegen.
   2620 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
   2621                                                 CallingConv::ID CallConv,
   2622                                                 const X86Subtarget *Subtarget) {
   2623   assert(Subtarget->is64Bit());
   2624   if (Subtarget->isCallingConvWin64(CallConv)) {
   2625     // The XMM registers which might contain var arg parameters are shadowed
   2626     // in their paired GPR.  So we only need to save the GPR to their home
   2627     // in their paired GPRs.  So we only need to save the GPRs to their home
   2628     // TODO: __vectorcall will change this.
   2629     return None;
   2630   }
   2631 
   2632   const Function *Fn = MF.getFunction();
   2633   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
   2634   bool isSoftFloat = Subtarget->useSoftFloat();
   2635   assert(!(isSoftFloat && NoImplicitFloatOps) &&
   2636          "SSE register cannot be used when SSE is disabled!");
   2637   if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
   2638     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
   2639     // registers.
   2640     return None;
   2641 
   2642   static const MCPhysReg XMMArgRegs64Bit[] = {
   2643     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2644     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2645   };
   2646   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
   2647 }
   2648 
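
        /// Lower the incoming (formal) arguments of a function: assign locations
        /// with CC_X86, copy register arguments out of their physical registers or
        /// load stack arguments via LowerMemArgument, spill any unallocated vararg
        /// registers to the register save area, and record per-function details
        /// such as the number of bytes the callee pops on return.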
   2649 SDValue X86TargetLowering::LowerFormalArguments(
   2650     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
   2651     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
   2652     SmallVectorImpl<SDValue> &InVals) const {
   2653   MachineFunction &MF = DAG.getMachineFunction();
   2654   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2655   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   2656 
   2657   const Function* Fn = MF.getFunction();
   2658   if (Fn->hasExternalLinkage() &&
   2659       Subtarget->isTargetCygMing() &&
   2660       Fn->getName() == "main")
   2661     FuncInfo->setForceFramePointer(true);
   2662 
   2663   MachineFrameInfo *MFI = MF.getFrameInfo();
   2664   bool Is64Bit = Subtarget->is64Bit();
   2665   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
   2666 
   2667   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
   2668          "Var args not supported with calling convention fastcc, ghc or hipe");
   2669 
   2670   if (CallConv == CallingConv::X86_INTR) {
   2671     bool isLegal = Ins.size() == 1 ||
   2672                    (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
   2673                                         (!Is64Bit && Ins[1].VT == MVT::i32)));
   2674     if (!isLegal)
   2675       report_fatal_error("X86 interrupts may take one or two arguments");
   2676   }
   2677 
   2678   // Assign locations to all of the incoming arguments.
   2679   SmallVector<CCValAssign, 16> ArgLocs;
   2680   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
   2681 
   2682   // Allocate shadow area for Win64
   2683   if (IsWin64)
   2684     CCInfo.AllocateStack(32, 8);
   2685 
   2686   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
   2687 
   2688   unsigned LastVal = ~0U;
   2689   SDValue ArgValue;
   2690   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2691     CCValAssign &VA = ArgLocs[i];
   2692     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
   2693     // places.
   2694     assert(VA.getValNo() != LastVal &&
   2695            "Don't support value assigned to multiple locs yet");
   2696     (void)LastVal;
   2697     LastVal = VA.getValNo();
   2698 
   2699     if (VA.isRegLoc()) {
   2700       EVT RegVT = VA.getLocVT();
   2701       const TargetRegisterClass *RC;
   2702       if (RegVT == MVT::i32)
   2703         RC = &X86::GR32RegClass;
   2704       else if (Is64Bit && RegVT == MVT::i64)
   2705         RC = &X86::GR64RegClass;
   2706       else if (RegVT == MVT::f32)
   2707         RC = &X86::FR32RegClass;
   2708       else if (RegVT == MVT::f64)
   2709         RC = &X86::FR64RegClass;
   2710       else if (RegVT == MVT::f128)
   2711         RC = &X86::FR128RegClass;
   2712       else if (RegVT.is512BitVector())
   2713         RC = &X86::VR512RegClass;
   2714       else if (RegVT.is256BitVector())
   2715         RC = &X86::VR256RegClass;
   2716       else if (RegVT.is128BitVector())
   2717         RC = &X86::VR128RegClass;
   2718       else if (RegVT == MVT::x86mmx)
   2719         RC = &X86::VR64RegClass;
   2720       else if (RegVT == MVT::i1)
   2721         RC = &X86::VK1RegClass;
   2722       else if (RegVT == MVT::v8i1)
   2723         RC = &X86::VK8RegClass;
   2724       else if (RegVT == MVT::v16i1)
   2725         RC = &X86::VK16RegClass;
   2726       else if (RegVT == MVT::v32i1)
   2727         RC = &X86::VK32RegClass;
   2728       else if (RegVT == MVT::v64i1)
   2729         RC = &X86::VK64RegClass;
   2730       else
   2731         llvm_unreachable("Unknown argument type!");
   2732 
   2733       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   2734       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   2735 
   2736       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   2737       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   2738       // right size.
   2739       if (VA.getLocInfo() == CCValAssign::SExt)
   2740         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   2741                                DAG.getValueType(VA.getValVT()));
   2742       else if (VA.getLocInfo() == CCValAssign::ZExt)
   2743         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   2744                                DAG.getValueType(VA.getValVT()));
   2745       else if (VA.getLocInfo() == CCValAssign::BCvt)
   2746         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
   2747 
   2748       if (VA.isExtInLoc()) {
   2749         // Handle MMX values passed in XMM regs.
   2750         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
   2751           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
   2752         else
   2753           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   2754       }
   2755     } else {
   2756       assert(VA.isMemLoc());
   2757       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
   2758     }
   2759 
   2760     // If the value is passed via a pointer, do a load.
   2761     if (VA.getLocInfo() == CCValAssign::Indirect)
   2762       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
   2763                              MachinePointerInfo(), false, false, false, 0);
   2764 
   2765     InVals.push_back(ArgValue);
   2766   }
   2767 
   2768   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2769     // All x86 ABIs require that for returning structs by value we copy the
   2770     // sret argument into %rax/%eax (depending on ABI) for the return. Save
   2771     // the argument into a virtual register so that we can access it from the
   2772     // return points.
   2773     if (Ins[i].Flags.isSRet()) {
   2774       unsigned Reg = FuncInfo->getSRetReturnReg();
   2775       if (!Reg) {
   2776         MVT PtrTy = getPointerTy(DAG.getDataLayout());
   2777         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
   2778         FuncInfo->setSRetReturnReg(Reg);
   2779       }
   2780       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
   2781       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   2782       break;
   2783     }
   2784   }
   2785 
   2786   unsigned StackSize = CCInfo.getNextStackOffset();
   2787   // Align stack specially for tail calls.
   2788   if (shouldGuaranteeTCO(CallConv,
   2789                          MF.getTarget().Options.GuaranteedTailCallOpt))
   2790     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   2791 
   2792   // If the function takes a variable number of arguments, make a frame index for
   2793   // the start of the first vararg value... for expansion of llvm.va_start. We
   2794   // can skip this if there are no va_start calls.
   2795   if (MFI->hasVAStart() &&
   2796       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   2797                    CallConv != CallingConv::X86_ThisCall))) {
   2798     FuncInfo->setVarArgsFrameIndex(
   2799         MFI->CreateFixedObject(1, StackSize, true));
   2800   }
   2801 
   2802   // Figure out if XMM registers are in use.
   2803   assert(!(Subtarget->useSoftFloat() &&
   2804            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
   2805          "SSE register cannot be used when SSE is disabled!");
   2806 
   2807   // 64-bit calling conventions support varargs and register parameters, so we
   2808   // have to do extra work to spill them in the prologue.
   2809   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
   2810     // Find the first unallocated argument registers.
   2811     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
   2812     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
   2813     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
   2814     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
   2815     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
   2816            "SSE register cannot be used when SSE is disabled!");
   2817 
   2818     // Gather all the live in physical registers.
   2819     SmallVector<SDValue, 6> LiveGPRs;
   2820     SmallVector<SDValue, 8> LiveXMMRegs;
   2821     SDValue ALVal;
   2822     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
   2823       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
   2824       LiveGPRs.push_back(
   2825           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
   2826     }
   2827     if (!ArgXMMs.empty()) {
   2828       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2829       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
   2830       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
   2831         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
   2832         LiveXMMRegs.push_back(
   2833             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
   2834       }
   2835     }
   2836 
   2837     if (IsWin64) {
   2838       // Get to the caller-allocated home save location.  Add 8 to account
   2839       // for the return address.
   2840       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   2841       FuncInfo->setRegSaveFrameIndex(
   2842           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
   2843       // Fix up the vararg frame index to point into the shadow area (4 x i64).
   2844       if (NumIntRegs < 4)
   2845         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   2846     } else {
   2847       // For X86-64, if there are vararg parameters that are passed via
   2848       // registers, then we must store them to their spots on the stack so
   2849       // they may be loaded by dereferencing the result of va_next.
   2850       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   2851       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
   2852       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
   2853           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
   2854     }
   2855 
   2856     // Store the integer parameter registers.
   2857     SmallVector<SDValue, 8> MemOps;
   2858     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   2859                                       getPointerTy(DAG.getDataLayout()));
   2860     unsigned Offset = FuncInfo->getVarArgsGPOffset();
   2861     for (SDValue Val : LiveGPRs) {
   2862       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
   2863                                 RSFIN, DAG.getIntPtrConstant(Offset, dl));
   2864       SDValue Store =
   2865           DAG.getStore(Val.getValue(1), dl, Val, FIN,
   2866                        MachinePointerInfo::getFixedStack(
   2867                            DAG.getMachineFunction(),
   2868                            FuncInfo->getRegSaveFrameIndex(), Offset),
   2869                        false, false, 0);
   2870       MemOps.push_back(Store);
   2871       Offset += 8;
   2872     }
   2873 
   2874     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
   2875       // Now store the XMM (fp + vector) parameter registers.
   2876       SmallVector<SDValue, 12> SaveXMMOps;
   2877       SaveXMMOps.push_back(Chain);
   2878       SaveXMMOps.push_back(ALVal);
   2879       SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2880                              FuncInfo->getRegSaveFrameIndex(), dl));
   2881       SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2882                              FuncInfo->getVarArgsFPOffset(), dl));
   2883       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
   2884                         LiveXMMRegs.end());
   2885       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   2886                                    MVT::Other, SaveXMMOps));
   2887     }
   2888 
   2889     if (!MemOps.empty())
   2890       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
   2891   }
   2892 
   2893   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
   2894     // Find the largest legal vector type.
   2895     MVT VecVT = MVT::Other;
   2896     // FIXME: Only some x86_32 calling conventions support AVX512.
   2897     if (Subtarget->hasAVX512() &&
   2898         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
   2899                      CallConv == CallingConv::Intel_OCL_BI)))
   2900       VecVT = MVT::v16f32;
   2901     else if (Subtarget->hasAVX())
   2902       VecVT = MVT::v8f32;
   2903     else if (Subtarget->hasSSE2())
   2904       VecVT = MVT::v4f32;
   2905 
   2906     // We forward some GPRs and some vector types.
   2907     SmallVector<MVT, 2> RegParmTypes;
   2908     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
   2909     RegParmTypes.push_back(IntVT);
   2910     if (VecVT != MVT::Other)
   2911       RegParmTypes.push_back(VecVT);
   2912 
   2913     // Compute the set of forwarded registers. The rest are scratch.
   2914     SmallVectorImpl<ForwardedRegister> &Forwards =
   2915         FuncInfo->getForwardedMustTailRegParms();
   2916     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
   2917 
   2918     // Conservatively forward AL on x86_64, since it might be used for varargs.
   2919     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
   2920       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2921       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
   2922     }
   2923 
   2924     // Copy all forwards from physical to virtual registers.
   2925     for (ForwardedRegister &F : Forwards) {
   2926       // FIXME: Can we use a less constrained schedule?
   2927       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
   2928       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
   2929       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
   2930     }
   2931   }
   2932 
   2933   // Some CCs need callee pop.
   2934   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2935                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   2936     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   2937   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
   2938     // X86 interrupts must pop the error code if present
   2939     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
   2940   } else {
   2941     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   2942     // If this is an sret function, the return should pop the hidden pointer.
   2943     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
   2944         !Subtarget->getTargetTriple().isOSMSVCRT() &&
   2945         argsAreStructReturn(Ins) == StackStructReturn)
   2946       FuncInfo->setBytesToPopOnReturn(4);
   2947   }
   2948 
   2949   if (!Is64Bit) {
   2950     // RegSaveFrameIndex is X86-64 only.
   2951     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
   2952     if (CallConv == CallingConv::X86_FastCall ||
   2953         CallConv == CallingConv::X86_ThisCall)
   2954       // fastcall and thiscall functions can't have varargs.
   2955       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   2956   }
   2957 
   2958   FuncInfo->setArgumentStackSize(StackSize);
   2959 
   2960   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
   2961     EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
   2962     if (Personality == EHPersonality::CoreCLR) {
   2963       assert(Is64Bit);
   2964       // TODO: Add a mechanism to frame lowering that will allow us to indicate
   2965       // that we'd prefer this slot be allocated towards the bottom of the frame
   2966       // (i.e. near the stack pointer after allocating the frame).  Every
   2967       // funclet needs a copy of this slot in its (mostly empty) frame, and the
   2968       // offset from the bottom of this and each funclet's frame must be the
   2969       // same, so the size of funclets' (mostly empty) frames is dictated by
   2970       // how far this slot is from the bottom (since they allocate just enough
   2971       // space to accommodate holding this slot at the correct offset).
   2972       int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
   2973       EHInfo->PSPSymFrameIdx = PSPSymFI;
   2974     }
   2975   }
   2976 
   2977   return Chain;
   2978 }
   2979 
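        /// Compute the stack address of an outgoing call argument assigned to
        /// memory, then either memcpy a byval argument to it or emit a plain store.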
   2980 SDValue
   2981 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
   2982                                     SDValue StackPtr, SDValue Arg,
   2983                                     SDLoc dl, SelectionDAG &DAG,
   2984                                     const CCValAssign &VA,
   2985                                     ISD::ArgFlagsTy Flags) const {
   2986   unsigned LocMemOffset = VA.getLocMemOffset();
   2987   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
   2988   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
   2989                        StackPtr, PtrOff);
   2990   if (Flags.isByVal())
   2991     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   2992 
   2993   return DAG.getStore(
   2994       Chain, dl, Arg, PtrOff,
   2995       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
   2996       false, false, 0);
   2997 }
   2998 
   2999 /// Emit a load of the return address if tail call
   3000 /// optimization is performed and it is required.
   3001 SDValue
   3002 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   3003                                            SDValue &OutRetAddr, SDValue Chain,
   3004                                            bool IsTailCall, bool Is64Bit,
   3005                                            int FPDiff, SDLoc dl) const {
   3006   // Adjust the Return address stack slot.
   3007   EVT VT = getPointerTy(DAG.getDataLayout());
   3008   OutRetAddr = getReturnAddressFrameIndex(DAG);
   3009 
   3010   // Load the "old" Return address.
   3011   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
   3012                            false, false, false, 0);
   3013   return SDValue(OutRetAddr.getNode(), 1);
   3014 }
   3015 
   3016 /// Emit a store of the return address if tail call
   3017 /// optimization is performed and it is required (FPDiff!=0).
   3018 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
   3019                                         SDValue Chain, SDValue RetAddrFrIdx,
   3020                                         EVT PtrVT, unsigned SlotSize,
   3021                                         int FPDiff, SDLoc dl) {
   3022   // Store the return address to the appropriate stack slot.
   3023   if (!FPDiff) return Chain;
   3024   // Calculate the new stack slot for the return address.
   3025   int NewReturnAddrFI =
   3026     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
   3027                                          false);
   3028   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   3029   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   3030                        MachinePointerInfo::getFixedStack(
   3031                            DAG.getMachineFunction(), NewReturnAddrFI),
   3032                        false, false, 0);
   3033   return Chain;
   3034 }
   3035 
   3036 /// Returns a vector_shuffle mask for a movs{s|d} or movd
   3037 /// operation of the specified width.
   3038 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
   3039                        SDValue V2) {
   3040   unsigned NumElems = VT.getVectorNumElements();
   3041   SmallVector<int, 8> Mask;
   3042   Mask.push_back(NumElems);
   3043   for (unsigned i = 1; i != NumElems; ++i)
   3044     Mask.push_back(i);
   3045   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   3046 }
   3047 
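        /// Lower an outgoing call: determine tail-call and sibcall eligibility,
        /// analyze the operands with CC_X86, and set up the register copies and
        /// stack stores that pass the arguments for the call sequence.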
   3048 SDValue
   3049 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   3050                              SmallVectorImpl<SDValue> &InVals) const {
   3051   SelectionDAG &DAG                     = CLI.DAG;
   3052   SDLoc &dl                             = CLI.DL;
   3053   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   3054   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
   3055   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   3056   SDValue Chain                         = CLI.Chain;
   3057   SDValue Callee                        = CLI.Callee;
   3058   CallingConv::ID CallConv              = CLI.CallConv;
   3059   bool &isTailCall                      = CLI.IsTailCall;
   3060   bool isVarArg                         = CLI.IsVarArg;
   3061 
   3062   MachineFunction &MF = DAG.getMachineFunction();
   3063   bool Is64Bit        = Subtarget->is64Bit();
   3064   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
   3065   StructReturnType SR = callIsStructReturn(Outs);
   3066   bool IsSibcall      = false;
   3067   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   3068   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
   3069 
   3070   if (CallConv == CallingConv::X86_INTR)
   3071     report_fatal_error("X86 interrupts may not be called directly");
   3072 
   3073   if (Attr.getValueAsString() == "true")
   3074     isTailCall = false;
   3075 
   3076   if (Subtarget->isPICStyleGOT() &&
   3077       !MF.getTarget().Options.GuaranteedTailCallOpt) {
   3078     // If we are using a GOT, disable tail calls to external symbols with
   3079     // default visibility. Tail calling such a symbol requires using a GOT
   3080     // relocation, which forces early binding of the symbol. This breaks code
   3081     // that requires lazy function symbol resolution. Using musttail or
   3082     // GuaranteedTailCallOpt will override this.
   3083     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   3084     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
   3085                G->getGlobal()->hasDefaultVisibility()))
   3086       isTailCall = false;
   3087   }
   3088 
   3089   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
   3090   if (IsMustTail) {
   3091     // Force this to be a tail call.  The verifier rules are enough to ensure
   3092     // that we can lower this successfully without moving the return address
   3093     // around.
   3094     isTailCall = true;
   3095   } else if (isTailCall) {
   3096     // Check if it's really possible to do a tail call.
   3097     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   3098                     isVarArg, SR != NotStructReturn,
   3099                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
   3100                     Outs, OutVals, Ins, DAG);
   3101 
   3102     // Sibcalls are automatically detected tailcalls which do not require
   3103     // ABI changes.
   3104     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   3105       IsSibcall = true;
   3106 
   3107     if (isTailCall)
   3108       ++NumTailCalls;
   3109   }
   3110 
   3111   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
   3112          "Var args not supported with calling convention fastcc, ghc or hipe");
   3113 
   3114   // Analyze operands of the call, assigning locations to each operand.
   3115   SmallVector<CCValAssign, 16> ArgLocs;
   3116   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
   3117 
   3118   // Allocate shadow area for Win64
   3119   if (IsWin64)
   3120     CCInfo.AllocateStack(32, 8);
   3121 
   3122   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3123 
   3124   // Get a count of how many bytes are to be pushed on the stack.
   3125   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
   3126   if (IsSibcall)
   3127     // This is a sibcall. The memory operands are already available in the
   3128     // caller's own caller's stack.
   3129     NumBytes = 0;
   3130   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
   3131            canGuaranteeTCO(CallConv))
   3132     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   3133 
   3134   int FPDiff = 0;
   3135   if (isTailCall && !IsSibcall && !IsMustTail) {
   3136     // Lower arguments at fp - stackoffset + fpdiff.
   3137     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
   3138 
   3139     FPDiff = NumBytesCallerPushed - NumBytes;
   3140 
   3141     // Set the delta of movement of the returnaddr stackslot.
   3142     // But only set if delta is greater than previous delta.
   3143     if (FPDiff < X86Info->getTCReturnAddrDelta())
   3144       X86Info->setTCReturnAddrDelta(FPDiff);
   3145   }
   3146 
   3147   unsigned NumBytesToPush = NumBytes;
   3148   unsigned NumBytesToPop = NumBytes;
   3149 
   3150   // If we have an inalloca argument, all stack space has already been allocated
   3151   // for us and is right at the top of the stack.  We don't support multiple
   3152   // arguments passed in memory when using inalloca.
   3153   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
   3154     NumBytesToPush = 0;
   3155     if (!ArgLocs.back().isMemLoc())
   3156       report_fatal_error("cannot use inalloca attribute on a register "
   3157                          "parameter");
   3158     if (ArgLocs.back().getLocMemOffset() != 0)
   3159       report_fatal_error("any parameter with the inalloca attribute must be "
   3160                          "the only memory argument");
   3161   }
   3162 
   3163   if (!IsSibcall)
   3164     Chain = DAG.getCALLSEQ_START(
   3165         Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
   3166 
   3167   SDValue RetAddrFrIdx;
   3168   // Load return address for tail calls.
   3169   if (isTailCall && FPDiff)
   3170     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   3171                                     Is64Bit, FPDiff, dl);
   3172 
   3173   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   3174   SmallVector<SDValue, 8> MemOpChains;
   3175   SDValue StackPtr;
   3176 
   3177   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   3178   // of tail call optimization, arguments are handled later.
   3179   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   3180   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3181     // Skip inalloca arguments, they have already been written.
   3182     ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3183     if (Flags.isInAlloca())
   3184       continue;
   3185 
   3186     CCValAssign &VA = ArgLocs[i];
   3187     EVT RegVT = VA.getLocVT();
   3188     SDValue Arg = OutVals[i];
   3189     bool isByVal = Flags.isByVal();
   3190 
   3191     // Promote the value if needed.
   3192     switch (VA.getLocInfo()) {
   3193     default: llvm_unreachable("Unknown loc info!");
   3194     case CCValAssign::Full: break;
   3195     case CCValAssign::SExt:
   3196       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   3197       break;
   3198     case CCValAssign::ZExt:
   3199       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   3200       break;
   3201     case CCValAssign::AExt:
   3202       if (Arg.getValueType().isVector() &&
   3203           Arg.getValueType().getVectorElementType() == MVT::i1)
   3204         Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   3205       else if (RegVT.is128BitVector()) {
   3206         // Special case: passing MMX values in XMM registers.
   3207         Arg = DAG.getBitcast(MVT::i64, Arg);
   3208         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   3209         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   3210       } else
   3211         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   3212       break;
   3213     case CCValAssign::BCvt:
   3214       Arg = DAG.getBitcast(RegVT, Arg);
   3215       break;
   3216     case CCValAssign::Indirect: {
   3217       // Store the argument.
   3218       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   3219       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   3220       Chain = DAG.getStore(
   3221           Chain, dl, Arg, SpillSlot,
   3222           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
   3223           false, false, 0);
   3224       Arg = SpillSlot;
   3225       break;
   3226     }
   3227     }
   3228 
   3229     if (VA.isRegLoc()) {
   3230       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   3231       if (isVarArg && IsWin64) {
    3232         // The Win64 ABI requires an argument passed in an XMM register to also
    3233         // be copied to the corresponding shadow GPR if the callee is variadic.
   3234         unsigned ShadowReg = 0;
   3235         switch (VA.getLocReg()) {
   3236         case X86::XMM0: ShadowReg = X86::RCX; break;
   3237         case X86::XMM1: ShadowReg = X86::RDX; break;
   3238         case X86::XMM2: ShadowReg = X86::R8; break;
   3239         case X86::XMM3: ShadowReg = X86::R9; break;
   3240         }
   3241         if (ShadowReg)
   3242           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   3243       }
   3244     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   3245       assert(VA.isMemLoc());
   3246       if (!StackPtr.getNode())
   3247         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   3248                                       getPointerTy(DAG.getDataLayout()));
   3249       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   3250                                              dl, DAG, VA, Flags));
   3251     }
   3252   }
   3253 
   3254   if (!MemOpChains.empty())
   3255     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
   3256 
   3257   if (Subtarget->isPICStyleGOT()) {
    3258     // ELF / PIC requires the GOT pointer to be live in EBX before making a
    3259     // function call through the PLT.
   3260     if (!isTailCall) {
   3261       RegsToPass.push_back(std::make_pair(
   3262           unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
   3263                                           getPointerTy(DAG.getDataLayout()))));
   3264     } else {
    3265       // If we are tail calling and generating PIC/GOT style code, load the
    3266       // address of the callee into ECX. The value in ECX is used as the target of
   3267       // the tail jump. This is done to circumvent the ebx/callee-saved problem
   3268       // for tail calls on PIC/GOT architectures. Normally we would just put the
   3269       // address of GOT into ebx and then call target@PLT. But for tail calls
   3270       // ebx would be restored (since ebx is callee saved) before jumping to the
   3271       // target@PLT.
   3272 
   3273       // Note: The actual moving to ECX is done further down.
   3274       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   3275       if (G && !G->getGlobal()->hasLocalLinkage() &&
   3276           G->getGlobal()->hasDefaultVisibility())
   3277         Callee = LowerGlobalAddress(Callee, DAG);
   3278       else if (isa<ExternalSymbolSDNode>(Callee))
   3279         Callee = LowerExternalSymbol(Callee, DAG);
   3280     }
   3281   }
   3282 
   3283   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
   3284     // From AMD64 ABI document:
   3285     // For calls that may call functions that use varargs or stdargs
   3286     // (prototype-less calls or calls to functions containing ellipsis (...) in
    3287     // the declaration) %al is used as a hidden argument to specify the number
    3288     // of SSE registers used. The contents of %al do not need to match exactly
    3289     // the number of registers, but must be an upper bound on the number of SSE
    3290     // registers used and must be in the range 0 - 8 inclusive.
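             // For illustration (not from the ABI document): a variadic call that
             // has placed values in XMM0 and XMM1 sees getFirstUnallocated() return
             // 2 below, so %al is loaded with 2.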
   3291 
   3292     // Count the number of XMM registers allocated.
   3293     static const MCPhysReg XMMArgRegs[] = {
   3294       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   3295       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   3296     };
   3297     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
   3298     assert((Subtarget->hasSSE1() || !NumXMMRegs)
   3299            && "SSE registers cannot be used when SSE is disabled");
   3300 
   3301     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
   3302                                         DAG.getConstant(NumXMMRegs, dl,
   3303                                                         MVT::i8)));
   3304   }
   3305 
   3306   if (isVarArg && IsMustTail) {
   3307     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
   3308     for (const auto &F : Forwards) {
   3309       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
   3310       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
   3311     }
   3312   }
   3313 
   3314   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
   3315   // don't need this because the eligibility check rejects calls that require
   3316   // shuffling arguments passed in memory.
   3317   if (!IsSibcall && isTailCall) {
   3318     // Force all the incoming stack arguments to be loaded from the stack
   3319     // before any new outgoing arguments are stored to the stack, because the
   3320     // outgoing stack slots may alias the incoming argument stack slots, and
   3321     // the alias isn't otherwise explicit. This is slightly more conservative
   3322     // than necessary, because it means that each store effectively depends
   3323     // on every argument instead of just those arguments it would clobber.
   3324     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   3325 
   3326     SmallVector<SDValue, 8> MemOpChains2;
   3327     SDValue FIN;
   3328     int FI = 0;
   3329     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3330       CCValAssign &VA = ArgLocs[i];
   3331       if (VA.isRegLoc())
   3332         continue;
   3333       assert(VA.isMemLoc());
   3334       SDValue Arg = OutVals[i];
   3335       ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3336       // Skip inalloca arguments.  They don't require any work.
   3337       if (Flags.isInAlloca())
   3338         continue;
   3339       // Create frame index.
   3340       int32_t Offset = VA.getLocMemOffset()+FPDiff;
   3341       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   3342       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   3343       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
   3344 
   3345       if (Flags.isByVal()) {
   3346         // Copy relative to framepointer.
   3347         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
   3348         if (!StackPtr.getNode())
   3349           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   3350                                         getPointerTy(DAG.getDataLayout()));
   3351         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
   3352                              StackPtr, Source);
   3353 
   3354         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   3355                                                          ArgChain,
   3356                                                          Flags, DAG, dl));
   3357       } else {
   3358         // Store relative to framepointer.
   3359         MemOpChains2.push_back(DAG.getStore(
   3360             ArgChain, dl, Arg, FIN,
   3361             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
   3362             false, false, 0));
   3363       }
   3364     }
   3365 
   3366     if (!MemOpChains2.empty())
   3367       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
   3368 
   3369     // Store the return address to the appropriate stack slot.
   3370     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
   3371                                      getPointerTy(DAG.getDataLayout()),
   3372                                      RegInfo->getSlotSize(), FPDiff, dl);
   3373   }
   3374 
   3375   // Build a sequence of copy-to-reg nodes chained together with token chain
   3376   // and flag operands which copy the outgoing args into registers.
   3377   SDValue InFlag;
   3378   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   3379     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   3380                              RegsToPass[i].second, InFlag);
   3381     InFlag = Chain.getValue(1);
   3382   }
   3383 
   3384   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
   3385     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   3386     // In the 64-bit large code model, we have to make all calls
   3387     // through a register, since the call instruction's 32-bit
   3388     // pc-relative offset may not be large enough to hold the whole
   3389     // address.
   3390   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
   3391     // If the callee is a GlobalAddress node (quite common, every direct call
   3392     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   3393     // it.
   3394     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
   3395 
   3396     // We should use extra load for direct calls to dllimported functions in
   3397     // non-JIT mode.
   3398     const GlobalValue *GV = G->getGlobal();
   3399     if (!GV->hasDLLImportStorageClass()) {
   3400       unsigned char OpFlags = 0;
   3401       bool ExtraLoad = false;
   3402       unsigned WrapperKind = ISD::DELETED_NODE;
   3403 
   3404       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
    3405       // external symbols must go through the PLT in PIC mode.  If the symbol
   3406       // has hidden or protected visibility, or if it is static or local, then
   3407       // we don't need to use the PLT - we can directly call it.
   3408       if (Subtarget->isTargetELF() &&
   3409           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
   3410           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
   3411         OpFlags = X86II::MO_PLT;
   3412       } else if (Subtarget->isPICStyleStubAny() &&
   3413                  !GV->isStrongDefinitionForLinker() &&
   3414                  (!Subtarget->getTargetTriple().isMacOSX() ||
   3415                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   3416         // PC-relative references to external symbols should go through $stub,
   3417         // unless we're building with the leopard linker or later, which
   3418         // automatically synthesizes these stubs.
   3419         OpFlags = X86II::MO_DARWIN_STUB;
   3420       } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
   3421                  cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
   3422         // If the function is marked as non-lazy, generate an indirect call
   3423         // which loads from the GOT directly. This avoids runtime overhead
   3424         // at the cost of eager binding (and one extra byte of encoding).
   3425         OpFlags = X86II::MO_GOTPCREL;
   3426         WrapperKind = X86ISD::WrapperRIP;
   3427         ExtraLoad = true;
   3428       }
   3429 
   3430       Callee = DAG.getTargetGlobalAddress(
   3431           GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
   3432 
   3433       // Add a wrapper if needed.
   3434       if (WrapperKind != ISD::DELETED_NODE)
   3435         Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
   3436                              getPointerTy(DAG.getDataLayout()), Callee);
   3437       // Add extra indirection if needed.
   3438       if (ExtraLoad)
   3439         Callee = DAG.getLoad(
   3440             getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
   3441             MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
   3442             false, 0);
   3443     }
   3444   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   3445     unsigned char OpFlags = 0;
   3446 
   3447     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
   3448     // external symbols should go through the PLT.
   3449     if (Subtarget->isTargetELF() &&
   3450         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
   3451       OpFlags = X86II::MO_PLT;
   3452     } else if (Subtarget->isPICStyleStubAny() &&
   3453                (!Subtarget->getTargetTriple().isMacOSX() ||
   3454                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   3455       // PC-relative references to external symbols should go through $stub,
   3456       // unless we're building with the leopard linker or later, which
   3457       // automatically synthesizes these stubs.
   3458       OpFlags = X86II::MO_DARWIN_STUB;
   3459     }
   3460 
   3461     Callee = DAG.getTargetExternalSymbol(
   3462         S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
   3463   } else if (Subtarget->isTarget64BitILP32() &&
   3464              Callee->getValueType(0) == MVT::i32) {
    3465     // Zero-extend the 32-bit Callee address into a 64-bit one, per the x32 ABI.
   3466     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
   3467   }
   3468 
   3469   // Returns a chain & a flag for retval copy to use.
   3470   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   3471   SmallVector<SDValue, 8> Ops;
   3472 
   3473   if (!IsSibcall && isTailCall) {
   3474     Chain = DAG.getCALLSEQ_END(Chain,
   3475                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
   3476                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
   3477     InFlag = Chain.getValue(1);
   3478   }
   3479 
   3480   Ops.push_back(Chain);
   3481   Ops.push_back(Callee);
   3482 
   3483   if (isTailCall)
   3484     Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
   3485 
   3486   // Add argument registers to the end of the list so that they are known live
   3487   // into the call.
   3488   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   3489     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   3490                                   RegsToPass[i].second.getValueType()));
   3491 
   3492   // Add a register mask operand representing the call-preserved registers.
   3493   const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
   3494   assert(Mask && "Missing call preserved mask for calling convention");
   3495 
   3496   // If this is an invoke in a 32-bit function using a funclet-based
   3497   // personality, assume the function clobbers all registers. If an exception
   3498   // is thrown, the runtime will not restore CSRs.
   3499   // FIXME: Model this more precisely so that we can register allocate across
   3500   // the normal edge and spill and fill across the exceptional edge.
   3501   if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
   3502     const Function *CallerFn = MF.getFunction();
   3503     EHPersonality Pers =
   3504         CallerFn->hasPersonalityFn()
   3505             ? classifyEHPersonality(CallerFn->getPersonalityFn())
   3506             : EHPersonality::Unknown;
   3507     if (isFuncletEHPersonality(Pers))
   3508       Mask = RegInfo->getNoPreservedMask();
   3509   }
   3510 
   3511   Ops.push_back(DAG.getRegisterMask(Mask));
   3512 
   3513   if (InFlag.getNode())
   3514     Ops.push_back(InFlag);
   3515 
   3516   if (isTailCall) {
   3517     // We used to do:
   3518     //// If this is the first return lowered for this function, add the regs
   3519     //// to the liveout set for the function.
   3520     // This isn't right, although it's probably harmless on x86; liveouts
   3521     // should be computed from returns not tail calls.  Consider a void
   3522     // function making a tail call to a function returning int.
   3523     MF.getFrameInfo()->setHasTailCall();
   3524     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
   3525   }
   3526 
   3527   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   3528   InFlag = Chain.getValue(1);
   3529 
   3530   // Create the CALLSEQ_END node.
   3531   unsigned NumBytesForCalleeToPop;
   3532   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   3533                        DAG.getTarget().Options.GuaranteedTailCallOpt))
   3534     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
   3535   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
   3536            !Subtarget->getTargetTriple().isOSMSVCRT() &&
   3537            SR == StackStructReturn)
   3538     // If this is a call to a struct-return function, the callee
   3539     // pops the hidden struct pointer, so we have to push it back.
   3540     // This is common for Darwin/X86, Linux & Mingw32 targets.
   3541     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   3542     NumBytesForCalleeToPop = 4;
   3543   else
   3544     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
   3545 
   3546   // Returns a flag for retval copy to use.
   3547   if (!IsSibcall) {
   3548     Chain = DAG.getCALLSEQ_END(Chain,
   3549                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
   3550                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
   3551                                                      true),
   3552                                InFlag, dl);
   3553     InFlag = Chain.getValue(1);
   3554   }
   3555 
   3556   // Handle result values, copying them out of physregs into vregs that we
   3557   // return.
   3558   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
   3559                          Ins, dl, DAG, InVals);
   3560 }
   3561 
   3562 //===----------------------------------------------------------------------===//
   3563 //                Fast Calling Convention (tail call) implementation
   3564 //===----------------------------------------------------------------------===//
   3565 
    3566 //  Like stdcall, the callee cleans up the arguments, except that ECX is
    3567 //  reserved for storing the tail-called function's address. Only 2 registers are
   3568 //  free for argument passing (inreg). Tail call optimization is performed
   3569 //  provided:
   3570 //                * tailcallopt is enabled
   3571 //                * caller/callee are fastcc
    3572 //  On the X86_64 architecture, with GOT-style position independent code, only
    3573 //  local (within-module) calls are supported at the moment.
    3574 //  To keep the stack aligned per the platform ABI, GetAlignedArgumentStackSize
    3575 //  ensures that the argument delta is always a multiple of the stack alignment.
    3576 //  (Dynamic linkers need this - darwin's dyld for example.)
    3577 //  If the tail-called callee has more arguments than the caller, the caller
    3578 //  needs to make sure that there is room to move the RETADDR to. This is
   3579 //  achieved by reserving an area the size of the argument delta right after the
   3580 //  original RETADDR, but before the saved framepointer or the spilled registers
   3581 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
   3582 //  stack layout:
   3583 //    arg1
   3584 //    arg2
   3585 //    RETADDR
   3586 //    [ new RETADDR
   3587 //      move area ]
   3588 //    (possible EBP)
   3589 //    ESI
   3590 //    EDI
   3591 //    local1 ..
   3592 
    3593 /// Align the stack size, e.g. to the form 16n + 12 for a 16-byte alignment
    3594 /// requirement.
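         /// For illustration, with a 16-byte stack alignment and 4-byte slots (the
         /// 32-bit case): a StackSize of 48 is rounded up to 60 and 61 to 76, so
         /// pushing the 4-byte return address restores 16-byte alignment.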
   3595 unsigned
   3596 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   3597                                                SelectionDAG& DAG) const {
   3598   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   3599   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   3600   unsigned StackAlignment = TFI.getStackAlignment();
   3601   uint64_t AlignMask = StackAlignment - 1;
   3602   int64_t Offset = StackSize;
   3603   unsigned SlotSize = RegInfo->getSlotSize();
   3604   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    3605   if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
    3606     // Low bits at most StackAlignment - SlotSize (e.g. 12); just add the difference.
   3607   } else {
   3608     // Mask out lower bits, add stackalignment once plus the 12 bytes.
    3609     // Mask out the low bits and add StackAlignment plus (StackAlignment - SlotSize).
   3610       (StackAlignment-SlotSize);
   3611   }
   3612   return Offset;
   3613 }
   3614 
   3615 /// Return true if the given stack call argument is already available in the
   3616 /// same position (relatively) of the caller's incoming argument stack.
   3617 static
   3618 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   3619                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
   3620                          const X86InstrInfo *TII) {
   3621   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
   3622   int FI = INT_MAX;
   3623   if (Arg.getOpcode() == ISD::CopyFromReg) {
   3624     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   3625     if (!TargetRegisterInfo::isVirtualRegister(VR))
   3626       return false;
   3627     MachineInstr *Def = MRI->getVRegDef(VR);
   3628     if (!Def)
   3629       return false;
   3630     if (!Flags.isByVal()) {
   3631       if (!TII->isLoadFromStackSlot(Def, FI))
   3632         return false;
   3633     } else {
   3634       unsigned Opcode = Def->getOpcode();
   3635       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
   3636            Opcode == X86::LEA64_32r) &&
   3637           Def->getOperand(1).isFI()) {
   3638         FI = Def->getOperand(1).getIndex();
   3639         Bytes = Flags.getByValSize();
   3640       } else
   3641         return false;
   3642     }
   3643   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   3644     if (Flags.isByVal())
   3645       // ByVal argument is passed in as a pointer but it's now being
   3646       // dereferenced. e.g.
   3647       // define @foo(%struct.X* %A) {
   3648       //   tail call @bar(%struct.X* byval %A)
   3649       // }
   3650       return false;
   3651     SDValue Ptr = Ld->getBasePtr();
   3652     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   3653     if (!FINode)
   3654       return false;
   3655     FI = FINode->getIndex();
   3656   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   3657     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   3658     FI = FINode->getIndex();
   3659     Bytes = Flags.getByValSize();
   3660   } else
   3661     return false;
   3662 
   3663   assert(FI != INT_MAX);
   3664   if (!MFI->isFixedObjectIndex(FI))
   3665     return false;
   3666   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
   3667 }
   3668 
   3669 /// Check whether the call is eligible for tail call optimization. Targets
   3670 /// that want to do tail call optimization should implement this function.
   3671 bool X86TargetLowering::IsEligibleForTailCallOptimization(
   3672     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
   3673     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
   3674     const SmallVectorImpl<ISD::OutputArg> &Outs,
   3675     const SmallVectorImpl<SDValue> &OutVals,
   3676     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
   3677   if (!mayTailCallThisCC(CalleeCC))
   3678     return false;
   3679 
   3680   // If -tailcallopt is specified, make fastcc functions tail-callable.
   3681   MachineFunction &MF = DAG.getMachineFunction();
   3682   const Function *CallerF = MF.getFunction();
   3683 
   3684   // If the function return type is x86_fp80 and the callee return type is not,
   3685   // then the FP_EXTEND of the call result is not a nop. It's not safe to
   3686   // perform a tailcall optimization here.
   3687   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
   3688     return false;
   3689 
   3690   CallingConv::ID CallerCC = CallerF->getCallingConv();
   3691   bool CCMatch = CallerCC == CalleeCC;
   3692   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
   3693   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
   3694 
   3695   // Win64 functions have extra shadow space for argument homing. Don't do the
   3696   // sibcall if the caller and callee have mismatched expectations for this
   3697   // space.
   3698   if (IsCalleeWin64 != IsCallerWin64)
   3699     return false;
   3700 
   3701   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
   3702     if (canGuaranteeTCO(CalleeCC) && CCMatch)
   3703       return true;
   3704     return false;
   3705   }
   3706 
   3707   // Look for obvious safe cases to perform tail call optimization that do not
   3708   // require ABI changes. This is what gcc calls sibcall.
   3709 
   3710   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   3711   // emit a special epilogue.
   3712   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   3713   if (RegInfo->needsStackRealignment(MF))
   3714     return false;
   3715 
   3716   // Also avoid sibcall optimization if either caller or callee uses struct
   3717   // return semantics.
   3718   if (isCalleeStructRet || isCallerStructRet)
   3719     return false;
   3720 
   3721   // Do not sibcall optimize vararg calls unless all arguments are passed via
   3722   // registers.
   3723   if (isVarArg && !Outs.empty()) {
   3724     // Optimizing for varargs on Win64 is unlikely to be safe without
   3725     // additional testing.
   3726     if (IsCalleeWin64 || IsCallerWin64)
   3727       return false;
   3728 
   3729     SmallVector<CCValAssign, 16> ArgLocs;
   3730     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
   3731                    *DAG.getContext());
   3732 
   3733     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3734     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   3735       if (!ArgLocs[i].isRegLoc())
   3736         return false;
   3737   }
   3738 
   3739   // If the call result is in ST0 / ST1, it needs to be popped off the x87
   3740   // stack.  Therefore, if it's not used by the call it is not safe to optimize
   3741   // this into a sibcall.
   3742   bool Unused = false;
   3743   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   3744     if (!Ins[i].Used) {
   3745       Unused = true;
   3746       break;
   3747     }
   3748   }
   3749   if (Unused) {
   3750     SmallVector<CCValAssign, 16> RVLocs;
   3751     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
   3752                    *DAG.getContext());
   3753     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   3754     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   3755       CCValAssign &VA = RVLocs[i];
   3756       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
   3757         return false;
   3758     }
   3759   }
   3760 
   3761   // If the calling conventions do not match, then we'd better make sure the
   3762   // results are returned in the same way as what the caller expects.
   3763   if (!CCMatch) {
   3764     SmallVector<CCValAssign, 16> RVLocs1;
   3765     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
   3766                     *DAG.getContext());
   3767     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
   3768 
   3769     SmallVector<CCValAssign, 16> RVLocs2;
   3770     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
   3771                     *DAG.getContext());
   3772     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
   3773 
   3774     if (RVLocs1.size() != RVLocs2.size())
   3775       return false;
   3776     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
   3777       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
   3778         return false;
   3779       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
   3780         return false;
   3781       if (RVLocs1[i].isRegLoc()) {
   3782         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
   3783           return false;
   3784       } else {
   3785         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
   3786           return false;
   3787       }
   3788     }
   3789   }
   3790 
   3791   unsigned StackArgsSize = 0;
   3792 
   3793   // If the callee takes no arguments then go on to check the results of the
   3794   // call.
   3795   if (!Outs.empty()) {
   3796     // Check if stack adjustment is needed. For now, do not do this if any
   3797     // argument is passed on the stack.
   3798     SmallVector<CCValAssign, 16> ArgLocs;
   3799     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
   3800                    *DAG.getContext());
   3801 
   3802     // Allocate shadow area for Win64
   3803     if (IsCalleeWin64)
   3804       CCInfo.AllocateStack(32, 8);
   3805 
   3806     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3807     StackArgsSize = CCInfo.getNextStackOffset();
   3808 
   3809     if (CCInfo.getNextStackOffset()) {
   3810       // Check if the arguments are already laid out in the right way as
   3811       // the caller's fixed stack objects.
   3812       MachineFrameInfo *MFI = MF.getFrameInfo();
   3813       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   3814       const X86InstrInfo *TII = Subtarget->getInstrInfo();
   3815       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3816         CCValAssign &VA = ArgLocs[i];
   3817         SDValue Arg = OutVals[i];
   3818         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3819         if (VA.getLocInfo() == CCValAssign::Indirect)
   3820           return false;
   3821         if (!VA.isRegLoc()) {
   3822           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   3823                                    MFI, MRI, TII))
   3824             return false;
   3825         }
   3826       }
   3827     }
   3828 
   3829     // If the tailcall address may be in a register, then make sure it's
   3830     // possible to register allocate for it. In 32-bit, the call address can
   3831     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   3832     // callee-saved registers are restored. These happen to be the same
   3833     // registers used to pass 'inreg' arguments so watch out for those.
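             // For instance, in PIC mode a call whose arguments occupy two of EAX,
             // EDX and ECX would be rejected below, since no register would be left
             // to hold the call target.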
   3834     if (!Subtarget->is64Bit() &&
   3835         ((!isa<GlobalAddressSDNode>(Callee) &&
   3836           !isa<ExternalSymbolSDNode>(Callee)) ||
   3837          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
   3838       unsigned NumInRegs = 0;
   3839       // In PIC we need an extra register to formulate the address computation
   3840       // for the callee.
   3841       unsigned MaxInRegs =
   3842         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
   3843 
   3844       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3845         CCValAssign &VA = ArgLocs[i];
   3846         if (!VA.isRegLoc())
   3847           continue;
   3848         unsigned Reg = VA.getLocReg();
   3849         switch (Reg) {
   3850         default: break;
   3851         case X86::EAX: case X86::EDX: case X86::ECX:
   3852           if (++NumInRegs == MaxInRegs)
   3853             return false;
   3854           break;
   3855         }
   3856       }
   3857     }
   3858   }
   3859 
   3860   bool CalleeWillPop =
   3861       X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg,
   3862                        MF.getTarget().Options.GuaranteedTailCallOpt);
   3863 
   3864   if (unsigned BytesToPop =
   3865           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
   3866     // If we have bytes to pop, the callee must pop them.
   3867     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
   3868     if (!CalleePopMatches)
   3869       return false;
   3870   } else if (CalleeWillPop && StackArgsSize > 0) {
   3871     // If we don't have bytes to pop, make sure the callee doesn't pop any.
   3872     return false;
   3873   }
   3874 
   3875   return true;
   3876 }
   3877 
   3878 FastISel *
   3879 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
   3880                                   const TargetLibraryInfo *libInfo) const {
   3881   return X86::createFastISel(funcInfo, libInfo);
   3882 }
   3883 
   3884 //===----------------------------------------------------------------------===//
   3885 //                           Other Lowering Hooks
   3886 //===----------------------------------------------------------------------===//
   3887 
   3888 static bool MayFoldLoad(SDValue Op) {
   3889   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   3890 }
   3891 
   3892 static bool MayFoldIntoStore(SDValue Op) {
   3893   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   3894 }
   3895 
   3896 static bool isTargetShuffle(unsigned Opcode) {
   3897   switch(Opcode) {
   3898   default: return false;
   3899   case X86ISD::BLENDI:
   3900   case X86ISD::PSHUFB:
   3901   case X86ISD::PSHUFD:
   3902   case X86ISD::PSHUFHW:
   3903   case X86ISD::PSHUFLW:
   3904   case X86ISD::SHUFP:
   3905   case X86ISD::PALIGNR:
   3906   case X86ISD::MOVLHPS:
   3907   case X86ISD::MOVLHPD:
   3908   case X86ISD::MOVHLPS:
   3909   case X86ISD::MOVLPS:
   3910   case X86ISD::MOVLPD:
   3911   case X86ISD::MOVSHDUP:
   3912   case X86ISD::MOVSLDUP:
   3913   case X86ISD::MOVDDUP:
   3914   case X86ISD::MOVSS:
   3915   case X86ISD::MOVSD:
   3916   case X86ISD::UNPCKL:
   3917   case X86ISD::UNPCKH:
   3918   case X86ISD::VPERMILPI:
   3919   case X86ISD::VPERM2X128:
   3920   case X86ISD::VPERMI:
   3921   case X86ISD::VPERMV:
   3922   case X86ISD::VPERMV3:
   3923     return true;
   3924   }
   3925 }
   3926 
   3927 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
   3928                                     SDValue V1, unsigned TargetMask,
   3929                                     SelectionDAG &DAG) {
   3930   switch(Opc) {
   3931   default: llvm_unreachable("Unknown x86 shuffle node");
   3932   case X86ISD::PSHUFD:
   3933   case X86ISD::PSHUFHW:
   3934   case X86ISD::PSHUFLW:
   3935   case X86ISD::VPERMILPI:
   3936   case X86ISD::VPERMI:
   3937     return DAG.getNode(Opc, dl, VT, V1,
   3938                        DAG.getConstant(TargetMask, dl, MVT::i8));
   3939   }
   3940 }
   3941 
   3942 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
   3943                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   3944   switch(Opc) {
   3945   default: llvm_unreachable("Unknown x86 shuffle node");
   3946   case X86ISD::MOVLHPS:
   3947   case X86ISD::MOVLHPD:
   3948   case X86ISD::MOVHLPS:
   3949   case X86ISD::MOVLPS:
   3950   case X86ISD::MOVLPD:
   3951   case X86ISD::MOVSS:
   3952   case X86ISD::MOVSD:
   3953   case X86ISD::UNPCKL:
   3954   case X86ISD::UNPCKH:
   3955     return DAG.getNode(Opc, dl, VT, V1, V2);
   3956   }
   3957 }
   3958 
   3959 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   3960   MachineFunction &MF = DAG.getMachineFunction();
   3961   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   3962   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   3963   int ReturnAddrIndex = FuncInfo->getRAIndex();
   3964 
   3965   if (ReturnAddrIndex == 0) {
   3966     // Set up a frame object for the return address.
   3967     unsigned SlotSize = RegInfo->getSlotSize();
   3968     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
   3969                                                            -(int64_t)SlotSize,
   3970                                                            false);
   3971     FuncInfo->setRAIndex(ReturnAddrIndex);
   3972   }
   3973 
   3974   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
   3975 }
   3976 
   3977 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   3978                                        bool hasSymbolicDisplacement) {
   3979   // Offset should fit into 32 bit immediate field.
    3980   // The offset should fit into a 32-bit immediate field.
   3981     return false;
   3982 
   3983   // If we don't have a symbolic displacement - we don't have any extra
   3984   // restrictions.
   3985   if (!hasSymbolicDisplacement)
   3986     return true;
   3987 
   3988   // FIXME: Some tweaks might be needed for medium code model.
   3989   if (M != CodeModel::Small && M != CodeModel::Kernel)
   3990     return false;
   3991 
    3992   // For the small code model we assume that the last object ends at least 16MB
    3993   // below the 2^31 boundary. We may also accept pretty large negative constants,
    3994   // knowing that all objects are in the positive half of the address space.
   3995   if (M == CodeModel::Small && Offset < 16*1024*1024)
   3996     return true;
   3997 
    3998   // For the kernel code model we know that all objects reside in the negative
    3999   // half of the 32-bit address space. We must not accept negative offsets, since
    4000   // they might fall out of that range, but pretty large positive ones are fine.
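           // For example, with a symbolic displacement an offset of +0x100000 is
           // accepted here, while an offset of -8 falls through and is rejected.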
   4001   if (M == CodeModel::Kernel && Offset >= 0)
   4002     return true;
   4003 
   4004   return false;
   4005 }
   4006 
   4007 /// Determines whether the callee is required to pop its own arguments.
   4008 /// Callee pop is necessary to support tail calls.
   4009 bool X86::isCalleePop(CallingConv::ID CallingConv,
   4010                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
   4011   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
   4012   // can guarantee TCO.
   4013   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
   4014     return true;
   4015 
   4016   switch (CallingConv) {
   4017   default:
   4018     return false;
   4019   case CallingConv::X86_StdCall:
   4020   case CallingConv::X86_FastCall:
   4021   case CallingConv::X86_ThisCall:
   4022   case CallingConv::X86_VectorCall:
   4023     return !is64Bit;
   4024   }
   4025 }
   4026 
   4027 /// \brief Return true if the condition is an unsigned comparison operation.
   4028 static bool isX86CCUnsigned(unsigned X86CC) {
   4029   switch (X86CC) {
   4030   default: llvm_unreachable("Invalid integer condition!");
   4031   case X86::COND_E:     return true;
   4032   case X86::COND_G:     return false;
   4033   case X86::COND_GE:    return false;
   4034   case X86::COND_L:     return false;
   4035   case X86::COND_LE:    return false;
   4036   case X86::COND_NE:    return true;
   4037   case X86::COND_B:     return true;
   4038   case X86::COND_A:     return true;
   4039   case X86::COND_BE:    return true;
   4040   case X86::COND_AE:    return true;
   4041   }
   4042 }
   4043 
   4044 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
   4045   switch (SetCCOpcode) {
   4046   default: llvm_unreachable("Invalid integer condition!");
   4047   case ISD::SETEQ:  return X86::COND_E;
   4048   case ISD::SETGT:  return X86::COND_G;
   4049   case ISD::SETGE:  return X86::COND_GE;
   4050   case ISD::SETLT:  return X86::COND_L;
   4051   case ISD::SETLE:  return X86::COND_LE;
   4052   case ISD::SETNE:  return X86::COND_NE;
   4053   case ISD::SETULT: return X86::COND_B;
   4054   case ISD::SETUGT: return X86::COND_A;
   4055   case ISD::SETULE: return X86::COND_BE;
   4056   case ISD::SETUGE: return X86::COND_AE;
   4057   }
   4058 }
   4059 
   4060 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
   4061 /// condition code, returning the condition code and the LHS/RHS of the
   4062 /// comparison to make.
   4063 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
   4064                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
   4065   if (!isFP) {
   4066     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   4067       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
   4068         // X > -1   -> X == 0, jump !sign.
   4069         RHS = DAG.getConstant(0, DL, RHS.getValueType());
   4070         return X86::COND_NS;
   4071       }
   4072       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
   4073         // X < 0   -> X == 0, jump on sign.
   4074         return X86::COND_S;
   4075       }
   4076       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   4077         // X < 1   -> X <= 0
   4078         RHS = DAG.getConstant(0, DL, RHS.getValueType());
   4079         return X86::COND_LE;
   4080       }
   4081     }
   4082 
   4083     return TranslateIntegerX86CC(SetCCOpcode);
   4084   }
   4085 
   4086   // First determine if it is required or is profitable to flip the operands.
   4087 
   4088   // If LHS is a foldable load, but RHS is not, flip the condition.
   4089   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   4090       !ISD::isNON_EXTLoad(RHS.getNode())) {
   4091     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   4092     std::swap(LHS, RHS);
   4093   }
   4094 
   4095   switch (SetCCOpcode) {
   4096   default: break;
   4097   case ISD::SETOLT:
   4098   case ISD::SETOLE:
   4099   case ISD::SETUGT:
   4100   case ISD::SETUGE:
   4101     std::swap(LHS, RHS);
   4102     break;
   4103   }
   4104 
   4105   // On a floating point condition, the flags are set as follows:
   4106   // ZF  PF  CF   op
   4107   //  0 | 0 | 0 | X > Y
   4108   //  0 | 0 | 1 | X < Y
   4109   //  1 | 0 | 0 | X == Y
   4110   //  1 | 1 | 1 | unordered
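           // For example, an ordered SETOLT ("X < Y") has its operands swapped above
           // and is then matched with COND_A, i.e. an unsigned "above" test on the
           // swapped comparison.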
   4111   switch (SetCCOpcode) {
   4112   default: llvm_unreachable("Condcode should be pre-legalized away");
   4113   case ISD::SETUEQ:
   4114   case ISD::SETEQ:   return X86::COND_E;
   4115   case ISD::SETOLT:              // flipped
   4116   case ISD::SETOGT:
   4117   case ISD::SETGT:   return X86::COND_A;
   4118   case ISD::SETOLE:              // flipped
   4119   case ISD::SETOGE:
   4120   case ISD::SETGE:   return X86::COND_AE;
   4121   case ISD::SETUGT:              // flipped
   4122   case ISD::SETULT:
   4123   case ISD::SETLT:   return X86::COND_B;
   4124   case ISD::SETUGE:              // flipped
   4125   case ISD::SETULE:
   4126   case ISD::SETLE:   return X86::COND_BE;
   4127   case ISD::SETONE:
   4128   case ISD::SETNE:   return X86::COND_NE;
   4129   case ISD::SETUO:   return X86::COND_P;
   4130   case ISD::SETO:    return X86::COND_NP;
   4131   case ISD::SETOEQ:
   4132   case ISD::SETUNE:  return X86::COND_INVALID;
   4133   }
   4134 }
   4135 
   4136 /// Is there a floating point cmov for the specific X86 condition code?
   4137 /// Current x86 isa includes the following FP cmov instructions:
    4138 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   4139 static bool hasFPCMov(unsigned X86CC) {
   4140   switch (X86CC) {
   4141   default:
   4142     return false;
   4143   case X86::COND_B:
   4144   case X86::COND_BE:
   4145   case X86::COND_E:
   4146   case X86::COND_P:
   4147   case X86::COND_A:
   4148   case X86::COND_AE:
   4149   case X86::COND_NE:
   4150   case X86::COND_NP:
   4151     return true;
   4152   }
   4153 }
   4154 
   4155 /// Returns true if the target can instruction select the
   4156 /// specified FP immediate natively. If false, the legalizer will
   4157 /// materialize the FP immediate as a load from a constant pool.
   4158 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   4159   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   4160     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   4161       return true;
   4162   }
   4163   return false;
   4164 }
   4165 
   4166 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
   4167                                               ISD::LoadExtType ExtTy,
   4168                                               EVT NewVT) const {
   4169   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
   4170   // relocation target a movq or addq instruction: don't let the load shrink.
   4171   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
   4172   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
   4173     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
   4174       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
   4175   return true;
   4176 }
   4177 
   4178 /// \brief Returns true if it is beneficial to convert a load of a constant
   4179 /// to just the constant itself.
   4180 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   4181                                                           Type *Ty) const {
   4182   assert(Ty->isIntegerTy());
   4183 
   4184   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   4185   if (BitSize == 0 || BitSize > 64)
   4186     return false;
   4187   return true;
   4188 }
   4189 
   4190 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
   4191                                                 unsigned Index) const {
   4192   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
   4193     return false;
   4194 
   4195   return (Index == 0 || Index == ResVT.getVectorNumElements());
   4196 }
   4197 
   4198 bool X86TargetLowering::isCheapToSpeculateCttz() const {
   4199   // Speculate cttz only if we can directly use TZCNT.
   4200   return Subtarget->hasBMI();
   4201 }
   4202 
   4203 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
   4204   // Speculate ctlz only if we can directly use LZCNT.
   4205   return Subtarget->hasLZCNT();
   4206 }
   4207 
   4208 /// Return true if every element in Mask, beginning
   4209 /// from position Pos and ending in Pos+Size is undef.
   4210 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
   4211   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
   4212     if (0 <= Mask[i])
   4213       return false;
   4214   return true;
   4215 }
   4216 
   4217 /// Return true if Val is undef or if its value falls within the
   4218 /// specified range (L, H].
    4219 /// specified range [Low, Hi).
   4220   return (Val < 0) || (Val >= Low && Val < Hi);
   4221 }
   4222 
   4223 /// Val is either less than zero (undef) or equal to the specified value.
   4224 static bool isUndefOrEqual(int Val, int CmpVal) {
   4225   return (Val < 0 || Val == CmpVal);
   4226 }
   4227 
   4228 /// Return true if every element in Mask, beginning
   4229 /// from position Pos and ending in Pos+Size, falls within the specified
    4230 /// sequential range [Low, Low+Size), or is undef.
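         /// For example, Mask == {4, -1, 6, 7} with Pos == 0, Size == 4 and Low == 4
         /// matches, while Mask == {4, 5, 5, 7} does not.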
   4231 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   4232                                        unsigned Pos, unsigned Size, int Low) {
   4233   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
   4234     if (!isUndefOrEqual(Mask[i], Low))
   4235       return false;
   4236   return true;
   4237 }
   4238 
   4239 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
    4240 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
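         /// For example, an extract of i32 elements starting at element 4 is 128-bit
         /// aligned (4 * 32 == 128), while one starting at element 2 is not.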
   4241 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
   4242   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4243   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   4244     return false;
   4245 
   4246   // The index should be aligned on a vecWidth-bit boundary.
   4247   uint64_t Index =
   4248     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4249 
   4250   MVT VT = N->getSimpleValueType(0);
   4251   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4252   bool Result = (Index * ElSize) % vecWidth == 0;
   4253 
   4254   return Result;
   4255 }
   4256 
   4257 /// Return true if the specified INSERT_SUBVECTOR
    4258 /// operand specifies a subvector insert that is suitable for inserting
    4259 /// 128- or 256-bit subvectors.
   4260 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
   4261   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   4262   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   4263     return false;
   4264   // The index should be aligned on a vecWidth-bit boundary.
   4265   uint64_t Index =
   4266     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4267 
   4268   MVT VT = N->getSimpleValueType(0);
   4269   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   4270   bool Result = (Index * ElSize) % vecWidth == 0;
   4271 
   4272   return Result;
   4273 }
   4274 
   4275 bool X86::isVINSERT128Index(SDNode *N) {
   4276   return isVINSERTIndex(N, 128);
   4277 }
   4278 
   4279 bool X86::isVINSERT256Index(SDNode *N) {
   4280   return isVINSERTIndex(N, 256);
   4281 }
   4282 
   4283 bool X86::isVEXTRACT128Index(SDNode *N) {
   4284   return isVEXTRACTIndex(N, 128);
   4285 }
   4286 
   4287 bool X86::isVEXTRACT256Index(SDNode *N) {
   4288   return isVEXTRACTIndex(N, 256);
   4289 }
   4290 
   4291 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   4292   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4293   assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
   4294          "Illegal extract subvector for VEXTRACT");
   4295 
   4296   uint64_t Index =
   4297     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   4298 
   4299   MVT VecVT = N->getOperand(0).getSimpleValueType();
   4300   MVT ElVT = VecVT.getVectorElementType();
   4301 
   4302   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4303   return Index / NumElemsPerChunk;
   4304 }
   4305 
   4306 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
   4307   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   4308   assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
   4309          "Illegal insert subvector for VINSERT");
   4310 
   4311   uint64_t Index =
   4312     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   4313 
   4314   MVT VecVT = N->getSimpleValueType(0);
   4315   MVT ElVT = VecVT.getVectorElementType();
   4316 
   4317   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   4318   return Index / NumElemsPerChunk;
   4319 }
   4320 
   4321 /// Return the appropriate immediate to extract the specified
    4322 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
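         /// For example, extracting the upper 128-bit half of a v8i32 source (element
         /// index 4) yields immediate 1, since 128 / 32 == 4 elements fit per chunk.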
   4323 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
   4324   return getExtractVEXTRACTImmediate(N, 128);
   4325 }
   4326 
   4327 /// Return the appropriate immediate to extract the specified
    4328 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
   4329 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
   4330   return getExtractVEXTRACTImmediate(N, 256);
   4331 }
   4332 
   4333 /// Return the appropriate immediate to insert at the specified
   4334 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
   4335 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
   4336   return getInsertVINSERTImmediate(N, 128);
   4337 }
   4338 
   4339 /// Return the appropriate immediate to insert at the specified
    4340 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
   4341 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   4342   return getInsertVINSERTImmediate(N, 256);
   4343 }
   4344 
   4345 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
   4346 bool X86::isZeroNode(SDValue Elt) {
   4347   return isNullConstant(Elt) || isNullFPConstant(Elt);
   4348 }
   4349 
   4350 // Build a vector of constants
   4351 // Use an UNDEF node if MaskElt == -1.
    4352 // Split 64-bit constants in 32-bit mode.
   4353 static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
   4354                               SelectionDAG &DAG,
   4355                               SDLoc dl, bool IsMask = false) {
   4356 
   4357   SmallVector<SDValue, 32>  Ops;
   4358   bool Split = false;
   4359 
   4360   MVT ConstVecVT = VT;
   4361   unsigned NumElts = VT.getVectorNumElements();
   4362   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
   4363   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
   4364     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
   4365     Split = true;
   4366   }
   4367 
   4368   MVT EltVT = ConstVecVT.getVectorElementType();
   4369   for (unsigned i = 0; i < NumElts; ++i) {
   4370     bool IsUndef = Values[i] < 0 && IsMask;
   4371     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
   4372       DAG.getConstant(Values[i], dl, EltVT);
   4373     Ops.push_back(OpNode);
   4374     if (Split)
   4375       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
   4376                     DAG.getConstant(0, dl, EltVT));
   4377   }
   4378   SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops);
   4379   if (Split)
   4380     ConstsNode = DAG.getBitcast(VT, ConstsNode);
   4381   return ConstsNode;
   4382 }
   4383 
   4384 /// Returns a vector of specified type with all zero elements.
   4385 static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget,
   4386                              SelectionDAG &DAG, SDLoc dl) {
   4387   assert(VT.isVector() && "Expected a vector type");
   4388 
   4389   // Always build SSE zero vectors as <4 x i32> bitcasted
   4390   // to their dest type. This ensures they get CSE'd.
   4391   SDValue Vec;
   4392   if (VT.is128BitVector()) {  // SSE
   4393     if (Subtarget->hasSSE2()) {  // SSE2
   4394       SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
   4395       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4396     } else { // SSE1
   4397       SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
   4398       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
   4399     }
   4400   } else if (VT.is256BitVector()) { // AVX
   4401     if (Subtarget->hasInt256()) { // AVX2
   4402       SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
   4403       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4404       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
   4405     } else {
   4406       // 256-bit logic and arithmetic instructions in AVX are all
   4407       // floating-point, no support for integer ops. Emit fp zeroed vectors.
   4408       SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
   4409       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4410       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
   4411     }
   4412   } else if (VT.is512BitVector()) { // AVX-512
   4413       SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
   4414       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
   4415                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4416       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
   4417   } else if (VT.getVectorElementType() == MVT::i1) {
   4418 
   4419     assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
   4420             && "Unexpected vector type");
   4421     assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
   4422             && "Unexpected vector type");
   4423     SDValue Cst = DAG.getConstant(0, dl, MVT::i1);
   4424     SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
   4425     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   4426   } else
   4427     llvm_unreachable("Unexpected vector type");
   4428 
   4429   return DAG.getBitcast(VT, Vec);
   4430 }
   4431 
   4432 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
   4433                                 SelectionDAG &DAG, SDLoc dl,
   4434                                 unsigned vectorWidth) {
   4435   assert((vectorWidth == 128 || vectorWidth == 256) &&
   4436          "Unsupported vector width");
   4437   EVT VT = Vec.getValueType();
   4438   EVT ElVT = VT.getVectorElementType();
   4439   unsigned Factor = VT.getSizeInBits()/vectorWidth;
   4440   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
   4441                                   VT.getVectorNumElements()/Factor);
   4442 
   4443   // Extract from UNDEF is UNDEF.
   4444   if (Vec.getOpcode() == ISD::UNDEF)
   4445     return DAG.getUNDEF(ResultVT);
   4446 
    4447   // Extract the relevant vectorWidth bits by generating an EXTRACT_SUBVECTOR.
   4448   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
   4449   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
   4450 
   4451   // This is the index of the first element of the vectorWidth-bit chunk
    4452   // we want. ElemsPerChunk is a power of 2, so just clear the low bits.
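           // E.g. when extracting a 128-bit chunk from v8i32, ElemsPerChunk is 4, so an
           // IdxVal of 5 is rounded down to 4 (the start of the second chunk).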
   4453   IdxVal &= ~(ElemsPerChunk - 1);
   4454 
   4455   // If the input is a buildvector just emit a smaller one.
   4456   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
   4457     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
   4458                        makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
   4459 
   4460   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
   4461   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
   4462 }
   4463 
   4464 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
   4465 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
   4466 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
   4467 /// instructions or a simple subregister reference. Idx is an index in the
   4468 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
   4469 /// lowering EXTRACT_VECTOR_ELT operations easier.
   4470 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
   4471                                    SelectionDAG &DAG, SDLoc dl) {
   4472   assert((Vec.getValueType().is256BitVector() ||
   4473           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
   4474   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
   4475 }
   4476 
   4477 /// Generate a DAG to grab 256-bits from a 512-bit vector.
   4478 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
   4479                                    SelectionDAG &DAG, SDLoc dl) {
   4480   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
   4481   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
   4482 }
   4483 
   4484 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
   4485                                unsigned IdxVal, SelectionDAG &DAG,
   4486                                SDLoc dl, unsigned vectorWidth) {
   4487   assert((vectorWidth == 128 || vectorWidth == 256) &&
   4488          "Unsupported vector width");
    4489   // Inserting UNDEF is a no-op, so just return Result.
   4490   if (Vec.getOpcode() == ISD::UNDEF)
   4491     return Result;
   4492   EVT VT = Vec.getValueType();
   4493   EVT ElVT = VT.getVectorElementType();
   4494   EVT ResultVT = Result.getValueType();
   4495 
   4496   // Insert the relevant vectorWidth bits.
   4497   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
   4498   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
   4499 
   4500   // This is the index of the first element of the vectorWidth-bit chunk
    4501   // we want. ElemsPerChunk is a power of 2, so just clear the low bits.
   4502   IdxVal &= ~(ElemsPerChunk - 1);
   4503 
   4504   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
   4505   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
   4506 }
   4507 
   4508 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
   4509 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
   4510 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
   4511 /// simple superregister reference.  Idx is an index in the 128 bits
   4512 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
   4513 /// lowering INSERT_VECTOR_ELT operations easier.
   4514 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   4515                                   SelectionDAG &DAG, SDLoc dl) {
   4516   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
   4517 
   4518   // For insertion into the zero index (low half) of a 256-bit vector, it is
   4519   // more efficient to generate a blend with immediate instead of an insert*128.
   4520   // We are still creating an INSERT_SUBVECTOR below with an undef node to
   4521   // extend the subvector to the size of the result vector. Make sure that
   4522   // we are not recursing on that node by checking for undef here.
   4523   if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
   4524       Result.getOpcode() != ISD::UNDEF) {
   4525     EVT ResultVT = Result.getValueType();
   4526     SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
   4527     SDValue Undef = DAG.getUNDEF(ResultVT);
   4528     SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
   4529                                  Vec, ZeroIndex);
   4530 
   4531     // The blend instruction, and therefore its mask, depend on the data type.
   4532     MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
   4533     if (ScalarType.isFloatingPoint()) {
   4534       // Choose either vblendps (float) or vblendpd (double).
   4535       unsigned ScalarSize = ScalarType.getSizeInBits();
   4536       assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
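               // Set bits in the blend immediate select elements from the second source
               // (Vec256 here): 0x03 is the low 2 of 4 doubles, 0x0f the low 4 of 8
               // floats, i.e. the low 128 bits in either case.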
   4537       unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
   4538       SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
   4539       return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
   4540     }
   4541 
   4542     const X86Subtarget &Subtarget =
   4543     static_cast<const X86Subtarget &>(DAG.getSubtarget());
   4544 
   4545     // AVX2 is needed for 256-bit integer blend support.
   4546     // Integers must be cast to 32-bit because there is only vpblendd;
   4547     // vpblendw can't be used for this because it has a handicapped mask.
   4548 
   4549     // If we don't have AVX2, then cast to float. Using a wrong domain blend
   4550     // is still more efficient than using the wrong domain vinsertf128 that
   4551     // will be created by InsertSubVector().
   4552     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
   4553 
   4554     SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
   4555     Vec256 = DAG.getBitcast(CastVT, Vec256);
   4556     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
   4557     return DAG.getBitcast(ResultVT, Vec256);
   4558   }
   4559 
   4560   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
   4561 }
   4562 
   4563 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   4564                                   SelectionDAG &DAG, SDLoc dl) {
   4565   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
   4566   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
   4567 }
   4568 
   4569 /// Insert i1-subvector to i1-vector.
   4570 static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
   4571 
   4572   SDLoc dl(Op);
   4573   SDValue Vec = Op.getOperand(0);
   4574   SDValue SubVec = Op.getOperand(1);
   4575   SDValue Idx = Op.getOperand(2);
   4576 
   4577   if (!isa<ConstantSDNode>(Idx))
   4578     return SDValue();
   4579 
   4580   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   4581   if (IdxVal == 0  && Vec.isUndef()) // the operation is legal
   4582     return Op;
   4583 
   4584   MVT OpVT = Op.getSimpleValueType();
   4585   MVT SubVecVT = SubVec.getSimpleValueType();
   4586   unsigned NumElems = OpVT.getVectorNumElements();
   4587   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
   4588 
   4589   assert(IdxVal + SubVecNumElems <= NumElems &&
   4590          IdxVal % SubVecVT.getSizeInBits() == 0 &&
   4591          "Unexpected index value in INSERT_SUBVECTOR");
   4592 
   4593   // There are 3 possible cases:
   4594   // 1. Subvector should be inserted in the lower part (IdxVal == 0)
   4595   // 2. Subvector should be inserted in the upper part
   4596   //    (IdxVal + SubVecNumElems == NumElems)
   4597   // 3. Subvector should be inserted in the middle (for example v2i1
   4598   //    to v16i1, index 2)
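           // In the shift-based cases below, the subvector is first widened to OpVT in
           // the low elements and then moved into place with VSHLI/VSRLI nodes on the
           // mask vector. E.g. inserting v8i1 into the upper half of v16i1 shifts the
           // widened subvector left by 8, clears the top 8 bits of Vec with a shift
           // pair, and ORs the two together; the middle case uses a vector shuffle.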
   4599 
   4600   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
   4601   SDValue Undef = DAG.getUNDEF(OpVT);
   4602   SDValue WideSubVec =
   4603     DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx);
   4604   if (Vec.isUndef())
   4605     return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
   4606       DAG.getConstant(IdxVal, dl, MVT::i8));
   4607 
   4608   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
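             // Vec is known to be all zeros, so only the subvector bits matter: shift
             // the widened subvector all the way up to clear the garbage above it, then
             // shift back down so its low element lands at IdxVal.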
   4609     unsigned ShiftLeft = NumElems - SubVecNumElems;
   4610     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
   4611     WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
   4612       DAG.getConstant(ShiftLeft, dl, MVT::i8));
   4613     return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec,
   4614       DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec;
   4615   }
   4616 
   4617   if (IdxVal == 0) {
   4618     // Zero lower bits of the Vec
   4619     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
   4620     Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
   4621     Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
   4622     // Merge them together
   4623     return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
   4624   }
   4625 
   4626   // Simple case when we put subvector in the upper part
   4627   if (IdxVal + SubVecNumElems == NumElems) {
    4628     // Shift the subvector into place and zero the upper bits of Vec.
    4629     WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
   4630                         DAG.getConstant(IdxVal, dl, MVT::i8));
   4631     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
   4632     Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
   4633     Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
   4634     return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
   4635   }
   4636   // Subvector should be inserted in the middle - use shuffle
   4637   SmallVector<int, 64> Mask;
   4638   for (unsigned i = 0; i < NumElems; ++i)
   4639     Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
   4640                     i : i + NumElems);
   4641   return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
   4642 }
   4643 
   4644 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
   4645 /// instructions. This is used because creating CONCAT_VECTOR nodes of
   4646 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
   4647 /// large BUILD_VECTORS.
   4648 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
   4649                                    unsigned NumElems, SelectionDAG &DAG,
   4650                                    SDLoc dl) {
   4651   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
   4652   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
   4653 }
   4654 
   4655 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
   4656                                    unsigned NumElems, SelectionDAG &DAG,
   4657                                    SDLoc dl) {
   4658   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
   4659   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
   4660 }
   4661 
   4662 /// Returns a vector of specified type with all bits set.
   4663 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
    4664 /// no AVX2 support, use two <4 x i32> vectors inserted into an <8 x i32>.
   4665 /// Then bitcast to their original type, ensuring they get CSE'd.
   4666 static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget,
   4667                              SelectionDAG &DAG, SDLoc dl) {
   4668   assert(VT.isVector() && "Expected a vector type");
   4669 
   4670   SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32);
   4671   SDValue Vec;
   4672   if (VT.is512BitVector()) {
   4673     SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
   4674                       Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4675     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
   4676   } else if (VT.is256BitVector()) {
   4677     if (Subtarget->hasInt256()) { // AVX2
   4678       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4679       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
   4680     } else { // AVX
   4681       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4682       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
   4683     }
   4684   } else if (VT.is128BitVector()) {
   4685     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4686   } else
   4687     llvm_unreachable("Unexpected vector type");
   4688 
   4689   return DAG.getBitcast(VT, Vec);
   4690 }
   4691 
   4692 /// Returns a vector_shuffle node for an unpackl operation.
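         /// E.g. for v4i32 this builds the interleaving mask <0, 4, 1, 5>, taking the
         /// low halves of V1 and V2.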
   4693 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
   4694                           SDValue V2) {
   4695   unsigned NumElems = VT.getVectorNumElements();
   4696   SmallVector<int, 8> Mask;
   4697   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
   4698     Mask.push_back(i);
   4699     Mask.push_back(i + NumElems);
   4700   }
   4701   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4702 }
   4703 
   4704 /// Returns a vector_shuffle node for an unpackh operation.
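         /// E.g. for v4i32 this builds the interleaving mask <2, 6, 3, 7>, taking the
         /// high halves of V1 and V2.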
   4705 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
   4706                           SDValue V2) {
   4707   unsigned NumElems = VT.getVectorNumElements();
   4708   SmallVector<int, 8> Mask;
   4709   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
   4710     Mask.push_back(i + Half);
   4711     Mask.push_back(i + NumElems + Half);
   4712   }
   4713   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4714 }
   4715 
    4716 /// Return a vector_shuffle of the given vector and a zero or undef vector.
   4717 /// This produces a shuffle where the low element of V2 is swizzled into the
   4718 /// zero/undef vector, landing at element Idx.
   4719 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
   4720 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
   4721                                            bool IsZero,
   4722                                            const X86Subtarget *Subtarget,
   4723                                            SelectionDAG &DAG) {
   4724   MVT VT = V2.getSimpleValueType();
   4725   SDValue V1 = IsZero
   4726     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   4727   unsigned NumElems = VT.getVectorNumElements();
   4728   SmallVector<int, 16> MaskVec;
   4729   for (unsigned i = 0; i != NumElems; ++i)
   4730     // If this is the insertion idx, put the low elt of V2 here.
   4731     MaskVec.push_back(i == Idx ? NumElems : i);
   4732   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
   4733 }
   4734 
   4735 /// Calculates the shuffle mask corresponding to the target-specific opcode.
   4736 /// Returns true if the Mask could be calculated. Sets IsUnary to true if only
   4737 /// uses one source. Note that this will set IsUnary for shuffles which use a
   4738 /// single input multiple times, and in those cases it will
   4739 /// adjust the mask to only have indices within that single input.
   4740 /// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
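         /// E.g. an UNPCKL whose two operands are the same node decodes to <0, 4, 1, 5>
         /// for v4i32 and is remapped below to the unary mask <0, 0, 1, 1>.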
   4741 static bool getTargetShuffleMask(SDNode *N, MVT VT,
   4742                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   4743   unsigned NumElems = VT.getVectorNumElements();
   4744   SDValue ImmN;
   4745 
   4746   IsUnary = false;
   4747   bool IsFakeUnary = false;
   4748   switch(N->getOpcode()) {
   4749   case X86ISD::BLENDI:
   4750     ImmN = N->getOperand(N->getNumOperands()-1);
   4751     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4752     break;
   4753   case X86ISD::SHUFP:
   4754     ImmN = N->getOperand(N->getNumOperands()-1);
   4755     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4756     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4757     break;
   4758   case X86ISD::UNPCKH:
   4759     DecodeUNPCKHMask(VT, Mask);
   4760     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4761     break;
   4762   case X86ISD::UNPCKL:
   4763     DecodeUNPCKLMask(VT, Mask);
   4764     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4765     break;
   4766   case X86ISD::MOVHLPS:
   4767     DecodeMOVHLPSMask(NumElems, Mask);
   4768     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4769     break;
   4770   case X86ISD::MOVLHPS:
   4771     DecodeMOVLHPSMask(NumElems, Mask);
   4772     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4773     break;
   4774   case X86ISD::PALIGNR:
   4775     ImmN = N->getOperand(N->getNumOperands()-1);
   4776     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4777     break;
   4778   case X86ISD::PSHUFD:
   4779   case X86ISD::VPERMILPI:
   4780     ImmN = N->getOperand(N->getNumOperands()-1);
   4781     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4782     IsUnary = true;
   4783     break;
   4784   case X86ISD::PSHUFHW:
   4785     ImmN = N->getOperand(N->getNumOperands()-1);
   4786     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4787     IsUnary = true;
   4788     break;
   4789   case X86ISD::PSHUFLW:
   4790     ImmN = N->getOperand(N->getNumOperands()-1);
   4791     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4792     IsUnary = true;
   4793     break;
   4794   case X86ISD::PSHUFB: {
   4795     IsUnary = true;
   4796     SDValue MaskNode = N->getOperand(1);
   4797     while (MaskNode->getOpcode() == ISD::BITCAST)
   4798       MaskNode = MaskNode->getOperand(0);
   4799 
   4800     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
   4801       // If we have a build-vector, then things are easy.
   4802       MVT VT = MaskNode.getSimpleValueType();
   4803       assert(VT.isVector() &&
   4804              "Can't produce a non-vector with a build_vector!");
   4805       if (!VT.isInteger())
   4806         return false;
   4807 
   4808       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
   4809 
   4810       SmallVector<uint64_t, 32> RawMask;
   4811       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
   4812         SDValue Op = MaskNode->getOperand(i);
   4813         if (Op->getOpcode() == ISD::UNDEF) {
   4814           RawMask.push_back((uint64_t)SM_SentinelUndef);
   4815           continue;
   4816         }
   4817         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
   4818         if (!CN)
   4819           return false;
   4820         APInt MaskElement = CN->getAPIntValue();
   4821 
   4822         // We now have to decode the element which could be any integer size and
   4823         // extract each byte of it.
   4824         for (int j = 0; j < NumBytesPerElement; ++j) {
   4825           // Note that this is x86 and so always little endian: the low byte is
   4826           // the first byte of the mask.
   4827           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
   4828           MaskElement = MaskElement.lshr(8);
   4829         }
   4830       }
   4831       DecodePSHUFBMask(RawMask, Mask);
   4832       break;
   4833     }
   4834 
   4835     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
   4836     if (!MaskLoad)
   4837       return false;
   4838 
   4839     SDValue Ptr = MaskLoad->getBasePtr();
   4840     if (Ptr->getOpcode() == X86ISD::Wrapper ||
   4841         Ptr->getOpcode() == X86ISD::WrapperRIP)
   4842       Ptr = Ptr->getOperand(0);
   4843 
   4844     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
   4845     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
   4846       return false;
   4847 
   4848     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
   4849       DecodePSHUFBMask(C, Mask);
   4850       if (Mask.empty())
   4851         return false;
   4852       break;
   4853     }
   4854 
   4855     return false;
   4856   }
   4857   case X86ISD::VPERMI:
   4858     ImmN = N->getOperand(N->getNumOperands()-1);
   4859     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4860     IsUnary = true;
   4861     break;
   4862   case X86ISD::MOVSS:
   4863   case X86ISD::MOVSD:
   4864     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
   4865     break;
   4866   case X86ISD::VPERM2X128:
   4867     ImmN = N->getOperand(N->getNumOperands()-1);
   4868     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4869     if (Mask.empty()) return false;
    4870     // The mask only contains a negative index if an element is zeroed.
   4871     if (std::any_of(Mask.begin(), Mask.end(),
   4872                     [](int M){ return M == SM_SentinelZero; }))
   4873       return false;
   4874     break;
   4875   case X86ISD::MOVSLDUP:
   4876     DecodeMOVSLDUPMask(VT, Mask);
   4877     IsUnary = true;
   4878     break;
   4879   case X86ISD::MOVSHDUP:
   4880     DecodeMOVSHDUPMask(VT, Mask);
   4881     IsUnary = true;
   4882     break;
   4883   case X86ISD::MOVDDUP:
   4884     DecodeMOVDDUPMask(VT, Mask);
   4885     IsUnary = true;
   4886     break;
   4887   case X86ISD::MOVLHPD:
   4888   case X86ISD::MOVLPD:
   4889   case X86ISD::MOVLPS:
   4890     // Not yet implemented
   4891     return false;
   4892   case X86ISD::VPERMV: {
   4893     IsUnary = true;
   4894     SDValue MaskNode = N->getOperand(0);
   4895     while (MaskNode->getOpcode() == ISD::BITCAST)
   4896       MaskNode = MaskNode->getOperand(0);
   4897 
   4898     unsigned MaskLoBits = Log2_64(VT.getVectorNumElements());
   4899     SmallVector<uint64_t, 32> RawMask;
   4900     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
   4901       // If we have a build-vector, then things are easy.
   4902       assert(MaskNode.getSimpleValueType().isInteger() &&
   4903              MaskNode.getSimpleValueType().getVectorNumElements() ==
   4904              VT.getVectorNumElements());
   4905 
   4906       for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
   4907         SDValue Op = MaskNode->getOperand(i);
   4908         if (Op->getOpcode() == ISD::UNDEF)
   4909           RawMask.push_back((uint64_t)SM_SentinelUndef);
   4910         else if (isa<ConstantSDNode>(Op)) {
   4911           APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue();
   4912           RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
   4913         } else
   4914           return false;
   4915       }
   4916       DecodeVPERMVMask(RawMask, Mask);
   4917       break;
   4918     }
   4919     if (MaskNode->getOpcode() == X86ISD::VBROADCAST) {
   4920       unsigned NumEltsInMask = MaskNode->getNumOperands();
   4921       MaskNode = MaskNode->getOperand(0);
   4922       auto *CN = dyn_cast<ConstantSDNode>(MaskNode);
   4923       if (CN) {
   4924         APInt MaskEltValue = CN->getAPIntValue();
   4925         for (unsigned i = 0; i < NumEltsInMask; ++i)
   4926           RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue());
   4927         DecodeVPERMVMask(RawMask, Mask);
   4928         break;
   4929       }
   4930       // It may be a scalar load
   4931     }
   4932 
   4933     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
   4934     if (!MaskLoad)
   4935       return false;
   4936 
   4937     SDValue Ptr = MaskLoad->getBasePtr();
   4938     if (Ptr->getOpcode() == X86ISD::Wrapper ||
   4939         Ptr->getOpcode() == X86ISD::WrapperRIP)
   4940       Ptr = Ptr->getOperand(0);
   4941 
   4942     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
   4943     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
   4944       return false;
   4945 
   4946     auto *C = dyn_cast<Constant>(MaskCP->getConstVal());
   4947     if (C) {
   4948       DecodeVPERMVMask(C, VT, Mask);
   4949       if (Mask.empty())
   4950         return false;
   4951       break;
   4952     }
   4953     return false;
   4954   }
   4955   case X86ISD::VPERMV3: {
   4956     IsUnary = false;
   4957     SDValue MaskNode = N->getOperand(1);
   4958     while (MaskNode->getOpcode() == ISD::BITCAST)
    4959       MaskNode = MaskNode->getOperand(0);
   4960 
   4961     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
   4962       // If we have a build-vector, then things are easy.
   4963       assert(MaskNode.getSimpleValueType().isInteger() &&
   4964              MaskNode.getSimpleValueType().getVectorNumElements() ==
   4965              VT.getVectorNumElements());
   4966 
   4967       SmallVector<uint64_t, 32> RawMask;
   4968       unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2);
   4969 
   4970       for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
   4971         SDValue Op = MaskNode->getOperand(i);
   4972         if (Op->getOpcode() == ISD::UNDEF)
   4973           RawMask.push_back((uint64_t)SM_SentinelUndef);
   4974         else {
   4975           auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
   4976           if (!CN)
   4977             return false;
   4978           APInt MaskElement = CN->getAPIntValue();
   4979           RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
   4980         }
   4981       }
   4982       DecodeVPERMV3Mask(RawMask, Mask);
   4983       break;
   4984     }
   4985 
   4986     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
   4987     if (!MaskLoad)
   4988       return false;
   4989 
   4990     SDValue Ptr = MaskLoad->getBasePtr();
   4991     if (Ptr->getOpcode() == X86ISD::Wrapper ||
   4992         Ptr->getOpcode() == X86ISD::WrapperRIP)
   4993       Ptr = Ptr->getOperand(0);
   4994 
   4995     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
   4996     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
   4997       return false;
   4998 
   4999     auto *C = dyn_cast<Constant>(MaskCP->getConstVal());
   5000     if (C) {
   5001       DecodeVPERMV3Mask(C, VT, Mask);
   5002       if (Mask.empty())
   5003         return false;
   5004       break;
   5005     }
   5006     return false;
   5007   }
   5008   default: llvm_unreachable("unknown target shuffle node");
   5009   }
   5010 
   5011   // If we have a fake unary shuffle, the shuffle mask is spread across two
   5012   // inputs that are actually the same node. Re-map the mask to always point
   5013   // into the first input.
   5014   if (IsFakeUnary)
   5015     for (int &M : Mask)
   5016       if (M >= (int)Mask.size())
   5017         M -= Mask.size();
   5018 
   5019   return true;
   5020 }
   5021 
   5022 /// Returns the scalar element that will make up the ith
   5023 /// element of the result of the vector shuffle.
   5024 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   5025                                    unsigned Depth) {
   5026   if (Depth == 6)
   5027     return SDValue();  // Limit search depth.
   5028 
   5029   SDValue V = SDValue(N, 0);
   5030   EVT VT = V.getValueType();
   5031   unsigned Opcode = V.getOpcode();
   5032 
   5033   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
   5034   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
   5035     int Elt = SV->getMaskElt(Index);
   5036 
   5037     if (Elt < 0)
   5038       return DAG.getUNDEF(VT.getVectorElementType());
   5039 
   5040     unsigned NumElems = VT.getVectorNumElements();
   5041     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
   5042                                          : SV->getOperand(1);
   5043     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   5044   }
   5045 
   5046   // Recurse into target specific vector shuffles to find scalars.
   5047   if (isTargetShuffle(Opcode)) {
   5048     MVT ShufVT = V.getSimpleValueType();
   5049     unsigned NumElems = ShufVT.getVectorNumElements();
   5050     SmallVector<int, 16> ShuffleMask;
   5051     bool IsUnary;
   5052 
   5053     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
   5054       return SDValue();
   5055 
   5056     int Elt = ShuffleMask[Index];
   5057     if (Elt < 0)
   5058       return DAG.getUNDEF(ShufVT.getVectorElementType());
   5059 
   5060     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
   5061                                          : N->getOperand(1);
   5062     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
   5063                                Depth+1);
   5064   }
   5065 
   5066   // Actual nodes that may contain scalar elements
   5067   if (Opcode == ISD::BITCAST) {
   5068     V = V.getOperand(0);
   5069     EVT SrcVT = V.getValueType();
   5070     unsigned NumElems = VT.getVectorNumElements();
   5071 
   5072     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
   5073       return SDValue();
   5074   }
   5075 
   5076   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   5077     return (Index == 0) ? V.getOperand(0)
   5078                         : DAG.getUNDEF(VT.getVectorElementType());
   5079 
   5080   if (V.getOpcode() == ISD::BUILD_VECTOR)
   5081     return V.getOperand(Index);
   5082 
   5083   return SDValue();
   5084 }
   5085 
   5086 /// Custom lower build_vector of v16i8.
   5087 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   5088                                        unsigned NumNonZero, unsigned NumZero,
   5089                                        SelectionDAG &DAG,
   5090                                        const X86Subtarget* Subtarget,
   5091                                        const TargetLowering &TLI) {
   5092   if (NumNonZero > 8)
   5093     return SDValue();
   5094 
   5095   SDLoc dl(Op);
   5096   SDValue V;
   5097   bool First = true;
   5098 
   5099   // SSE4.1 - use PINSRB to insert each byte directly.
   5100   if (Subtarget->hasSSE41()) {
   5101     for (unsigned i = 0; i < 16; ++i) {
   5102       bool isNonZero = (NonZeros & (1 << i)) != 0;
   5103       if (isNonZero) {
   5104         if (First) {
   5105           if (NumZero)
   5106             V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
   5107           else
   5108             V = DAG.getUNDEF(MVT::v16i8);
   5109           First = false;
   5110         }
   5111         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   5112                         MVT::v16i8, V, Op.getOperand(i),
   5113                         DAG.getIntPtrConstant(i, dl));
   5114       }
   5115     }
   5116 
   5117     return V;
   5118   }
   5119 
   5120   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
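           // Bytes 2*i and 2*i+1 are zero-extended to i16 and combined as
           // (byte(2*i+1) << 8) | byte(2*i), then inserted as word i of a v8i16 that is
           // bitcast back to v16i8 at the end.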
   5121   for (unsigned i = 0; i < 16; ++i) {
   5122     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
   5123     if (ThisIsNonZero && First) {
   5124       if (NumZero)
   5125         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   5126       else
   5127         V = DAG.getUNDEF(MVT::v8i16);
   5128       First = false;
   5129     }
   5130 
   5131     if ((i & 1) != 0) {
   5132       SDValue ThisElt, LastElt;
   5133       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
   5134       if (LastIsNonZero) {
   5135         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
   5136                               MVT::i16, Op.getOperand(i-1));
   5137       }
   5138       if (ThisIsNonZero) {
   5139         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
   5140         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
   5141                               ThisElt, DAG.getConstant(8, dl, MVT::i8));
   5142         if (LastIsNonZero)
   5143           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
   5144       } else
   5145         ThisElt = LastElt;
   5146 
   5147       if (ThisElt.getNode())
   5148         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
   5149                         DAG.getIntPtrConstant(i/2, dl));
   5150     }
   5151   }
   5152 
   5153   return DAG.getBitcast(MVT::v16i8, V);
   5154 }
   5155 
   5156 /// Custom lower build_vector of v8i16.
   5157 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   5158                                      unsigned NumNonZero, unsigned NumZero,
   5159                                      SelectionDAG &DAG,
   5160                                      const X86Subtarget* Subtarget,
   5161                                      const TargetLowering &TLI) {
   5162   if (NumNonZero > 4)
   5163     return SDValue();
   5164 
   5165   SDLoc dl(Op);
   5166   SDValue V;
   5167   bool First = true;
   5168   for (unsigned i = 0; i < 8; ++i) {
   5169     bool isNonZero = (NonZeros & (1 << i)) != 0;
   5170     if (isNonZero) {
   5171       if (First) {
   5172         if (NumZero)
   5173           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   5174         else
   5175           V = DAG.getUNDEF(MVT::v8i16);
   5176         First = false;
   5177       }
   5178       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   5179                       MVT::v8i16, V, Op.getOperand(i),
   5180                       DAG.getIntPtrConstant(i, dl));
   5181     }
   5182   }
   5183 
   5184   return V;
   5185 }
   5186 
   5187 /// Custom lower build_vector of v4i32 or v4f32.
   5188 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
   5189                                      const X86Subtarget *Subtarget,
   5190                                      const TargetLowering &TLI) {
   5191   // Find all zeroable elements.
   5192   std::bitset<4> Zeroable;
   5193   for (int i=0; i < 4; ++i) {
   5194     SDValue Elt = Op->getOperand(i);
   5195     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
   5196   }
   5197   assert(Zeroable.size() - Zeroable.count() > 1 &&
   5198          "We expect at least two non-zero elements!");
   5199 
   5200   // We only know how to deal with build_vector nodes where elements are either
   5201   // zeroable or extract_vector_elt with constant index.
   5202   SDValue FirstNonZero;
   5203   unsigned FirstNonZeroIdx;
   5204   for (unsigned i=0; i < 4; ++i) {
   5205     if (Zeroable[i])
   5206       continue;
   5207     SDValue Elt = Op->getOperand(i);
   5208     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   5209         !isa<ConstantSDNode>(Elt.getOperand(1)))
   5210       return SDValue();
   5211     // Make sure that this node is extracting from a 128-bit vector.
   5212     MVT VT = Elt.getOperand(0).getSimpleValueType();
   5213     if (!VT.is128BitVector())
   5214       return SDValue();
   5215     if (!FirstNonZero.getNode()) {
   5216       FirstNonZero = Elt;
   5217       FirstNonZeroIdx = i;
   5218     }
   5219   }
   5220 
   5221   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
   5222   SDValue V1 = FirstNonZero.getOperand(0);
   5223   MVT VT = V1.getSimpleValueType();
   5224 
   5225   // See if this build_vector can be lowered as a blend with zero.
   5226   SDValue Elt;
   5227   unsigned EltMaskIdx, EltIdx;
   5228   int Mask[4];
   5229   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
   5230     if (Zeroable[EltIdx]) {
   5231       // The zero vector will be on the right hand side.
   5232       Mask[EltIdx] = EltIdx+4;
   5233       continue;
   5234     }
   5235 
   5236     Elt = Op->getOperand(EltIdx);
    5237     // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
   5238     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
   5239     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
   5240       break;
   5241     Mask[EltIdx] = EltIdx;
   5242   }
   5243 
   5244   if (EltIdx == 4) {
   5245     // Let the shuffle legalizer deal with blend operations.
   5246     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
   5247     if (V1.getSimpleValueType() != VT)
   5248       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
   5249     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
   5250   }
   5251 
    5252   // See if we can lower this build_vector to an INSERTPS.
   5253   if (!Subtarget->hasSSE41())
   5254     return SDValue();
   5255 
   5256   SDValue V2 = Elt.getOperand(0);
   5257   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
   5258     V1 = SDValue();
   5259 
   5260   bool CanFold = true;
   5261   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
   5262     if (Zeroable[i])
   5263       continue;
   5264 
   5265     SDValue Current = Op->getOperand(i);
   5266     SDValue SrcVector = Current->getOperand(0);
   5267     if (!V1.getNode())
   5268       V1 = SrcVector;
   5269     CanFold = SrcVector == V1 &&
   5270       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
   5271   }
   5272 
   5273   if (!CanFold)
   5274     return SDValue();
   5275 
   5276   assert(V1.getNode() && "Expected at least two non-zero elements!");
   5277   if (V1.getSimpleValueType() != MVT::v4f32)
   5278     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
   5279   if (V2.getSimpleValueType() != MVT::v4f32)
   5280     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
   5281 
   5282   // Ok, we can emit an INSERTPS instruction.
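           // The INSERTPS immediate encodes the source element in bits [7:6], the
           // destination slot in bits [5:4], and the zero mask in bits [3:0].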
   5283   unsigned ZMask = Zeroable.to_ulong();
   5284 
   5285   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
   5286   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
   5287   SDLoc DL(Op);
   5288   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
   5289                                DAG.getIntPtrConstant(InsertPSMask, DL));
   5290   return DAG.getBitcast(VT, Result);
   5291 }
   5292 
   5293 /// Return a vector logical shift node.
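         /// The shift is emitted as a byte shift (VSHLDQ/VSRLDQ, i.e. the PSLLDQ/PSRLDQ
         /// forms) on a v2i64 bitcast of the source, so NumBits must be a multiple of 8.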
   5294 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
   5295                          unsigned NumBits, SelectionDAG &DAG,
   5296                          const TargetLowering &TLI, SDLoc dl) {
   5297   assert(VT.is128BitVector() && "Unknown type for VShift");
   5298   MVT ShVT = MVT::v2i64;
   5299   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   5300   SrcOp = DAG.getBitcast(ShVT, SrcOp);
   5301   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
   5302   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
   5303   SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
   5304   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
   5305 }
   5306 
   5307 static SDValue
   5308 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
   5309 
   5310   // Check if the scalar load can be widened into a vector load. And if
   5311   // the address is "base + cst" see if the cst can be "absorbed" into
   5312   // the shuffle mask.
   5313   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
   5314     SDValue Ptr = LD->getBasePtr();
   5315     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
   5316       return SDValue();
   5317     EVT PVT = LD->getValueType(0);
   5318     if (PVT != MVT::i32 && PVT != MVT::f32)
   5319       return SDValue();
   5320 
   5321     int FI = -1;
   5322     int64_t Offset = 0;
   5323     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
   5324       FI = FINode->getIndex();
   5325       Offset = 0;
   5326     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
   5327                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
   5328       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
   5329       Offset = Ptr.getConstantOperandVal(1);
   5330       Ptr = Ptr.getOperand(0);
   5331     } else {
   5332       return SDValue();
   5333     }
   5334 
   5335     // FIXME: 256-bit vector instructions don't require a strict alignment,
   5336     // improve this code to support it better.
   5337     unsigned RequiredAlign = VT.getSizeInBits()/8;
   5338     SDValue Chain = LD->getChain();
   5339     // Make sure the stack object alignment is at least 16 or 32.
   5340     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   5341     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
   5342       if (MFI->isFixedObjectIndex(FI)) {
   5343         // Can't change the alignment. FIXME: It's possible to compute
   5344         // the exact stack offset and reference FI + adjust offset instead.
   5345         // If someone *really* cares about this. That's the way to implement it.
   5346         return SDValue();
   5347       } else {
   5348         MFI->setObjectAlignment(FI, RequiredAlign);
   5349       }
   5350     }
   5351 
    5352     // (Offset % 16 or 32) must be a multiple of 4. The address is then
   5353     // Ptr + (Offset & ~15).
   5354     if (Offset < 0)
   5355       return SDValue();
   5356     if ((Offset % RequiredAlign) & 3)
   5357       return SDValue();
   5358     int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
   5359     if (StartOffset) {
   5360       SDLoc DL(Ptr);
   5361       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
   5362                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
   5363     }
   5364 
   5365     int EltNo = (Offset - StartOffset) >> 2;
   5366     unsigned NumElems = VT.getVectorNumElements();
   5367 
   5368     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
   5369     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
   5370                              LD->getPointerInfo().getWithOffset(StartOffset),
   5371                              false, false, false, 0);
   5372 
   5373     SmallVector<int, 8> Mask(NumElems, EltNo);
   5374 
   5375     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
   5376   }
   5377 
   5378   return SDValue();
   5379 }
   5380 
   5381 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
   5382 /// elements can be replaced by a single large load which has the same value as
   5383 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
   5384 ///
   5385 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
   5386 ///
   5387 /// FIXME: we'd also like to handle the case where the last elements are zero
   5388 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
   5389 /// There's even a handy isZeroNode for that purpose.
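         /// For instance, a v4i32 build_vector of consecutive loads from a, a+4, a+8
         /// and a+12 becomes a single 16-byte load from a (assuming the loads pass the
         /// checks below).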
   5390 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   5391                                         SDLoc &DL, SelectionDAG &DAG,
   5392                                         bool isAfterLegalize) {
   5393   unsigned NumElems = Elts.size();
   5394 
   5395   LoadSDNode *LDBase = nullptr;
   5396   unsigned LastLoadedElt = -1U;
   5397 
   5398   // For each element in the initializer, see if we've found a load or an undef.
   5399   // If we don't find an initial load element, or later load elements are
   5400   // non-consecutive, bail out.
   5401   for (unsigned i = 0; i < NumElems; ++i) {
   5402     SDValue Elt = Elts[i];
   5403     // Look through a bitcast.
   5404     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
   5405       Elt = Elt.getOperand(0);
   5406     if (!Elt.getNode() ||
   5407         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
   5408       return SDValue();
   5409     if (!LDBase) {
   5410       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
   5411         return SDValue();
   5412       LDBase = cast<LoadSDNode>(Elt.getNode());
   5413       LastLoadedElt = i;
   5414       continue;
   5415     }
   5416     if (Elt.getOpcode() == ISD::UNDEF)
   5417       continue;
   5418 
   5419     LoadSDNode *LD = cast<LoadSDNode>(Elt);
   5420     EVT LdVT = Elt.getValueType();
   5421     // Each loaded element must be the correct fractional portion of the
   5422     // requested vector load.
   5423     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
   5424       return SDValue();
   5425     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
   5426       return SDValue();
   5427     LastLoadedElt = i;
   5428   }
   5429 
   5430   // If we have found an entire vector of loads and undefs, then return a large
   5431   // load of the entire vector width starting at the base pointer.  If we found
   5432   // consecutive loads for the low half, generate a vzext_load node.
   5433   if (LastLoadedElt == NumElems - 1) {
   5434     assert(LDBase && "Did not find base load for merging consecutive loads");
   5435     EVT EltVT = LDBase->getValueType(0);
   5436     // Ensure that the input vector size for the merged loads matches the
   5437     // cumulative size of the input elements.
   5438     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
   5439       return SDValue();
   5440 
   5441     if (isAfterLegalize &&
   5442         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
   5443       return SDValue();
   5444 
   5445     SDValue NewLd = SDValue();
   5446 
   5447     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   5448                         LDBase->getPointerInfo(), LDBase->isVolatile(),
   5449                         LDBase->isNonTemporal(), LDBase->isInvariant(),
   5450                         LDBase->getAlignment());
   5451 
   5452     if (LDBase->hasAnyUseOfValue(1)) {
   5453       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   5454                                      SDValue(LDBase, 1),
   5455                                      SDValue(NewLd.getNode(), 1));
   5456       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
   5457       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
   5458                              SDValue(NewLd.getNode(), 1));
   5459     }
   5460 
   5461     return NewLd;
   5462   }
   5463 
    5464   // TODO: The code below fires only for loading the low v2i32 / v2f32
    5465   // of a v4i32 / v4f32. It's probably worth generalizing.
   5466   EVT EltVT = VT.getVectorElementType();
   5467   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
   5468       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
   5469     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
   5470     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
   5471     SDValue ResNode =
   5472         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
   5473                                 LDBase->getPointerInfo(),
   5474                                 LDBase->getAlignment(),
   5475                                 false/*isVolatile*/, true/*ReadMem*/,
   5476                                 false/*WriteMem*/);
   5477 
   5478     // Make sure the newly-created LOAD is in the same position as LDBase in
   5479     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
   5480     // update uses of LDBase's output chain to use the TokenFactor.
   5481     if (LDBase->hasAnyUseOfValue(1)) {
   5482       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   5483                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
   5484       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
   5485       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
   5486                              SDValue(ResNode.getNode(), 1));
   5487     }
   5488 
   5489     return DAG.getBitcast(VT, ResNode);
   5490   }
   5491   return SDValue();
   5492 }
   5493 
   5494 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
   5495 /// to generate a splat value for the following cases:
   5496 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
   5497 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
   5498 /// a scalar load, or a constant.
   5499 /// The VBROADCAST node is returned when a pattern is found,
   5500 /// or SDValue() otherwise.
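         /// E.g. a v8f32 BUILD_VECTOR splat of one scalar load is turned into
         /// (v8f32 X86ISD::VBROADCAST (load)), which typically selects to vbroadcastss.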
   5501 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
   5502                                     SelectionDAG &DAG) {
   5503   // VBROADCAST requires AVX.
   5504   // TODO: Splats could be generated for non-AVX CPUs using SSE
   5505   // instructions, but there's less potential gain for only 128-bit vectors.
   5506   if (!Subtarget->hasAVX())
   5507     return SDValue();
   5508 
   5509   MVT VT = Op.getSimpleValueType();
   5510   SDLoc dl(Op);
   5511 
   5512   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
   5513          "Unsupported vector type for broadcast.");
   5514 
   5515   SDValue Ld;
   5516   bool ConstSplatVal;
   5517 
   5518   switch (Op.getOpcode()) {
   5519     default:
   5520       // Unknown pattern found.
   5521       return SDValue();
   5522 
   5523     case ISD::BUILD_VECTOR: {
   5524       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
   5525       BitVector UndefElements;
   5526       SDValue Splat = BVOp->getSplatValue(&UndefElements);
   5527 
   5528       // We need a splat of a single value to use broadcast, and it doesn't
   5529       // make any sense if the value is only in one element of the vector.
   5530       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
   5531         return SDValue();
   5532 
   5533       Ld = Splat;
   5534       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   5535                        Ld.getOpcode() == ISD::ConstantFP);
   5536 
   5537       // Make sure that all of the users of a non-constant load are from the
   5538       // BUILD_VECTOR node.
   5539       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
   5540         return SDValue();
   5541       break;
   5542     }
   5543 
   5544     case ISD::VECTOR_SHUFFLE: {
   5545       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   5546 
   5547       // Shuffles must have a splat mask where the first element is
   5548       // broadcasted.
   5549       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
   5550         return SDValue();
   5551 
   5552       SDValue Sc = Op.getOperand(0);
   5553       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
   5554           Sc.getOpcode() != ISD::BUILD_VECTOR) {
   5555 
   5556         if (!Subtarget->hasInt256())
   5557           return SDValue();
   5558 
   5559         // Use the register form of the broadcast instruction available on AVX2.
   5560         if (VT.getSizeInBits() >= 256)
   5561           Sc = Extract128BitVector(Sc, 0, DAG, dl);
   5562         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
   5563       }
   5564 
   5565       Ld = Sc.getOperand(0);
   5566       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   5567                        Ld.getOpcode() == ISD::ConstantFP);
   5568 
   5569       // The scalar_to_vector node and the suspected
   5570       // load node must have exactly one user.
   5571       // Constants may have multiple users.
   5572 
    5573       // AVX-512 has a register version of the broadcast.
   5574       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
   5575         Ld.getValueType().getSizeInBits() >= 32;
   5576       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
   5577           !hasRegVer))
   5578         return SDValue();
   5579       break;
   5580     }
   5581   }
   5582 
   5583   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   5584   bool IsGE256 = (VT.getSizeInBits() >= 256);
   5585 
   5586   // When optimizing for size, generate up to 5 extra bytes for a broadcast
   5587   // instruction to save 8 or more bytes of constant pool data.
   5588   // TODO: If multiple splats are generated to load the same constant,
   5589   // it may be detrimental to overall size. There needs to be a way to detect
   5590   // that condition to know if this is truly a size win.
   5591   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
   5592 
   5593   // Handle broadcasting a single constant scalar from the constant pool
   5594   // into a vector.
   5595   // On Sandybridge (no AVX2), it is still better to load a constant vector
   5596   // from the constant pool and not to broadcast it from a scalar.
   5597   // But override that restriction when optimizing for size.
   5598   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
   5599   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
   5600     EVT CVT = Ld.getValueType();
   5601     assert(!CVT.isVector() && "Must not broadcast a vector type");
   5602 
   5603     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
   5604     // For size optimization, also splat v2f64 and v2i64, and for size opt
   5605     // with AVX2, also splat i8 and i16.
   5606     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
   5607     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
   5608         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
   5609       const Constant *C = nullptr;
   5610       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
   5611         C = CI->getConstantIntValue();
   5612       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
   5613         C = CF->getConstantFPValue();
   5614 
   5615       assert(C && "Invalid constant type");
   5616 
   5617       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   5618       SDValue CP =
   5619           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
   5620       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   5621       Ld = DAG.getLoad(
   5622           CVT, dl, DAG.getEntryNode(), CP,
   5623           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
   5624           false, false, Alignment);
   5625 
   5626       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5627     }
   5628   }
   5629 
   5630   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
   5631 
   5632   // Handle AVX2 in-register broadcasts.
   5633   if (!IsLoad && Subtarget->hasInt256() &&
   5634       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
   5635     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5636 
   5637   // The scalar source must be a normal load.
   5638   if (!IsLoad)
   5639     return SDValue();
   5640 
   5641   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
   5642       (Subtarget->hasVLX() && ScalarSize == 64))
   5643     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5644 
    5645   // The integer check is needed for the 64-bit into 128-bit case, so that it
    5646   // doesn't match double, since there is no vbroadcastsd xmm instruction.
   5647   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
   5648     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
   5649       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5650   }
   5651 
   5652   // Unsupported broadcast.
   5653   return SDValue();
   5654 }
   5655 
   5656 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
   5657 /// underlying vector and index.
   5658 ///
   5659 /// Modifies \p ExtractedFromVec to the real vector and returns the real
   5660 /// index.
   5661 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
   5662                                          SDValue ExtIdx) {
   5663   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
   5664   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
   5665     return Idx;
   5666 
   5667   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
   5668   // lowered this:
   5669   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
   5670   // to:
   5671   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
   5672   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
   5673   //                           undef)
   5674   //                       Constant<0>)
   5675   // In this case the vector is the extract_subvector expression and the index
   5676   // is 2, as specified by the shuffle.
   5677   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
   5678   SDValue ShuffleVec = SVOp->getOperand(0);
   5679   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
   5680   assert(ShuffleVecVT.getVectorElementType() ==
   5681          ExtractedFromVec.getSimpleValueType().getVectorElementType());
   5682 
   5683   int ShuffleIdx = SVOp->getMaskElt(Idx);
   5684   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
   5685     ExtractedFromVec = ShuffleVec;
   5686     return ShuffleIdx;
   5687   }
   5688   return Idx;
   5689 }
   5690 
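        /// Try to lower a BUILD_VECTOR whose operands are mostly EXTRACT_VECTOR_ELTs
        /// from one or two source vectors as a vector_shuffle plus at most two
        /// INSERT_VECTOR_ELT nodes for the remaining operands.
        ///
        /// Illustrative example: the v4i32 build_vector
        ///   ((extract_vector_elt V, 0), (extract_vector_elt V, 1), X,
        ///    (extract_vector_elt V, 3))
        /// becomes a shuffle of V with mask <0, 1, u, 3> followed by an
        /// insert_vector_elt of X at index 2.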
   5691 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
   5692   MVT VT = Op.getSimpleValueType();
   5693 
   5694   // Skip if insert_vec_elt is not supported.
   5695   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   5696   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
   5697     return SDValue();
   5698 
   5699   SDLoc DL(Op);
   5700   unsigned NumElems = Op.getNumOperands();
   5701 
   5702   SDValue VecIn1;
   5703   SDValue VecIn2;
   5704   SmallVector<unsigned, 4> InsertIndices;
   5705   SmallVector<int, 8> Mask(NumElems, -1);
   5706 
   5707   for (unsigned i = 0; i != NumElems; ++i) {
   5708     unsigned Opc = Op.getOperand(i).getOpcode();
   5709 
   5710     if (Opc == ISD::UNDEF)
   5711       continue;
   5712 
   5713     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
    5714       // Quit if more than 1 element needs inserting.
   5715       if (InsertIndices.size() > 1)
   5716         return SDValue();
   5717 
   5718       InsertIndices.push_back(i);
   5719       continue;
   5720     }
   5721 
   5722     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
   5723     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
   5724     // Quit if non-constant index.
   5725     if (!isa<ConstantSDNode>(ExtIdx))
   5726       return SDValue();
   5727     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
   5728 
   5729     // Quit if extracted from vector of different type.
   5730     if (ExtractedFromVec.getValueType() != VT)
   5731       return SDValue();
   5732 
   5733     if (!VecIn1.getNode())
   5734       VecIn1 = ExtractedFromVec;
   5735     else if (VecIn1 != ExtractedFromVec) {
   5736       if (!VecIn2.getNode())
   5737         VecIn2 = ExtractedFromVec;
   5738       else if (VecIn2 != ExtractedFromVec)
    5739         // Quit if there are more than 2 vectors to shuffle.
   5740         return SDValue();
   5741     }
   5742 
   5743     if (ExtractedFromVec == VecIn1)
   5744       Mask[i] = Idx;
   5745     else if (ExtractedFromVec == VecIn2)
   5746       Mask[i] = Idx + NumElems;
   5747   }
   5748 
   5749   if (!VecIn1.getNode())
   5750     return SDValue();
   5751 
   5752   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
   5753   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
   5754   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
   5755     unsigned Idx = InsertIndices[i];
   5756     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
   5757                      DAG.getIntPtrConstant(Idx, DL));
   5758   }
   5759 
   5760   return NV;
   5761 }
   5762 
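        // Pack a build_vector of constant i1 elements into an integer immediate,
        // with bit i of the result holding element i of the vector.
        // Illustrative example: the v8i1 constant <1,0,1,1,0,0,0,1> becomes the
        // i8 constant 0x8D.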
   5763 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
   5764   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
   5765          Op.getScalarValueSizeInBits() == 1 &&
   5766          "Can not convert non-constant vector");
   5767   uint64_t Immediate = 0;
   5768   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
   5769     SDValue In = Op.getOperand(idx);
   5770     if (In.getOpcode() != ISD::UNDEF)
   5771       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
   5772   }
   5773   SDLoc dl(Op);
   5774   MVT VT =
   5775    MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
   5776   return DAG.getConstant(Immediate, dl, VT);
   5777 }
   5778 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
   5779 SDValue
   5780 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   5781 
   5782   MVT VT = Op.getSimpleValueType();
   5783   assert((VT.getVectorElementType() == MVT::i1) &&
   5784          "Unexpected type in LowerBUILD_VECTORvXi1!");
   5785 
   5786   SDLoc dl(Op);
   5787   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   5788     SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1);
   5789     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
   5790     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   5791   }
   5792 
   5793   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
   5794     SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1);
   5795     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
   5796     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   5797   }
   5798 
   5799   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
   5800     SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
   5801     if (Imm.getValueSizeInBits() == VT.getSizeInBits())
   5802       return DAG.getBitcast(VT, Imm);
   5803     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
   5804     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
   5805                         DAG.getIntPtrConstant(0, dl));
   5806   }
   5807 
    5808   // The vector has one or more non-constant elements.
   5809   uint64_t Immediate = 0;
   5810   SmallVector<unsigned, 16> NonConstIdx;
   5811   bool IsSplat = true;
   5812   bool HasConstElts = false;
   5813   int SplatIdx = -1;
   5814   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
   5815     SDValue In = Op.getOperand(idx);
   5816     if (In.getOpcode() == ISD::UNDEF)
   5817       continue;
   5818     if (!isa<ConstantSDNode>(In))
   5819       NonConstIdx.push_back(idx);
   5820     else {
   5821       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
   5822       HasConstElts = true;
   5823     }
   5824     if (SplatIdx == -1)
   5825       SplatIdx = idx;
   5826     else if (In != Op.getOperand(SplatIdx))
   5827       IsSplat = false;
   5828   }
   5829 
    5830   // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
   5831   if (IsSplat)
   5832     return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
   5833                        DAG.getConstant(1, dl, VT),
   5834                        DAG.getConstant(0, dl, VT));
   5835 
    5836   // Insert the non-constant elements one by one.
   5837   SDValue DstVec;
   5838   SDValue Imm;
   5839   if (Immediate) {
   5840     MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
   5841     Imm = DAG.getConstant(Immediate, dl, ImmVT);
   5842   }
   5843   else if (HasConstElts)
   5844     Imm = DAG.getConstant(0, dl, VT);
   5845   else
   5846     Imm = DAG.getUNDEF(VT);
   5847   if (Imm.getValueSizeInBits() == VT.getSizeInBits())
   5848     DstVec = DAG.getBitcast(VT, Imm);
   5849   else {
   5850     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
   5851     DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
   5852                          DAG.getIntPtrConstant(0, dl));
   5853   }
   5854 
   5855   for (unsigned i = 0; i < NonConstIdx.size(); ++i) {
   5856     unsigned InsertIdx = NonConstIdx[i];
   5857     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
   5858                          Op.getOperand(InsertIdx),
   5859                          DAG.getIntPtrConstant(InsertIdx, dl));
   5860   }
   5861   return DstVec;
   5862 }
   5863 
   5864 /// \brief Return true if \p N implements a horizontal binop and return the
   5865 /// operands for the horizontal binop into V0 and V1.
   5866 ///
   5867 /// This is a helper function of LowerToHorizontalOp().
    5868 /// This function checks that the input build_vector \p N implements a
   5869 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
   5870 /// operation to match.
   5871 /// For example, if \p Opcode is equal to ISD::ADD, then this function
   5872 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
   5873 /// is equal to ISD::SUB, then this function checks if this is a horizontal
   5874 /// arithmetic sub.
   5875 ///
   5876 /// This function only analyzes elements of \p N whose indices are
   5877 /// in range [BaseIdx, LastIdx).
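        ///
        /// Illustrative example: for a v4f32 build_vector whose operands are
        ///   (fadd (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
        ///   (fadd (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
        ///   (fadd (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
        ///   (fadd (extract_vector_elt B, 2), (extract_vector_elt B, 3))
        /// the match succeeds with \p V0 = A and \p V1 = B, which is exactly the
        /// operand layout of HADDPS.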
   5878 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
   5879                               SelectionDAG &DAG,
   5880                               unsigned BaseIdx, unsigned LastIdx,
   5881                               SDValue &V0, SDValue &V1) {
   5882   EVT VT = N->getValueType(0);
   5883 
   5884   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
   5885   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
   5886          "Invalid Vector in input!");
   5887 
   5888   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
   5889   bool CanFold = true;
   5890   unsigned ExpectedVExtractIdx = BaseIdx;
   5891   unsigned NumElts = LastIdx - BaseIdx;
   5892   V0 = DAG.getUNDEF(VT);
   5893   V1 = DAG.getUNDEF(VT);
   5894 
   5895   // Check if N implements a horizontal binop.
   5896   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
   5897     SDValue Op = N->getOperand(i + BaseIdx);
   5898 
   5899     // Skip UNDEFs.
   5900     if (Op->getOpcode() == ISD::UNDEF) {
   5901       // Update the expected vector extract index.
   5902       if (i * 2 == NumElts)
   5903         ExpectedVExtractIdx = BaseIdx;
   5904       ExpectedVExtractIdx += 2;
   5905       continue;
   5906     }
   5907 
   5908     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
   5909 
   5910     if (!CanFold)
   5911       break;
   5912 
   5913     SDValue Op0 = Op.getOperand(0);
   5914     SDValue Op1 = Op.getOperand(1);
   5915 
   5916     // Try to match the following pattern:
   5917     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
   5918     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   5919         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   5920         Op0.getOperand(0) == Op1.getOperand(0) &&
   5921         isa<ConstantSDNode>(Op0.getOperand(1)) &&
   5922         isa<ConstantSDNode>(Op1.getOperand(1)));
   5923     if (!CanFold)
   5924       break;
   5925 
   5926     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   5927     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
   5928 
   5929     if (i * 2 < NumElts) {
   5930       if (V0.getOpcode() == ISD::UNDEF) {
   5931         V0 = Op0.getOperand(0);
   5932         if (V0.getValueType() != VT)
   5933           return false;
   5934       }
   5935     } else {
   5936       if (V1.getOpcode() == ISD::UNDEF) {
   5937         V1 = Op0.getOperand(0);
   5938         if (V1.getValueType() != VT)
   5939           return false;
   5940       }
   5941       if (i * 2 == NumElts)
   5942         ExpectedVExtractIdx = BaseIdx;
   5943     }
   5944 
   5945     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
   5946     if (I0 == ExpectedVExtractIdx)
   5947       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
   5948     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
   5949       // Try to match the following dag sequence:
   5950       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
   5951       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
   5952     } else
   5953       CanFold = false;
   5954 
   5955     ExpectedVExtractIdx += 2;
   5956   }
   5957 
   5958   return CanFold;
   5959 }
   5960 
   5961 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
   5962 /// a concat_vector.
   5963 ///
   5964 /// This is a helper function of LowerToHorizontalOp().
   5965 /// This function expects two 256-bit vectors called V0 and V1.
   5966 /// At first, each vector is split into two separate 128-bit vectors.
   5967 /// Then, the resulting 128-bit vectors are used to implement two
   5968 /// horizontal binary operations.
   5969 ///
   5970 /// The kind of horizontal binary operation is defined by \p X86Opcode.
   5971 ///
    5972 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
    5973 /// the two new horizontal binops.
   5974 /// When Mode is set, the first horizontal binop dag node would take as input
   5975 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
   5976 /// horizontal binop dag node would take as input the lower 128-bit of V1
   5977 /// and the upper 128-bit of V1.
   5978 ///   Example:
   5979 ///     HADD V0_LO, V0_HI
   5980 ///     HADD V1_LO, V1_HI
   5981 ///
   5982 /// Otherwise, the first horizontal binop dag node takes as input the lower
   5983 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
   5984 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
   5985 ///   Example:
   5986 ///     HADD V0_LO, V1_LO
   5987 ///     HADD V0_HI, V1_HI
   5988 ///
   5989 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
   5990 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
   5991 /// the upper 128-bits of the result.
   5992 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
   5993                                      SDLoc DL, SelectionDAG &DAG,
   5994                                      unsigned X86Opcode, bool Mode,
   5995                                      bool isUndefLO, bool isUndefHI) {
   5996   EVT VT = V0.getValueType();
   5997   assert(VT.is256BitVector() && VT == V1.getValueType() &&
   5998          "Invalid nodes in input!");
   5999 
   6000   unsigned NumElts = VT.getVectorNumElements();
   6001   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
   6002   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
   6003   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
   6004   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
   6005   EVT NewVT = V0_LO.getValueType();
   6006 
   6007   SDValue LO = DAG.getUNDEF(NewVT);
   6008   SDValue HI = DAG.getUNDEF(NewVT);
   6009 
   6010   if (Mode) {
   6011     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   6012     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
   6013       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
   6014     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
   6015       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
   6016   } else {
   6017     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   6018     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
   6019                        V1_LO->getOpcode() != ISD::UNDEF))
   6020       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
   6021 
   6022     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
   6023                        V1_HI->getOpcode() != ISD::UNDEF))
   6024       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
   6025   }
   6026 
   6027   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
   6028 }
   6029 
   6030 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
   6031 /// node.
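        ///
        /// Illustrative example: the v4f32 build_vector
        ///   ((fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
        ///    (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
        ///    (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
        ///    (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3)))
        /// folds to (X86ISD::ADDSUB A, B), i.e. a single ADDSUBPS.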
   6032 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
   6033                              const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   6034   MVT VT = BV->getSimpleValueType(0);
   6035   if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
   6036       (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
   6037     return SDValue();
   6038 
   6039   SDLoc DL(BV);
   6040   unsigned NumElts = VT.getVectorNumElements();
   6041   SDValue InVec0 = DAG.getUNDEF(VT);
   6042   SDValue InVec1 = DAG.getUNDEF(VT);
   6043 
   6044   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
   6045           VT == MVT::v2f64) && "build_vector with an invalid type found!");
   6046 
   6047   // Odd-numbered elements in the input build vector are obtained from
   6048   // adding two integer/float elements.
   6049   // Even-numbered elements in the input build vector are obtained from
   6050   // subtracting two integer/float elements.
   6051   unsigned ExpectedOpcode = ISD::FSUB;
   6052   unsigned NextExpectedOpcode = ISD::FADD;
   6053   bool AddFound = false;
   6054   bool SubFound = false;
   6055 
   6056   for (unsigned i = 0, e = NumElts; i != e; ++i) {
   6057     SDValue Op = BV->getOperand(i);
   6058 
   6059     // Skip 'undef' values.
   6060     unsigned Opcode = Op.getOpcode();
   6061     if (Opcode == ISD::UNDEF) {
   6062       std::swap(ExpectedOpcode, NextExpectedOpcode);
   6063       continue;
   6064     }
   6065 
   6066     // Early exit if we found an unexpected opcode.
   6067     if (Opcode != ExpectedOpcode)
   6068       return SDValue();
   6069 
   6070     SDValue Op0 = Op.getOperand(0);
   6071     SDValue Op1 = Op.getOperand(1);
   6072 
   6073     // Try to match the following pattern:
   6074     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
   6075     // Early exit if we cannot match that sequence.
   6076     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   6077         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   6078         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
   6079         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
   6080         Op0.getOperand(1) != Op1.getOperand(1))
   6081       return SDValue();
   6082 
   6083     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   6084     if (I0 != i)
   6085       return SDValue();
   6086 
   6087     // We found a valid add/sub node. Update the information accordingly.
   6088     if (i & 1)
   6089       AddFound = true;
   6090     else
   6091       SubFound = true;
   6092 
   6093     // Update InVec0 and InVec1.
   6094     if (InVec0.getOpcode() == ISD::UNDEF) {
   6095       InVec0 = Op0.getOperand(0);
   6096       if (InVec0.getSimpleValueType() != VT)
   6097         return SDValue();
   6098     }
   6099     if (InVec1.getOpcode() == ISD::UNDEF) {
   6100       InVec1 = Op1.getOperand(0);
   6101       if (InVec1.getSimpleValueType() != VT)
   6102         return SDValue();
   6103     }
   6104 
    6105     // Make sure that the operands of each add/sub node always
    6106     // come from the same pair of vectors.
   6107     if (InVec0 != Op0.getOperand(0)) {
   6108       if (ExpectedOpcode == ISD::FSUB)
   6109         return SDValue();
   6110 
   6111       // FADD is commutable. Try to commute the operands
   6112       // and then test again.
   6113       std::swap(Op0, Op1);
   6114       if (InVec0 != Op0.getOperand(0))
   6115         return SDValue();
   6116     }
   6117 
   6118     if (InVec1 != Op1.getOperand(0))
   6119       return SDValue();
   6120 
   6121     // Update the pair of expected opcodes.
   6122     std::swap(ExpectedOpcode, NextExpectedOpcode);
   6123   }
   6124 
   6125   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
   6126   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
   6127       InVec1.getOpcode() != ISD::UNDEF)
   6128     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
   6129 
   6130   return SDValue();
   6131 }
   6132 
   6133 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
   6134 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
   6135                                    const X86Subtarget *Subtarget,
   6136                                    SelectionDAG &DAG) {
   6137   MVT VT = BV->getSimpleValueType(0);
   6138   unsigned NumElts = VT.getVectorNumElements();
   6139   unsigned NumUndefsLO = 0;
   6140   unsigned NumUndefsHI = 0;
   6141   unsigned Half = NumElts/2;
   6142 
    6143   // Count the number of UNDEF operands in the input build_vector.
   6144   for (unsigned i = 0, e = Half; i != e; ++i)
   6145     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
   6146       NumUndefsLO++;
   6147 
   6148   for (unsigned i = Half, e = NumElts; i != e; ++i)
   6149     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
   6150       NumUndefsHI++;
   6151 
    6152   // Early exit if this is either a build_vector of all UNDEFs, or if all the
    6153   // operands but one are UNDEF.
   6154   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
   6155     return SDValue();
   6156 
   6157   SDLoc DL(BV);
   6158   SDValue InVec0, InVec1;
   6159   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
   6160     // Try to match an SSE3 float HADD/HSUB.
   6161     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   6162       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   6163 
   6164     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   6165       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   6166   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
   6167     // Try to match an SSSE3 integer HADD/HSUB.
   6168     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   6169       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
   6170 
   6171     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   6172       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
   6173   }
   6174 
   6175   if (!Subtarget->hasAVX())
   6176     return SDValue();
   6177 
   6178   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
   6179     // Try to match an AVX horizontal add/sub of packed single/double
   6180     // precision floating point values from 256-bit vectors.
   6181     SDValue InVec2, InVec3;
   6182     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
   6183         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
   6184         ((InVec0.getOpcode() == ISD::UNDEF ||
   6185           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   6186         ((InVec1.getOpcode() == ISD::UNDEF ||
   6187           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   6188       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   6189 
   6190     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
   6191         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
   6192         ((InVec0.getOpcode() == ISD::UNDEF ||
   6193           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   6194         ((InVec1.getOpcode() == ISD::UNDEF ||
   6195           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   6196       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   6197   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
   6198     // Try to match an AVX2 horizontal add/sub of signed integers.
   6199     SDValue InVec2, InVec3;
   6200     unsigned X86Opcode;
   6201     bool CanFold = true;
   6202 
   6203     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
   6204         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
   6205         ((InVec0.getOpcode() == ISD::UNDEF ||
   6206           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   6207         ((InVec1.getOpcode() == ISD::UNDEF ||
   6208           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   6209       X86Opcode = X86ISD::HADD;
   6210     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
   6211         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
   6212         ((InVec0.getOpcode() == ISD::UNDEF ||
   6213           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   6214         ((InVec1.getOpcode() == ISD::UNDEF ||
   6215           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   6216       X86Opcode = X86ISD::HSUB;
   6217     else
   6218       CanFold = false;
   6219 
   6220     if (CanFold) {
   6221       // Fold this build_vector into a single horizontal add/sub.
   6222       // Do this only if the target has AVX2.
   6223       if (Subtarget->hasAVX2())
   6224         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
   6225 
   6226       // Do not try to expand this build_vector into a pair of horizontal
   6227       // add/sub if we can emit a pair of scalar add/sub.
   6228       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   6229         return SDValue();
   6230 
   6231       // Convert this build_vector into a pair of horizontal binop followed by
   6232       // a concat vector.
   6233       bool isUndefLO = NumUndefsLO == Half;
   6234       bool isUndefHI = NumUndefsHI == Half;
   6235       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
   6236                                    isUndefLO, isUndefHI);
   6237     }
   6238   }
   6239 
   6240   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
   6241        VT == MVT::v16i16) && Subtarget->hasAVX()) {
   6242     unsigned X86Opcode;
   6243     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   6244       X86Opcode = X86ISD::HADD;
   6245     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   6246       X86Opcode = X86ISD::HSUB;
   6247     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   6248       X86Opcode = X86ISD::FHADD;
   6249     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   6250       X86Opcode = X86ISD::FHSUB;
   6251     else
   6252       return SDValue();
   6253 
   6254     // Don't try to expand this build_vector into a pair of horizontal add/sub
   6255     // if we can simply emit a pair of scalar add/sub.
   6256     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   6257       return SDValue();
   6258 
   6259     // Convert this build_vector into two horizontal add/sub followed by
   6260     // a concat vector.
   6261     bool isUndefLO = NumUndefsLO == Half;
   6262     bool isUndefHI = NumUndefsHI == Half;
   6263     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
   6264                                  isUndefLO, isUndefHI);
   6265   }
   6266 
   6267   return SDValue();
   6268 }
   6269 
   6270 SDValue
   6271 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   6272   SDLoc dl(Op);
   6273 
   6274   MVT VT = Op.getSimpleValueType();
   6275   MVT ExtVT = VT.getVectorElementType();
   6276   unsigned NumElems = Op.getNumOperands();
   6277 
   6278   // Generate vectors for predicate vectors.
   6279   if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512())
   6280     return LowerBUILD_VECTORvXi1(Op, DAG);
   6281 
   6282   // Vectors containing all zeros can be matched by pxor and xorps later
   6283   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   6284     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
   6285     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
   6286     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
   6287       return Op;
   6288 
   6289     return getZeroVector(VT, Subtarget, DAG, dl);
   6290   }
   6291 
   6292   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   6293   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
   6294   // vpcmpeqd on 256-bit vectors.
   6295   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
   6296     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
   6297       return Op;
   6298 
   6299     if (!VT.is512BitVector())
   6300       return getOnesVector(VT, Subtarget, DAG, dl);
   6301   }
   6302 
   6303   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
   6304   if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
   6305     return AddSub;
   6306   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
   6307     return HorizontalOp;
   6308   if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
   6309     return Broadcast;
   6310 
   6311   unsigned EVTBits = ExtVT.getSizeInBits();
   6312 
   6313   unsigned NumZero  = 0;
   6314   unsigned NumNonZero = 0;
   6315   uint64_t NonZeros = 0;
   6316   bool IsAllConstants = true;
   6317   SmallSet<SDValue, 8> Values;
   6318   for (unsigned i = 0; i < NumElems; ++i) {
   6319     SDValue Elt = Op.getOperand(i);
   6320     if (Elt.getOpcode() == ISD::UNDEF)
   6321       continue;
   6322     Values.insert(Elt);
   6323     if (Elt.getOpcode() != ISD::Constant &&
   6324         Elt.getOpcode() != ISD::ConstantFP)
   6325       IsAllConstants = false;
   6326     if (X86::isZeroNode(Elt))
   6327       NumZero++;
   6328     else {
   6329       assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
   6330       NonZeros |= ((uint64_t)1 << i);
   6331       NumNonZero++;
   6332     }
   6333   }
   6334 
   6335   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
   6336   if (NumNonZero == 0)
   6337     return DAG.getUNDEF(VT);
   6338 
   6339   // Special case for single non-zero, non-undef, element.
   6340   if (NumNonZero == 1) {
   6341     unsigned Idx = countTrailingZeros(NonZeros);
   6342     SDValue Item = Op.getOperand(Idx);
   6343 
   6344     // If this is an insertion of an i64 value on x86-32, and if the top bits of
   6345     // the value are obviously zero, truncate the value to i32 and do the
   6346     // insertion that way.  Only do this if the value is non-constant or if the
   6347     // value is a constant being inserted into element 0.  It is cheaper to do
   6348     // a constant pool load than it is to do a movd + shuffle.
   6349     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
   6350         (!IsAllConstants || Idx == 0)) {
   6351       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
   6352         // Handle SSE only.
   6353         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
   6354         MVT VecVT = MVT::v4i32;
   6355 
   6356         // Truncate the value (which may itself be a constant) to i32, and
   6357         // convert it to a vector with movd (S2V+shuffle to zero extend).
   6358         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
   6359         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
   6360         return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
   6361                                       Item, Idx * 2, true, Subtarget, DAG));
   6362       }
   6363     }
   6364 
   6365     // If we have a constant or non-constant insertion into the low element of
   6366     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
   6367     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
   6368     // depending on what the source datatype is.
   6369     if (Idx == 0) {
   6370       if (NumZero == 0)
   6371         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   6372 
   6373       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
   6374           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
   6375         if (VT.is512BitVector()) {
   6376           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
   6377           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
   6378                              Item, DAG.getIntPtrConstant(0, dl));
   6379         }
   6380         assert((VT.is128BitVector() || VT.is256BitVector()) &&
   6381                "Expected an SSE value type!");
   6382         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   6383         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
   6384         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   6385       }
   6386 
   6387       // We can't directly insert an i8 or i16 into a vector, so zero extend
   6388       // it to i32 first.
   6389       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
   6390         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
   6391         if (VT.is256BitVector()) {
   6392           if (Subtarget->hasAVX()) {
   6393             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
   6394             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   6395           } else {
   6396             // Without AVX, we need to extend to a 128-bit vector and then
   6397             // insert into the 256-bit vector.
   6398             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   6399             SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
   6400             Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
   6401           }
   6402         } else {
   6403           assert(VT.is128BitVector() && "Expected an SSE value type!");
   6404           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   6405           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   6406         }
   6407         return DAG.getBitcast(VT, Item);
   6408       }
   6409     }
   6410 
   6411     // Is it a vector logical left shift?
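            // E.g. the v2i64 build_vector <0, X> is lowered as a whole-vector
            // left shift of (scalar_to_vector X) by 64 bits.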
   6412     if (NumElems == 2 && Idx == 1 &&
   6413         X86::isZeroNode(Op.getOperand(0)) &&
   6414         !X86::isZeroNode(Op.getOperand(1))) {
   6415       unsigned NumBits = VT.getSizeInBits();
   6416       return getVShift(true, VT,
   6417                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   6418                                    VT, Op.getOperand(1)),
   6419                        NumBits/2, DAG, *this, dl);
   6420     }
   6421 
   6422     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
   6423       return SDValue();
   6424 
   6425     // Otherwise, if this is a vector with i32 or f32 elements, and the element
   6426     // is a non-constant being inserted into an element other than the low one,
   6427     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
   6428     // movd/movss) to move this into the low element, then shuffle it into
   6429     // place.
   6430     if (EVTBits == 32) {
   6431       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   6432       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
   6433     }
   6434   }
   6435 
   6436   // Splat is obviously ok. Let legalizer expand it to a shuffle.
   6437   if (Values.size() == 1) {
   6438     if (EVTBits == 32) {
   6439       // Instead of a shuffle like this:
   6440       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
   6441       // Check if it's possible to issue this instead.
    6442       // shuffle (vload ptr), undef, <1, 1, 1, 1>
   6443       unsigned Idx = countTrailingZeros(NonZeros);
   6444       SDValue Item = Op.getOperand(Idx);
   6445       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
   6446         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
   6447     }
   6448     return SDValue();
   6449   }
   6450 
   6451   // A vector full of immediates; various special cases are already
   6452   // handled, so this is best done with a single constant-pool load.
   6453   if (IsAllConstants)
   6454     return SDValue();
   6455 
   6456   // For AVX-length vectors, see if we can use a vector load to get all of the
   6457   // elements, otherwise build the individual 128-bit pieces and use
   6458   // shuffles to put them in place.
   6459   if (VT.is256BitVector() || VT.is512BitVector()) {
   6460     SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
   6461 
   6462     // Check for a build vector of consecutive loads.
   6463     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
   6464       return LD;
   6465 
   6466     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
   6467 
   6468     // Build both the lower and upper subvector.
   6469     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
   6470                                 makeArrayRef(&V[0], NumElems/2));
   6471     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
   6472                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
   6473 
   6474     // Recreate the wider vector with the lower and upper part.
   6475     if (VT.is256BitVector())
   6476       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   6477     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   6478   }
   6479 
   6480   // Let legalizer expand 2-wide build_vectors.
   6481   if (EVTBits == 64) {
   6482     if (NumNonZero == 1) {
   6483       // One half is zero or undef.
   6484       unsigned Idx = countTrailingZeros(NonZeros);
   6485       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
   6486                                Op.getOperand(Idx));
   6487       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
   6488     }
   6489     return SDValue();
   6490   }
   6491 
   6492   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   6493   if (EVTBits == 8 && NumElems == 16)
   6494     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
   6495                                           DAG, Subtarget, *this))
   6496       return V;
   6497 
   6498   if (EVTBits == 16 && NumElems == 8)
   6499     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
   6500                                           DAG, Subtarget, *this))
   6501       return V;
   6502 
    6503   // If element VT is == 32 bits and has 4 elements, try to generate an INSERTPS.
   6504   if (EVTBits == 32 && NumElems == 4)
   6505     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
   6506       return V;
   6507 
   6508   // If element VT is == 32 bits, turn it into a number of shuffles.
   6509   SmallVector<SDValue, 8> V(NumElems);
   6510   if (NumElems == 4 && NumZero > 0) {
   6511     for (unsigned i = 0; i < 4; ++i) {
   6512       bool isZero = !(NonZeros & (1ULL << i));
   6513       if (isZero)
   6514         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
   6515       else
   6516         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   6517     }
   6518 
   6519     for (unsigned i = 0; i < 2; ++i) {
   6520       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
   6521         default: break;
   6522         case 0:
   6523           V[i] = V[i*2];  // Must be a zero vector.
   6524           break;
   6525         case 1:
   6526           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
   6527           break;
   6528         case 2:
   6529           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
   6530           break;
   6531         case 3:
   6532           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
   6533           break;
   6534       }
   6535     }
   6536 
   6537     bool Reverse1 = (NonZeros & 0x3) == 2;
   6538     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
   6539     int MaskVec[] = {
   6540       Reverse1 ? 1 : 0,
   6541       Reverse1 ? 0 : 1,
   6542       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
   6543       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
   6544     };
   6545     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
   6546   }
   6547 
   6548   if (Values.size() > 1 && VT.is128BitVector()) {
    6549     // Collect the operands so we can check for consecutive loads.
   6550     for (unsigned i = 0; i < NumElems; ++i)
   6551       V[i] = Op.getOperand(i);
   6552 
   6553     // Check for elements which are consecutive loads.
   6554     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
   6555       return LD;
   6556 
   6557     // Check for a build vector from mostly shuffle plus few inserting.
   6558     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
   6559       return Sh;
   6560 
   6561     // For SSE 4.1, use insertps to put the high elements into the low element.
   6562     if (Subtarget->hasSSE41()) {
   6563       SDValue Result;
   6564       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
   6565         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
   6566       else
   6567         Result = DAG.getUNDEF(VT);
   6568 
   6569       for (unsigned i = 1; i < NumElems; ++i) {
   6570         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
   6571         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
   6572                              Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
   6573       }
   6574       return Result;
   6575     }
   6576 
   6577     // Otherwise, expand into a number of unpckl*, start by extending each of
   6578     // our (non-undef) elements to the full vector width with the element in the
   6579     // bottom slot of the vector (which generates no code for SSE).
   6580     for (unsigned i = 0; i < NumElems; ++i) {
   6581       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
   6582         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   6583       else
   6584         V[i] = DAG.getUNDEF(VT);
   6585     }
   6586 
   6587     // Next, we iteratively mix elements, e.g. for v4f32:
   6588     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
   6589     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
   6590     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
   6591     unsigned EltStride = NumElems >> 1;
   6592     while (EltStride != 0) {
   6593       for (unsigned i = 0; i < EltStride; ++i) {
   6594         // If V[i+EltStride] is undef and this is the first round of mixing,
   6595         // then it is safe to just drop this shuffle: V[i] is already in the
   6596         // right place, the one element (since it's the first round) being
   6597         // inserted as undef can be dropped.  This isn't safe for successive
   6598         // rounds because they will permute elements within both vectors.
   6599         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
   6600             EltStride == NumElems/2)
   6601           continue;
   6602 
   6603         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
   6604       }
   6605       EltStride >>= 1;
   6606     }
   6607     return V[0];
   6608   }
   6609   return SDValue();
   6610 }
   6611 
   6612 // 256-bit AVX can use the vinsertf128 instruction
   6613 // to create 256-bit vectors from two other 128-bit ones.
   6614 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   6615   SDLoc dl(Op);
   6616   MVT ResVT = Op.getSimpleValueType();
   6617 
   6618   assert((ResVT.is256BitVector() ||
   6619           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
   6620 
   6621   SDValue V1 = Op.getOperand(0);
   6622   SDValue V2 = Op.getOperand(1);
   6623   unsigned NumElems = ResVT.getVectorNumElements();
   6624   if (ResVT.is256BitVector())
   6625     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
   6626 
   6627   if (Op.getNumOperands() == 4) {
   6628     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
   6629                                   ResVT.getVectorNumElements()/2);
   6630     SDValue V3 = Op.getOperand(2);
   6631     SDValue V4 = Op.getOperand(3);
   6632     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
   6633       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
   6634   }
   6635   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
   6636 }
   6637 
   6638 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
   6639                                        const X86Subtarget *Subtarget,
   6640                                        SelectionDAG & DAG) {
   6641   SDLoc dl(Op);
   6642   MVT ResVT = Op.getSimpleValueType();
   6643   unsigned NumOfOperands = Op.getNumOperands();
   6644 
   6645   assert(isPowerOf2_32(NumOfOperands) &&
   6646          "Unexpected number of operands in CONCAT_VECTORS");
   6647 
   6648   SDValue Undef = DAG.getUNDEF(ResVT);
   6649   if (NumOfOperands > 2) {
   6650     // Specialize the cases when all, or all but one, of the operands are undef.
   6651     unsigned NumOfDefinedOps = 0;
   6652     unsigned OpIdx = 0;
   6653     for (unsigned i = 0; i < NumOfOperands; i++)
   6654       if (!Op.getOperand(i).isUndef()) {
   6655         NumOfDefinedOps++;
   6656         OpIdx = i;
   6657       }
   6658     if (NumOfDefinedOps == 0)
   6659       return Undef;
   6660     if (NumOfDefinedOps == 1) {
   6661       unsigned SubVecNumElts =
   6662         Op.getOperand(OpIdx).getValueType().getVectorNumElements();
   6663       SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
   6664       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
   6665                          Op.getOperand(OpIdx), IdxVal);
   6666     }
   6667 
   6668     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
   6669                                   ResVT.getVectorNumElements()/2);
   6670     SmallVector<SDValue, 2> Ops;
   6671     for (unsigned i = 0; i < NumOfOperands/2; i++)
   6672       Ops.push_back(Op.getOperand(i));
   6673     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
   6674     Ops.clear();
   6675     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
   6676       Ops.push_back(Op.getOperand(i));
   6677     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
   6678     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
   6679   }
   6680 
   6681   // 2 operands
   6682   SDValue V1 = Op.getOperand(0);
   6683   SDValue V2 = Op.getOperand(1);
   6684   unsigned NumElems = ResVT.getVectorNumElements();
   6685   assert(V1.getValueType() == V2.getValueType() &&
   6686          V1.getValueType().getVectorNumElements() == NumElems/2 &&
   6687          "Unexpected operands in CONCAT_VECTORS");
   6688 
   6689   if (ResVT.getSizeInBits() >= 16)
   6690     return Op; // The operation is legal with KUNPCK
   6691 
   6692   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
   6693   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
   6694   SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
   6695   if (IsZeroV1 && IsZeroV2)
   6696     return ZeroVec;
   6697 
   6698   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
   6699   if (V2.isUndef())
   6700     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
   6701   if (IsZeroV2)
   6702     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
   6703 
   6704   SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
   6705   if (V1.isUndef())
   6706     V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
   6707 
   6708   if (IsZeroV1)
   6709     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
   6710 
   6711   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
   6712   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
   6713 }
   6714 
   6715 static SDValue LowerCONCAT_VECTORS(SDValue Op,
   6716                                    const X86Subtarget *Subtarget,
   6717                                    SelectionDAG &DAG) {
   6718   MVT VT = Op.getSimpleValueType();
   6719   if (VT.getVectorElementType() == MVT::i1)
   6720     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
   6721 
   6722   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
   6723          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
   6724           Op.getNumOperands() == 4)));
   6725 
   6726   // AVX can use the vinsertf128 instruction to create 256-bit vectors
   6727   // from two other 128-bit ones.
   6728 
    6729   // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
   6730   return LowerAVXCONCAT_VECTORS(Op, DAG);
   6731 }
   6732 
   6733 //===----------------------------------------------------------------------===//
   6734 // Vector shuffle lowering
   6735 //
   6736 // This is an experimental code path for lowering vector shuffles on x86. It is
   6737 // designed to handle arbitrary vector shuffles and blends, gracefully
   6738 // degrading performance as necessary. It works hard to recognize idiomatic
   6739 // shuffles and lower them to optimal instruction patterns without leaving
   6740 // a framework that allows reasonably efficient handling of all vector shuffle
   6741 // patterns.
   6742 //===----------------------------------------------------------------------===//
   6743 
   6744 /// \brief Tiny helper function to identify a no-op mask.
   6745 ///
   6746 /// This is a somewhat boring predicate function. It checks whether the mask
   6747 /// array input, which is assumed to be a single-input shuffle mask of the kind
   6748 /// used by the X86 shuffle instructions (not a fully general
   6749 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
   6750 /// in-place shuffle are 'no-op's.
   6751 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
   6752   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   6753     if (Mask[i] != -1 && Mask[i] != i)
   6754       return false;
   6755   return true;
   6756 }
   6757 
   6758 /// \brief Helper function to classify a mask as a single-input mask.
   6759 ///
   6760 /// This isn't a generic single-input test because in the vector shuffle
   6761 /// lowering we canonicalize single inputs to be the first input operand. This
   6762 /// means we can more quickly test for a single input by only checking whether
    6763 /// an input from the second operand exists. We also assume that the size of the
    6764 /// mask corresponds to the size of the input vectors, which isn't true in the
   6765 /// fully general case.
   6766 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
   6767   for (int M : Mask)
   6768     if (M >= (int)Mask.size())
   6769       return false;
   6770   return true;
   6771 }
   6772 
   6773 /// \brief Test whether there are elements crossing 128-bit lanes in this
   6774 /// shuffle mask.
   6775 ///
   6776 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
   6777 /// and we routinely test for these.
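        ///
        /// Illustrative example: the v8f32 mask <4, 5, 6, 7, 0, 1, 2, 3>, which swaps
        /// the two 128-bit halves, is lane-crossing; <3, 2, 1, 0, 7, 6, 5, 4> is not.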
   6778 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
   6779   int LaneSize = 128 / VT.getScalarSizeInBits();
   6780   int Size = Mask.size();
   6781   for (int i = 0; i < Size; ++i)
   6782     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
   6783       return true;
   6784   return false;
   6785 }
   6786 
   6787 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
   6788 ///
   6789 /// This checks a shuffle mask to see if it is performing the same
   6790 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
   6791 /// that it is also not lane-crossing. It may however involve a blend from the
   6792 /// same lane of a second vector.
   6793 ///
   6794 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
   6795 /// non-trivial to compute in the face of undef lanes. The representation is
   6796 /// *not* suitable for use with existing 128-bit shuffles as it will contain
   6797 /// entries from both V1 and V2 inputs to the wider mask.
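        ///
        /// Illustrative example: the v8f32 mask <0, 8, 1, 9, 4, 12, 5, 13> performs
        /// the same in-lane shuffle in both 128-bit lanes, so this returns true with
        /// \p RepeatedMask set to <0, 8, 1, 9>.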
   6798 static bool
   6799 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   6800                                 SmallVectorImpl<int> &RepeatedMask) {
   6801   int LaneSize = 128 / VT.getScalarSizeInBits();
   6802   RepeatedMask.resize(LaneSize, -1);
   6803   int Size = Mask.size();
   6804   for (int i = 0; i < Size; ++i) {
   6805     if (Mask[i] < 0)
   6806       continue;
   6807     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
   6808       // This entry crosses lanes, so there is no way to model this shuffle.
   6809       return false;
   6810 
   6811     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
   6812     if (RepeatedMask[i % LaneSize] == -1)
   6813       // This is the first non-undef entry in this slot of a 128-bit lane.
   6814       RepeatedMask[i % LaneSize] =
   6815           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
   6816     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
   6817       // Found a mismatch with the repeated mask.
   6818       return false;
   6819   }
   6820   return true;
   6821 }
   6822 
   6823 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
   6824 /// arguments.
   6825 ///
   6826 /// This is a fast way to test a shuffle mask against a fixed pattern:
   6827 ///
    6828 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
   6829 ///
   6830 /// It returns true if the mask is exactly as wide as the argument list, and
   6831 /// each element of the mask is either -1 (signifying undef) or the value given
   6832 /// in the argument.
   6833 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
   6834                                 ArrayRef<int> ExpectedMask) {
   6835   if (Mask.size() != ExpectedMask.size())
   6836     return false;
   6837 
   6838   int Size = Mask.size();
   6839 
   6840   // If the values are build vectors, we can look through them to find
   6841   // equivalent inputs that make the shuffles equivalent.
   6842   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
   6843   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
   6844 
   6845   for (int i = 0; i < Size; ++i)
   6846     if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
   6847       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
   6848       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
   6849       if (!MaskBV || !ExpectedBV ||
   6850           MaskBV->getOperand(Mask[i] % Size) !=
   6851               ExpectedBV->getOperand(ExpectedMask[i] % Size))
   6852         return false;
   6853     }
   6854 
   6855   return true;
   6856 }
   6857 
   6858 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
   6859 ///
   6860 /// This helper function produces an 8-bit shuffle immediate corresponding to
   6861 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
   6862 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
   6863 /// example.
   6864 ///
   6865 /// NB: We rely heavily on "undef" masks preserving the input lane.
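        ///
        /// Illustrative example: the mask {3, 1, 2, 0} yields the immediate
        /// 0b00'10'01'11 (0x27); element i of the mask occupies bits [2*i+1 : 2*i].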
   6866 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
   6867                                           SelectionDAG &DAG) {
   6868   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
   6869   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
   6870   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
   6871   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
   6872   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
   6873 
   6874   unsigned Imm = 0;
   6875   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
   6876   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
   6877   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
   6878   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
   6879   return DAG.getConstant(Imm, DL, MVT::i8);
   6880 }
   6881 
   6882 /// \brief Compute whether each element of a shuffle is zeroable.
   6883 ///
   6884 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
   6885 /// Either it is an undef element in the shuffle mask, the element of the input
   6886 /// referenced is undef, or the element of the input referenced is known to be
   6887 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
   6888 /// as many lanes with this technique as possible to simplify the remaining
   6889 /// shuffle.
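         ///
         /// For example, with V2 an all-zeros build_vector, the v4i32 mask
         /// {0, -1, 6, 7} has elements 1 (undef), 2 and 3 (zero inputs) zeroable,
         /// leaving only element 0 for the rest of the lowering to handle.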
   6890 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
   6891                                                      SDValue V1, SDValue V2) {
   6892   SmallBitVector Zeroable(Mask.size(), false);
   6893 
   6894   while (V1.getOpcode() == ISD::BITCAST)
   6895     V1 = V1->getOperand(0);
   6896   while (V2.getOpcode() == ISD::BITCAST)
   6897     V2 = V2->getOperand(0);
   6898 
   6899   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
   6900   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
   6901 
   6902   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   6903     int M = Mask[i];
   6904     // Handle the easy cases.
   6905     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
   6906       Zeroable[i] = true;
   6907       continue;
   6908     }
   6909 
   6910     // If this is an index into a build_vector node (which has the same number
   6911     // of elements), dig out the input value and use it.
   6912     SDValue V = M < Size ? V1 : V2;
   6913     if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
   6914       continue;
   6915 
   6916     SDValue Input = V.getOperand(M % Size);
   6917     // The UNDEF opcode check really should be dead code here, but not quite
   6918     // worth asserting on (it isn't invalid, just unexpected).
   6919     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
   6920       Zeroable[i] = true;
   6921   }
   6922 
   6923   return Zeroable;
   6924 }
   6925 
   6926 // X86 has dedicated unpack instructions that can handle specific blend
   6927 // operations: UNPCKH and UNPCKL.
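         // For a v4i32 shuffle, for instance, the loop below builds
         // Unpckl = {0, 4, 1, 5} and Unpckh = {2, 6, 3, 7}, matching the
         // interleaving performed by PUNPCKLDQ/UNPCKLPS and PUNPCKHDQ/UNPCKHPS.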
   6928 static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
   6929                                            SDValue V1, SDValue V2,
   6930                                            SelectionDAG &DAG) {
   6931   int NumElts = VT.getVectorNumElements();
   6932   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
   6933   SmallVector<int, 8> Unpckl;
   6934   SmallVector<int, 8> Unpckh;
   6935 
   6936   for (int i = 0; i < NumElts; ++i) {
   6937     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
   6938     int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
   6939     int HiPos = LoPos + NumEltsInLane / 2;
   6940     Unpckl.push_back(LoPos);
   6941     Unpckh.push_back(HiPos);
   6942   }
   6943 
   6944   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
   6945     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
   6946   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
   6947     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
   6948 
   6949   // Commute and try again.
   6950   ShuffleVectorSDNode::commuteMask(Unpckl);
   6951   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
   6952     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
   6953 
   6954   ShuffleVectorSDNode::commuteMask(Unpckh);
   6955   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
   6956     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
   6957 
   6958   return SDValue();
   6959 }
   6960 
   6961 /// \brief Try to emit a bitmask instruction for a shuffle.
   6962 ///
   6963 /// This handles cases where we can model a blend exactly as a bitmask due to
   6964 /// one of the inputs being zeroable.
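         ///
         /// For example, if V2 is an all-zeros vector and the v4i32 mask is
         /// {0, 1, 6, 7}, the shuffle keeps the low two elements of V1 and zeroes
         /// the high two, so it can be emitted as V1 & {-1, -1, 0, 0}.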
   6965 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
   6966                                            SDValue V2, ArrayRef<int> Mask,
   6967                                            SelectionDAG &DAG) {
   6968   MVT EltVT = VT.getVectorElementType();
   6969   int NumEltBits = EltVT.getSizeInBits();
   6970   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
   6971   SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
   6972   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
   6973                                     IntEltVT);
   6974   if (EltVT.isFloatingPoint()) {
   6975     Zero = DAG.getBitcast(EltVT, Zero);
   6976     AllOnes = DAG.getBitcast(EltVT, AllOnes);
   6977   }
   6978   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
   6979   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   6980   SDValue V;
   6981   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   6982     if (Zeroable[i])
   6983       continue;
   6984     if (Mask[i] % Size != i)
   6985       return SDValue(); // Not a blend.
   6986     if (!V)
   6987       V = Mask[i] < Size ? V1 : V2;
   6988     else if (V != (Mask[i] < Size ? V1 : V2))
   6989       return SDValue(); // Can only let one input through the mask.
   6990 
   6991     VMaskOps[i] = AllOnes;
   6992   }
   6993   if (!V)
   6994     return SDValue(); // No non-zeroable elements!
   6995 
   6996   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
   6997   V = DAG.getNode(VT.isFloatingPoint()
   6998                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
   6999                   DL, VT, V, VMask);
   7000   return V;
   7001 }
   7002 
   7003 /// \brief Try to emit a blend instruction for a shuffle using bit math.
   7004 ///
   7005 /// This is used as a fallback approach when first class blend instructions are
   7006 /// unavailable. Currently it is only suitable for integer vectors, but could
   7007 /// be generalized for floating point vectors if desirable.
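         ///
         /// For example, the v8i16 mask {0, 9, 2, 11, 4, 13, 6, 15} takes the even
         /// elements from V1 and the odd elements from V2; it is lowered below as
         /// (V1 & M) | (~M & V2) with M = {-1, 0, -1, 0, -1, 0, -1, 0}.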
   7008 static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
   7009                                             SDValue V2, ArrayRef<int> Mask,
   7010                                             SelectionDAG &DAG) {
   7011   assert(VT.isInteger() && "Only supports integer vector types!");
   7012   MVT EltVT = VT.getVectorElementType();
   7013   int NumEltBits = EltVT.getSizeInBits();
   7014   SDValue Zero = DAG.getConstant(0, DL, EltVT);
   7015   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
   7016                                     EltVT);
   7017   SmallVector<SDValue, 16> MaskOps;
   7018   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7019     if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
   7020       return SDValue(); // Shuffled input!
   7021     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
   7022   }
   7023 
   7024   SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
   7025   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
   7026   // We have to cast V2 around.
   7027   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
   7028   V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
   7029                                       DAG.getBitcast(MaskVT, V1Mask),
   7030                                       DAG.getBitcast(MaskVT, V2)));
   7031   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
   7032 }
   7033 
   7034 /// \brief Try to emit a blend instruction for a shuffle.
   7035 ///
   7036 /// This doesn't do any checks for the availability of instructions for blending
   7037 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
   7038 /// be matched in the backend with the type given. What it does check for is
   7039 /// that the shuffle mask is a blend, or convertible into a blend with zero.
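         ///
         /// For example, the v4i32 mask {0, 5, 2, 7} takes elements 1 and 3 from V2
         /// and the rest from V1, so the blend immediate built below is 0b1010.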
   7040 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
   7041                                          SDValue V2, ArrayRef<int> Original,
   7042                                          const X86Subtarget *Subtarget,
   7043                                          SelectionDAG &DAG) {
   7044   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
   7045   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
   7046   SmallVector<int, 8> Mask(Original.begin(), Original.end());
   7047   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7048   bool ForceV1Zero = false, ForceV2Zero = false;
   7049 
   7050   // Attempt to generate the binary blend mask. If an input is zero then
   7051   // we can use any lane.
   7052   // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
   7053   unsigned BlendMask = 0;
   7054   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7055     int M = Mask[i];
   7056     if (M < 0)
   7057       continue;
   7058     if (M == i)
   7059       continue;
   7060     if (M == i + Size) {
   7061       BlendMask |= 1u << i;
   7062       continue;
   7063     }
   7064     if (Zeroable[i]) {
   7065       if (V1IsZero) {
   7066         ForceV1Zero = true;
   7067         Mask[i] = i;
   7068         continue;
   7069       }
   7070       if (V2IsZero) {
   7071         ForceV2Zero = true;
   7072         BlendMask |= 1u << i;
   7073         Mask[i] = i + Size;
   7074         continue;
   7075       }
   7076     }
   7077     return SDValue(); // Shuffled input!
   7078   }
   7079 
   7080   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
   7081   if (ForceV1Zero)
   7082     V1 = getZeroVector(VT, Subtarget, DAG, DL);
   7083   if (ForceV2Zero)
   7084     V2 = getZeroVector(VT, Subtarget, DAG, DL);
   7085 
   7086   auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
   7087     unsigned ScaledMask = 0;
   7088     for (int i = 0; i != Size; ++i)
   7089       if (BlendMask & (1u << i))
   7090         for (int j = 0; j != Scale; ++j)
   7091           ScaledMask |= 1u << (i * Scale + j);
   7092     return ScaledMask;
   7093   };
   7094 
   7095   switch (VT.SimpleTy) {
   7096   case MVT::v2f64:
   7097   case MVT::v4f32:
   7098   case MVT::v4f64:
   7099   case MVT::v8f32:
   7100     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
   7101                        DAG.getConstant(BlendMask, DL, MVT::i8));
   7102 
   7103   case MVT::v4i64:
   7104   case MVT::v8i32:
   7105     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
   7106     // FALLTHROUGH
   7107   case MVT::v2i64:
   7108   case MVT::v4i32:
   7109     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
   7110     // that instruction.
   7111     if (Subtarget->hasAVX2()) {
   7112       // Scale the blend by the number of 32-bit dwords per element.
   7113       int Scale =  VT.getScalarSizeInBits() / 32;
   7114       BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
   7115       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
   7116       V1 = DAG.getBitcast(BlendVT, V1);
   7117       V2 = DAG.getBitcast(BlendVT, V2);
   7118       return DAG.getBitcast(
   7119           VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
   7120                           DAG.getConstant(BlendMask, DL, MVT::i8)));
   7121     }
   7122     // FALLTHROUGH
   7123   case MVT::v8i16: {
   7124     // For integer shuffles we need to expand the mask and cast the inputs to
   7125     // v8i16s prior to blending.
   7126     int Scale = 8 / VT.getVectorNumElements();
   7127     BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
   7128     V1 = DAG.getBitcast(MVT::v8i16, V1);
   7129     V2 = DAG.getBitcast(MVT::v8i16, V2);
   7130     return DAG.getBitcast(VT,
   7131                           DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
   7132                                       DAG.getConstant(BlendMask, DL, MVT::i8)));
   7133   }
   7134 
   7135   case MVT::v16i16: {
   7136     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
   7137     SmallVector<int, 8> RepeatedMask;
   7138     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
   7139       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
   7140       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
   7141       BlendMask = 0;
   7142       for (int i = 0; i < 8; ++i)
   7143         if (RepeatedMask[i] >= 16)
   7144           BlendMask |= 1u << i;
   7145       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
   7146                          DAG.getConstant(BlendMask, DL, MVT::i8));
   7147     }
   7148   }
   7149     // FALLTHROUGH
   7150   case MVT::v16i8:
   7151   case MVT::v32i8: {
   7152     assert((VT.is128BitVector() || Subtarget->hasAVX2()) &&
   7153            "256-bit byte-blends require AVX2 support!");
   7154 
   7155     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
   7156     if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
   7157       return Masked;
   7158 
   7159     // Scale the blend by the number of bytes per element.
   7160     int Scale = VT.getScalarSizeInBits() / 8;
   7161 
   7162     // This form of blend is always done on bytes. Compute the byte vector
   7163     // type.
   7164     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
   7165 
   7166     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
   7167     // mix of LLVM's code generator and the x86 backend. We tell the code
   7168     // generator that boolean values in the elements of an x86 vector register
   7169     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
   7170     // mapping a select to operand #1, and 'false' mapping to operand #2. The
   7171     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
   7172     // of the element (the remaining are ignored) and 0 in that high bit would
   7173     // mean operand #1 while 1 in the high bit would mean operand #2. So while
   7174     // the LLVM model for boolean values in vector elements gets the relevant
   7175     // bit set, it is set backwards and over constrained relative to x86's
   7176     // actual model.
   7177     SmallVector<SDValue, 32> VSELECTMask;
   7178     for (int i = 0, Size = Mask.size(); i < Size; ++i)
   7179       for (int j = 0; j < Scale; ++j)
   7180         VSELECTMask.push_back(
   7181             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
   7182                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
   7183                                           MVT::i8));
   7184 
   7185     V1 = DAG.getBitcast(BlendVT, V1);
   7186     V2 = DAG.getBitcast(BlendVT, V2);
   7187     return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
   7188                                           DAG.getNode(ISD::BUILD_VECTOR, DL,
   7189                                                       BlendVT, VSELECTMask),
   7190                                           V1, V2));
   7191   }
   7192 
   7193   default:
   7194     llvm_unreachable("Not a supported integer vector type!");
   7195   }
   7196 }
   7197 
   7198 /// \brief Try to lower as a blend of elements from two inputs followed by
   7199 /// a single-input permutation.
   7200 ///
   7201 /// This matches the pattern where we can blend elements from two inputs and
   7202 /// then reduce the shuffle to a single-input permutation.
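         ///
         /// For example, the v4i32 mask {1, 4, 3, 6} is handled by first blending
         /// the inputs with the mask {4, 1, 6, 3} and then permuting that single
         /// result with the mask {1, 0, 3, 2}.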
   7203 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
   7204                                                    SDValue V2,
   7205                                                    ArrayRef<int> Mask,
   7206                                                    SelectionDAG &DAG) {
   7207   // We build up the blend mask while checking whether a blend is a viable way
   7208   // to reduce the shuffle.
   7209   SmallVector<int, 32> BlendMask(Mask.size(), -1);
   7210   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
   7211 
   7212   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   7213     if (Mask[i] < 0)
   7214       continue;
   7215 
   7216     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
   7217 
   7218     if (BlendMask[Mask[i] % Size] == -1)
   7219       BlendMask[Mask[i] % Size] = Mask[i];
   7220     else if (BlendMask[Mask[i] % Size] != Mask[i])
   7221       return SDValue(); // Can't blend in the needed input!
   7222 
   7223     PermuteMask[i] = Mask[i] % Size;
   7224   }
   7225 
   7226   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
   7227   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
   7228 }
   7229 
    7230 /// \brief Generic routine to decompose a shuffle and blend into independent
   7231 /// blends and permutes.
   7232 ///
   7233 /// This matches the extremely common pattern for handling combined
   7234 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
   7235 /// operations. It will try to pick the best arrangement of shuffles and
   7236 /// blends.
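         ///
         /// For example, the v4i32 mask {2, 7, 1, 4} decomposes into the
         /// single-input shuffles V1:{2, -1, 1, -1} and V2:{-1, 3, -1, 0}
         /// followed by the blend mask {0, 5, 2, 7}.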
   7237 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
   7238                                                           SDValue V1,
   7239                                                           SDValue V2,
   7240                                                           ArrayRef<int> Mask,
   7241                                                           SelectionDAG &DAG) {
   7242   // Shuffle the input elements into the desired positions in V1 and V2 and
   7243   // blend them together.
   7244   SmallVector<int, 32> V1Mask(Mask.size(), -1);
   7245   SmallVector<int, 32> V2Mask(Mask.size(), -1);
   7246   SmallVector<int, 32> BlendMask(Mask.size(), -1);
   7247   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   7248     if (Mask[i] >= 0 && Mask[i] < Size) {
   7249       V1Mask[i] = Mask[i];
   7250       BlendMask[i] = i;
   7251     } else if (Mask[i] >= Size) {
   7252       V2Mask[i] = Mask[i] - Size;
   7253       BlendMask[i] = i + Size;
   7254     }
   7255 
   7256   // Try to lower with the simpler initial blend strategy unless one of the
   7257   // input shuffles would be a no-op. We prefer to shuffle inputs as the
   7258   // shuffle may be able to fold with a load or other benefit. However, when
   7259   // we'll have to do 2x as many shuffles in order to achieve this, blending
   7260   // first is a better strategy.
   7261   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
   7262     if (SDValue BlendPerm =
   7263             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
   7264       return BlendPerm;
   7265 
   7266   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
   7267   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
   7268   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
   7269 }
   7270 
   7271 /// \brief Try to lower a vector shuffle as a byte rotation.
   7272 ///
   7273 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
   7274 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
   7275 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
    7276 /// try to generically lower a vector shuffle through such a pattern. It
   7277 /// does not check for the profitability of lowering either as PALIGNR or
   7278 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
   7279 /// This matches shuffle vectors that look like:
   7280 ///
   7281 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
   7282 ///
   7283 /// Essentially it concatenates V1 and V2, shifts right by some number of
   7284 /// elements, and takes the low elements as the result. Note that while this is
   7285 /// specified as a *right shift* because x86 is little-endian, it is a *left
   7286 /// rotate* of the vector lanes.
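         ///
         /// For the v8i16 example above the rotation amount is 3 elements; on
         /// SSSE3 targets this becomes a PALIGNR with a byte immediate of
         /// 3 * (16 / 8) = 6.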
   7287 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
   7288                                               SDValue V2,
   7289                                               ArrayRef<int> Mask,
   7290                                               const X86Subtarget *Subtarget,
   7291                                               SelectionDAG &DAG) {
   7292   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
   7293 
   7294   int NumElts = Mask.size();
   7295   int NumLanes = VT.getSizeInBits() / 128;
   7296   int NumLaneElts = NumElts / NumLanes;
   7297 
   7298   // We need to detect various ways of spelling a rotation:
   7299   //   [11, 12, 13, 14, 15,  0,  1,  2]
   7300   //   [-1, 12, 13, 14, -1, -1,  1, -1]
   7301   //   [-1, -1, -1, -1, -1, -1,  1,  2]
   7302   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
   7303   //   [-1,  4,  5,  6, -1, -1,  9, -1]
   7304   //   [-1,  4,  5,  6, -1, -1, -1, -1]
   7305   int Rotation = 0;
   7306   SDValue Lo, Hi;
   7307   for (int l = 0; l < NumElts; l += NumLaneElts) {
   7308     for (int i = 0; i < NumLaneElts; ++i) {
   7309       if (Mask[l + i] == -1)
   7310         continue;
   7311       assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
   7312 
    7313       // Get the mod-Size index and lane-correct it.
   7314       int LaneIdx = (Mask[l + i] % NumElts) - l;
   7315       // Make sure it was in this lane.
   7316       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
   7317         return SDValue();
   7318 
   7319       // Determine where a rotated vector would have started.
   7320       int StartIdx = i - LaneIdx;
   7321       if (StartIdx == 0)
   7322         // The identity rotation isn't interesting, stop.
   7323         return SDValue();
   7324 
    7325       // If we found the tail of a vector, the rotation must be the missing
    7326       // front. If we found the head of a vector, the rotation is how much
    7327       // of the head is present.
   7328       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
   7329 
   7330       if (Rotation == 0)
   7331         Rotation = CandidateRotation;
   7332       else if (Rotation != CandidateRotation)
   7333         // The rotations don't match, so we can't match this mask.
   7334         return SDValue();
   7335 
   7336       // Compute which value this mask is pointing at.
   7337       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
   7338 
   7339       // Compute which of the two target values this index should be assigned
   7340       // to. This reflects whether the high elements are remaining or the low
   7341       // elements are remaining.
   7342       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
   7343 
   7344       // Either set up this value if we've not encountered it before, or check
   7345       // that it remains consistent.
   7346       if (!TargetV)
   7347         TargetV = MaskV;
   7348       else if (TargetV != MaskV)
   7349         // This may be a rotation, but it pulls from the inputs in some
   7350         // unsupported interleaving.
   7351         return SDValue();
   7352     }
   7353   }
   7354 
   7355   // Check that we successfully analyzed the mask, and normalize the results.
   7356   assert(Rotation != 0 && "Failed to locate a viable rotation!");
   7357   assert((Lo || Hi) && "Failed to find a rotated input vector!");
   7358   if (!Lo)
   7359     Lo = Hi;
   7360   else if (!Hi)
   7361     Hi = Lo;
   7362 
   7363   // The actual rotate instruction rotates bytes, so we need to scale the
   7364   // rotation based on how many bytes are in the vector lane.
   7365   int Scale = 16 / NumLaneElts;
   7366 
   7367   // SSSE3 targets can use the palignr instruction.
   7368   if (Subtarget->hasSSSE3()) {
   7369     // Cast the inputs to i8 vector of correct length to match PALIGNR.
   7370     MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
   7371     Lo = DAG.getBitcast(AlignVT, Lo);
   7372     Hi = DAG.getBitcast(AlignVT, Hi);
   7373 
   7374     return DAG.getBitcast(
   7375         VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi,
   7376                         DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
   7377   }
   7378 
   7379   assert(VT.is128BitVector() &&
   7380          "Rotate-based lowering only supports 128-bit lowering!");
   7381   assert(Mask.size() <= 16 &&
   7382          "Can shuffle at most 16 bytes in a 128-bit vector!");
   7383 
   7384   // Default SSE2 implementation
   7385   int LoByteShift = 16 - Rotation * Scale;
   7386   int HiByteShift = Rotation * Scale;
   7387 
   7388   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
   7389   Lo = DAG.getBitcast(MVT::v2i64, Lo);
   7390   Hi = DAG.getBitcast(MVT::v2i64, Hi);
   7391 
   7392   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
   7393                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
   7394   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
   7395                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
   7396   return DAG.getBitcast(VT,
   7397                         DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
   7398 }
   7399 
   7400 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
   7401 ///
   7402 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
   7403 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
   7404 /// matches elements from one of the input vectors shuffled to the left or
   7405 /// right with zeroable elements 'shifted in'. It handles both the strictly
   7406 /// bit-wise element shifts and the byte shift across an entire 128-bit double
   7407 /// quad word lane.
   7408 ///
   7409 /// PSHL : (little-endian) left bit shift.
    7410 /// PSLL : (little-endian) left bit shift.
   7411 /// [ -1, 4, zz, -1 ]
   7412 /// PSRL : (little-endian) right bit shift.
   7413 /// [  1, zz,  3, zz]
   7414 /// [ -1, -1,  7, zz]
   7415 /// PSLLDQ : (little-endian) left byte shift
   7416 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
   7417 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
   7418 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
   7419 /// PSRLDQ : (little-endian) right byte shift
   7420 /// [  5, 6,  7, zz, zz, zz, zz, zz]
   7421 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
   7422 /// [  1, 2, -1, -1, -1, -1, zz, zz]
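         ///
         /// For example, the v4i32 mask [ zz, 0, zz, 2 ] above is matched with
         /// Scale == 2 and Shift == 1 and emitted as a v2i64 VSHLI by 32 bits.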
   7423 static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
   7424                                          SDValue V2, ArrayRef<int> Mask,
   7425                                          SelectionDAG &DAG) {
   7426   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7427 
   7428   int Size = Mask.size();
   7429   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
   7430 
   7431   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
   7432     for (int i = 0; i < Size; i += Scale)
   7433       for (int j = 0; j < Shift; ++j)
   7434         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
   7435           return false;
   7436 
   7437     return true;
   7438   };
   7439 
   7440   auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
   7441     for (int i = 0; i != Size; i += Scale) {
   7442       unsigned Pos = Left ? i + Shift : i;
   7443       unsigned Low = Left ? i : i + Shift;
   7444       unsigned Len = Scale - Shift;
   7445       if (!isSequentialOrUndefInRange(Mask, Pos, Len,
   7446                                       Low + (V == V1 ? 0 : Size)))
   7447         return SDValue();
   7448     }
   7449 
   7450     int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
   7451     bool ByteShift = ShiftEltBits > 64;
   7452     unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
   7453                            : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
   7454     int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
   7455 
   7456     // Normalize the scale for byte shifts to still produce an i64 element
   7457     // type.
   7458     Scale = ByteShift ? Scale / 2 : Scale;
   7459 
   7460     // We need to round trip through the appropriate type for the shift.
   7461     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
   7462     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
   7463     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
   7464            "Illegal integer vector type");
   7465     V = DAG.getBitcast(ShiftVT, V);
   7466 
   7467     V = DAG.getNode(OpCode, DL, ShiftVT, V,
   7468                     DAG.getConstant(ShiftAmt, DL, MVT::i8));
   7469     return DAG.getBitcast(VT, V);
   7470   };
   7471 
   7472   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
   7473   // keep doubling the size of the integer elements up to that. We can
   7474   // then shift the elements of the integer vector by whole multiples of
   7475   // their width within the elements of the larger integer vector. Test each
   7476   // multiple to see if we can find a match with the moved element indices
   7477   // and that the shifted in elements are all zeroable.
   7478   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
   7479     for (int Shift = 1; Shift != Scale; ++Shift)
   7480       for (bool Left : {true, false})
   7481         if (CheckZeros(Shift, Scale, Left))
   7482           for (SDValue V : {V1, V2})
   7483             if (SDValue Match = MatchShift(Shift, Scale, Left, V))
   7484               return Match;
   7485 
   7486   // no match
   7487   return SDValue();
   7488 }
   7489 
   7490 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
   7491 static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
   7492                                            SDValue V2, ArrayRef<int> Mask,
   7493                                            SelectionDAG &DAG) {
   7494   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7495   assert(!Zeroable.all() && "Fully zeroable shuffle mask");
   7496 
   7497   int Size = Mask.size();
   7498   int HalfSize = Size / 2;
   7499   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
   7500 
   7501   // Upper half must be undefined.
   7502   if (!isUndefInRange(Mask, HalfSize, HalfSize))
   7503     return SDValue();
   7504 
   7505   // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
   7506   // Remainder of lower half result is zero and upper half is all undef.
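           // For example, the v8i16 mask {1, 2, 3, -1, -1, -1, -1, -1} extracts
           // 3 elements starting at element 1, i.e. EXTRQI with BitLen = 48 and
           // BitIdx = 16.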
   7507   auto LowerAsEXTRQ = [&]() {
   7508     // Determine the extraction length from the part of the
   7509     // lower half that isn't zeroable.
   7510     int Len = HalfSize;
   7511     for (; Len > 0; --Len)
   7512       if (!Zeroable[Len - 1])
   7513         break;
   7514     assert(Len > 0 && "Zeroable shuffle mask");
   7515 
   7516     // Attempt to match first Len sequential elements from the lower half.
   7517     SDValue Src;
   7518     int Idx = -1;
   7519     for (int i = 0; i != Len; ++i) {
   7520       int M = Mask[i];
   7521       if (M < 0)
   7522         continue;
   7523       SDValue &V = (M < Size ? V1 : V2);
   7524       M = M % Size;
   7525 
   7526       // The extracted elements must start at a valid index and all mask
   7527       // elements must be in the lower half.
   7528       if (i > M || M >= HalfSize)
   7529         return SDValue();
   7530 
   7531       if (Idx < 0 || (Src == V && Idx == (M - i))) {
   7532         Src = V;
   7533         Idx = M - i;
   7534         continue;
   7535       }
   7536       return SDValue();
   7537     }
   7538 
   7539     if (Idx < 0)
   7540       return SDValue();
   7541 
   7542     assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
   7543     int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
   7544     int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
   7545     return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
   7546                        DAG.getConstant(BitLen, DL, MVT::i8),
   7547                        DAG.getConstant(BitIdx, DL, MVT::i8));
   7548   };
   7549 
   7550   if (SDValue ExtrQ = LowerAsEXTRQ())
   7551     return ExtrQ;
   7552 
   7553   // INSERTQ: Extract lowest Len elements from lower half of second source and
   7554   // insert over first source, starting at Idx.
   7555   // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
   7556   auto LowerAsInsertQ = [&]() {
   7557     for (int Idx = 0; Idx != HalfSize; ++Idx) {
   7558       SDValue Base;
   7559 
   7560       // Attempt to match first source from mask before insertion point.
   7561       if (isUndefInRange(Mask, 0, Idx)) {
   7562         /* EMPTY */
   7563       } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
   7564         Base = V1;
   7565       } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
   7566         Base = V2;
   7567       } else {
   7568         continue;
   7569       }
   7570 
   7571       // Extend the extraction length looking to match both the insertion of
   7572       // the second source and the remaining elements of the first.
   7573       for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
   7574         SDValue Insert;
   7575         int Len = Hi - Idx;
   7576 
   7577         // Match insertion.
   7578         if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
   7579           Insert = V1;
   7580         } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
   7581           Insert = V2;
   7582         } else {
   7583           continue;
   7584         }
   7585 
   7586         // Match the remaining elements of the lower half.
   7587         if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
   7588           /* EMPTY */
   7589         } else if ((!Base || (Base == V1)) &&
   7590                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
   7591           Base = V1;
   7592         } else if ((!Base || (Base == V2)) &&
   7593                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
   7594                                               Size + Hi)) {
   7595           Base = V2;
   7596         } else {
   7597           continue;
   7598         }
   7599 
   7600         // We may not have a base (first source) - this can safely be undefined.
   7601         if (!Base)
   7602           Base = DAG.getUNDEF(VT);
   7603 
   7604         int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
   7605         int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
   7606         return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
   7607                            DAG.getConstant(BitLen, DL, MVT::i8),
   7608                            DAG.getConstant(BitIdx, DL, MVT::i8));
   7609       }
   7610     }
   7611 
   7612     return SDValue();
   7613   };
   7614 
   7615   if (SDValue InsertQ = LowerAsInsertQ())
   7616     return InsertQ;
   7617 
   7618   return SDValue();
   7619 }
   7620 
   7621 /// \brief Lower a vector shuffle as a zero or any extension.
   7622 ///
   7623 /// Given a specific number of elements, element bit width, and extension
   7624 /// stride, produce either a zero or any extension based on the available
   7625 /// features of the subtarget. The extended elements are consecutive and
    7626 /// can begin at an offset element index in the input; to avoid excess
    7627 /// shuffling, the offset must either be in the bottom lane
   7628 /// or at the start of a higher lane. All extended elements must be from
   7629 /// the same lane.
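         ///
         /// For example, zero-extending a v8i16 input with Scale == 2 and
         /// Offset == 0 uses a single X86ISD::VZEXT (PMOVZXWD) on SSE4.1 targets
         /// and otherwise falls back to an UNPCKL of the input with a zero vector.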
   7630 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   7631     SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
   7632     ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   7633   assert(Scale > 1 && "Need a scale to extend.");
   7634   int EltBits = VT.getScalarSizeInBits();
   7635   int NumElements = VT.getVectorNumElements();
   7636   int NumEltsPerLane = 128 / EltBits;
   7637   int OffsetLane = Offset / NumEltsPerLane;
   7638   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
   7639          "Only 8, 16, and 32 bit elements can be extended.");
   7640   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
   7641   assert(0 <= Offset && "Extension offset must be positive.");
   7642   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
   7643          "Extension offset must be in the first lane or start an upper lane.");
   7644 
    7645   // Check that an index is in the same lane as the base offset.
   7646   auto SafeOffset = [&](int Idx) {
   7647     return OffsetLane == (Idx / NumEltsPerLane);
   7648   };
   7649 
   7650   // Shift along an input so that the offset base moves to the first element.
   7651   auto ShuffleOffset = [&](SDValue V) {
   7652     if (!Offset)
   7653       return V;
   7654 
   7655     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
   7656     for (int i = 0; i * Scale < NumElements; ++i) {
   7657       int SrcIdx = i + Offset;
   7658       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
   7659     }
   7660     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
   7661   };
   7662 
   7663   // Found a valid zext mask! Try various lowering strategies based on the
   7664   // input type and available ISA extensions.
   7665   if (Subtarget->hasSSE41()) {
    7666     // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
   7667     // PUNPCK will catch this in a later shuffle match.
   7668     if (Offset && Scale == 2 && VT.is128BitVector())
   7669       return SDValue();
   7670     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
   7671                                  NumElements / Scale);
   7672     InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV));
   7673     return DAG.getBitcast(VT, InputV);
   7674   }
   7675 
   7676   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
   7677 
   7678   // For any extends we can cheat for larger element sizes and use shuffle
   7679   // instructions that can fold with a load and/or copy.
   7680   if (AnyExt && EltBits == 32) {
   7681     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
   7682                          -1};
   7683     return DAG.getBitcast(
   7684         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   7685                         DAG.getBitcast(MVT::v4i32, InputV),
   7686                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   7687   }
   7688   if (AnyExt && EltBits == 16 && Scale > 2) {
   7689     int PSHUFDMask[4] = {Offset / 2, -1,
   7690                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
   7691     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   7692                          DAG.getBitcast(MVT::v4i32, InputV),
   7693                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
   7694     int PSHUFWMask[4] = {1, -1, -1, -1};
   7695     unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
   7696     return DAG.getBitcast(
   7697         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
   7698                         DAG.getBitcast(MVT::v8i16, InputV),
   7699                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
   7700   }
   7701 
   7702   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
   7703   // to 64-bits.
   7704   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
   7705     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
   7706     assert(VT.is128BitVector() && "Unexpected vector width!");
   7707 
   7708     int LoIdx = Offset * EltBits;
   7709     SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
   7710                              DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
   7711                                          DAG.getConstant(EltBits, DL, MVT::i8),
   7712                                          DAG.getConstant(LoIdx, DL, MVT::i8)));
   7713 
   7714     if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
   7715         !SafeOffset(Offset + 1))
   7716       return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
   7717 
   7718     int HiIdx = (Offset + 1) * EltBits;
   7719     SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
   7720                              DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
   7721                                          DAG.getConstant(EltBits, DL, MVT::i8),
   7722                                          DAG.getConstant(HiIdx, DL, MVT::i8)));
   7723     return DAG.getNode(ISD::BITCAST, DL, VT,
   7724                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
   7725   }
   7726 
   7727   // If this would require more than 2 unpack instructions to expand, use
   7728   // pshufb when available. We can only use more than 2 unpack instructions
   7729   // when zero extending i8 elements which also makes it easier to use pshufb.
   7730   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
   7731     assert(NumElements == 16 && "Unexpected byte vector width!");
   7732     SDValue PSHUFBMask[16];
   7733     for (int i = 0; i < 16; ++i) {
   7734       int Idx = Offset + (i / Scale);
   7735       PSHUFBMask[i] = DAG.getConstant(
   7736           (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
   7737     }
   7738     InputV = DAG.getBitcast(MVT::v16i8, InputV);
   7739     return DAG.getBitcast(VT,
   7740                           DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
   7741                                       DAG.getNode(ISD::BUILD_VECTOR, DL,
   7742                                                   MVT::v16i8, PSHUFBMask)));
   7743   }
   7744 
   7745   // If we are extending from an offset, ensure we start on a boundary that
   7746   // we can unpack from.
   7747   int AlignToUnpack = Offset % (NumElements / Scale);
   7748   if (AlignToUnpack) {
   7749     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
   7750     for (int i = AlignToUnpack; i < NumElements; ++i)
   7751       ShMask[i - AlignToUnpack] = i;
   7752     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
   7753     Offset -= AlignToUnpack;
   7754   }
   7755 
   7756   // Otherwise emit a sequence of unpacks.
   7757   do {
   7758     unsigned UnpackLoHi = X86ISD::UNPCKL;
   7759     if (Offset >= (NumElements / 2)) {
   7760       UnpackLoHi = X86ISD::UNPCKH;
   7761       Offset -= (NumElements / 2);
   7762     }
   7763 
   7764     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
   7765     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
   7766                          : getZeroVector(InputVT, Subtarget, DAG, DL);
   7767     InputV = DAG.getBitcast(InputVT, InputV);
   7768     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
   7769     Scale /= 2;
   7770     EltBits *= 2;
   7771     NumElements /= 2;
   7772   } while (Scale > 1);
   7773   return DAG.getBitcast(VT, InputV);
   7774 }
   7775 
   7776 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
   7777 ///
   7778 /// This routine will try to do everything in its power to cleverly lower
   7779 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
    7780 /// check for the profitability of this lowering; it tries to aggressively
   7781 /// match this pattern. It will use all of the micro-architectural details it
   7782 /// can to emit an efficient lowering. It handles both blends with all-zero
    7783 /// inputs to explicitly zero-extend, and undef lanes (sometimes undef due to
   7784 /// masking out later).
   7785 ///
   7786 /// The reason we have dedicated lowering for zext-style shuffles is that they
   7787 /// are both incredibly common and often quite performance sensitive.
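         ///
         /// For example, with V2 all-zeros the v4i32 mask {0, 4, 1, 5} is
         /// recognized here with Scale == 2: every odd element is zeroable and the
         /// even elements are consecutive indices into V1.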
   7788 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
   7789     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   7790     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   7791   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7792 
   7793   int Bits = VT.getSizeInBits();
   7794   int NumLanes = Bits / 128;
   7795   int NumElements = VT.getVectorNumElements();
   7796   int NumEltsPerLane = NumElements / NumLanes;
   7797   assert(VT.getScalarSizeInBits() <= 32 &&
   7798          "Exceeds 32-bit integer zero extension limit");
   7799   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
   7800 
   7801   // Define a helper function to check a particular ext-scale and lower to it if
   7802   // valid.
   7803   auto Lower = [&](int Scale) -> SDValue {
   7804     SDValue InputV;
   7805     bool AnyExt = true;
   7806     int Offset = 0;
   7807     int Matches = 0;
   7808     for (int i = 0; i < NumElements; ++i) {
   7809       int M = Mask[i];
   7810       if (M == -1)
   7811         continue; // Valid anywhere but doesn't tell us anything.
   7812       if (i % Scale != 0) {
   7813         // Each of the extended elements need to be zeroable.
   7814         if (!Zeroable[i])
   7815           return SDValue();
   7816 
   7817         // We no longer are in the anyext case.
   7818         AnyExt = false;
   7819         continue;
   7820       }
   7821 
   7822       // Each of the base elements needs to be consecutive indices into the
   7823       // same input vector.
   7824       SDValue V = M < NumElements ? V1 : V2;
   7825       M = M % NumElements;
   7826       if (!InputV) {
   7827         InputV = V;
   7828         Offset = M - (i / Scale);
   7829       } else if (InputV != V)
   7830         return SDValue(); // Flip-flopping inputs.
   7831 
   7832       // Offset must start in the lowest 128-bit lane or at the start of an
   7833       // upper lane.
   7834       // FIXME: Is it ever worth allowing a negative base offset?
   7835       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
   7836             (Offset % NumEltsPerLane) == 0))
   7837         return SDValue();
   7838 
   7839       // If we are offsetting, all referenced entries must come from the same
   7840       // lane.
   7841       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
   7842         return SDValue();
   7843 
   7844       if ((M % NumElements) != (Offset + (i / Scale)))
   7845         return SDValue(); // Non-consecutive strided elements.
   7846       Matches++;
   7847     }
   7848 
   7849     // If we fail to find an input, we have a zero-shuffle which should always
   7850     // have already been handled.
   7851     // FIXME: Maybe handle this here in case during blending we end up with one?
   7852     if (!InputV)
   7853       return SDValue();
   7854 
    7855     // If we are offsetting, don't extend if we only match a single input; we
   7856     // can always do better by using a basic PSHUF or PUNPCK.
   7857     if (Offset != 0 && Matches < 2)
   7858       return SDValue();
   7859 
   7860     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   7861         DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
   7862   };
   7863 
   7864   // The widest scale possible for extending is to a 64-bit integer.
   7865   assert(Bits % 64 == 0 &&
   7866          "The number of bits in a vector must be divisible by 64 on x86!");
   7867   int NumExtElements = Bits / 64;
   7868 
   7869   // Each iteration, try extending the elements half as much, but into twice as
   7870   // many elements.
   7871   for (; NumExtElements < NumElements; NumExtElements *= 2) {
   7872     assert(NumElements % NumExtElements == 0 &&
   7873            "The input vector size must be divisible by the extended size.");
   7874     if (SDValue V = Lower(NumElements / NumExtElements))
   7875       return V;
   7876   }
   7877 
   7878   // General extends failed, but 128-bit vectors may be able to use MOVQ.
   7879   if (Bits != 128)
   7880     return SDValue();
   7881 
   7882   // Returns one of the source operands if the shuffle can be reduced to a
   7883   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
   7884   auto CanZExtLowHalf = [&]() {
   7885     for (int i = NumElements / 2; i != NumElements; ++i)
   7886       if (!Zeroable[i])
   7887         return SDValue();
   7888     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
   7889       return V1;
   7890     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
   7891       return V2;
   7892     return SDValue();
   7893   };
   7894 
   7895   if (SDValue V = CanZExtLowHalf()) {
   7896     V = DAG.getBitcast(MVT::v2i64, V);
   7897     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
   7898     return DAG.getBitcast(VT, V);
   7899   }
   7900 
   7901   // No viable ext lowering found.
   7902   return SDValue();
   7903 }
   7904 
   7905 /// \brief Try to get a scalar value for a specific element of a vector.
   7906 ///
   7907 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
   7908 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
   7909                                               SelectionDAG &DAG) {
   7910   MVT VT = V.getSimpleValueType();
   7911   MVT EltVT = VT.getVectorElementType();
   7912   while (V.getOpcode() == ISD::BITCAST)
   7913     V = V.getOperand(0);
   7914   // If the bitcasts shift the element size, we can't extract an equivalent
   7915   // element from it.
   7916   MVT NewVT = V.getSimpleValueType();
   7917   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
   7918     return SDValue();
   7919 
   7920   if (V.getOpcode() == ISD::BUILD_VECTOR ||
   7921       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
   7922     // Ensure the scalar operand is the same size as the destination.
   7923     // FIXME: Add support for scalar truncation where possible.
   7924     SDValue S = V.getOperand(Idx);
   7925     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
   7926       return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S);
   7927   }
   7928 
   7929   return SDValue();
   7930 }
   7931 
   7932 /// \brief Helper to test for a load that can be folded with x86 shuffles.
   7933 ///
   7934 /// This is particularly important because the set of instructions varies
   7935 /// significantly based on whether the operand is a load or not.
   7936 static bool isShuffleFoldableLoad(SDValue V) {
   7937   while (V.getOpcode() == ISD::BITCAST)
   7938     V = V.getOperand(0);
   7939 
   7940   return ISD::isNON_EXTLoad(V.getNode());
   7941 }
   7942 
   7943 /// \brief Try to lower insertion of a single element into a zero vector.
   7944 ///
   7945 /// This is a common pattern that we have especially efficient patterns to lower
   7946 /// across all subtarget feature sets.
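         ///
         /// For example, the v4f32 mask {4, -1, -1, -1} selects the low element of
         /// V2 and leaves the remaining lanes undef/zero; it is emitted below as a
         /// VZEXT_MOVL of V2.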
   7947 static SDValue lowerVectorShuffleAsElementInsertion(
   7948     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   7949     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   7950   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7951   MVT ExtVT = VT;
   7952   MVT EltVT = VT.getVectorElementType();
   7953 
   7954   int V2Index = std::find_if(Mask.begin(), Mask.end(),
   7955                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
   7956                 Mask.begin();
   7957   bool IsV1Zeroable = true;
   7958   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   7959     if (i != V2Index && !Zeroable[i]) {
   7960       IsV1Zeroable = false;
   7961       break;
   7962     }
   7963 
   7964   // Check for a single input from a SCALAR_TO_VECTOR node.
   7965   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
   7966   // all the smarts here sunk into that routine. However, the current
   7967   // lowering of BUILD_VECTOR makes that nearly impossible until the old
   7968   // vector shuffle lowering is dead.
   7969   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
   7970                                                DAG);
   7971   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
   7972     // We need to zext the scalar if it is smaller than an i32.
   7973     V2S = DAG.getBitcast(EltVT, V2S);
   7974     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
   7975       // Using zext to expand a narrow element won't work for non-zero
   7976       // insertions.
   7977       if (!IsV1Zeroable)
   7978         return SDValue();
   7979 
   7980       // Zero-extend directly to i32.
   7981       ExtVT = MVT::v4i32;
   7982       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
   7983     }
   7984     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
   7985   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
   7986              EltVT == MVT::i16) {
   7987     // Either not inserting from the low element of the input or the input
   7988     // element size is too small to use VZEXT_MOVL to clear the high bits.
   7989     return SDValue();
   7990   }
   7991 
   7992   if (!IsV1Zeroable) {
   7993     // If V1 can't be treated as a zero vector we have fewer options to lower
   7994     // this. We can't support integer vectors or non-zero targets cheaply, and
   7995     // the V1 elements can't be permuted in any way.
   7996     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
   7997     if (!VT.isFloatingPoint() || V2Index != 0)
   7998       return SDValue();
   7999     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
   8000     V1Mask[V2Index] = -1;
   8001     if (!isNoopShuffleMask(V1Mask))
   8002       return SDValue();
   8003     // This is essentially a special case blend operation, but if we have
   8004     // general purpose blend operations, they are always faster. Bail and let
   8005     // the rest of the lowering handle these as blends.
   8006     if (Subtarget->hasSSE41())
   8007       return SDValue();
   8008 
   8009     // Otherwise, use MOVSD or MOVSS.
   8010     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
   8011            "Only two types of floating point element types to handle!");
   8012     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
   8013                        ExtVT, V1, V2);
   8014   }
   8015 
   8016   // This lowering only works for the low element with floating point vectors.
   8017   if (VT.isFloatingPoint() && V2Index != 0)
   8018     return SDValue();
   8019 
   8020   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
   8021   if (ExtVT != VT)
   8022     V2 = DAG.getBitcast(VT, V2);
   8023 
   8024   if (V2Index != 0) {
   8025     // If we have 4 or fewer lanes we can cheaply shuffle the element into
   8026     // the desired position. Otherwise it is more efficient to do a vector
   8027     // shift left. We know that we can do a vector shift left because all
   8028     // the inputs are zero.
   8029     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
   8030       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
   8031       V2Shuffle[V2Index] = 0;
   8032       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
   8033     } else {
   8034       V2 = DAG.getBitcast(MVT::v2i64, V2);
   8035       V2 = DAG.getNode(
   8036           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
   8037           DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
   8038                           DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
   8039                               DAG.getDataLayout(), VT)));
   8040       V2 = DAG.getBitcast(VT, V2);
   8041     }
   8042   }
   8043   return V2;
   8044 }
   8045 
    8046 /// \brief Try to lower a broadcast of a single (truncated) integer element,
   8047 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
   8048 ///
   8049 /// This assumes we have AVX2.
   8050 static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
   8051                                                   int BroadcastIdx,
   8052                                                   const X86Subtarget *Subtarget,
   8053                                                   SelectionDAG &DAG) {
   8054   assert(Subtarget->hasAVX2() &&
   8055          "We can only lower integer broadcasts with AVX2!");
   8056 
   8057   EVT EltVT = VT.getVectorElementType();
   8058   EVT V0VT = V0.getValueType();
   8059 
   8060   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
   8061   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
   8062 
   8063   EVT V0EltVT = V0VT.getVectorElementType();
   8064   if (!V0EltVT.isInteger())
   8065     return SDValue();
   8066 
   8067   const unsigned EltSize = EltVT.getSizeInBits();
   8068   const unsigned V0EltSize = V0EltVT.getSizeInBits();
   8069 
   8070   // This is only a truncation if the original element type is larger.
   8071   if (V0EltSize <= EltSize)
   8072     return SDValue();
   8073 
   8074   assert(((V0EltSize % EltSize) == 0) &&
   8075          "Scalar type sizes must all be powers of 2 on x86!");
   8076 
   8077   const unsigned V0Opc = V0.getOpcode();
   8078   const unsigned Scale = V0EltSize / EltSize;
   8079   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
   8080 
   8081   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
   8082       V0Opc != ISD::BUILD_VECTOR)
   8083     return SDValue();
   8084 
   8085   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
   8086 
   8087   // If we're extracting non-least-significant bits, shift so we can truncate.
   8088   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
   8089   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
   8090   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
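           // For example (illustrative): with a v4i32 broadcast of element 3 where
           // V0 is a v2i64 build_vector, Scale = 2, V0BroadcastIdx = 1 and
           // OffsetIdx = 1, so the i64 scalar is shifted right by 32 bits before
           // the truncate.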
   8091   if (const int OffsetIdx = BroadcastIdx % Scale)
   8092     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
   8093             DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
   8094 
   8095   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
   8096                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
   8097 }
   8098 
   8099 /// \brief Try to lower broadcast of a single element.
   8100 ///
   8101 /// For convenience, this code also bundles all of the subtarget feature set
   8102 /// filtering. While a little annoying to re-dispatch on type here, there isn't
   8103 /// a convenient way to factor it out.
   8104 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
   8105 static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
   8106                                              ArrayRef<int> Mask,
   8107                                              const X86Subtarget *Subtarget,
   8108                                              SelectionDAG &DAG) {
   8109   if (!Subtarget->hasAVX())
   8110     return SDValue();
   8111   if (VT.isInteger() && !Subtarget->hasAVX2())
   8112     return SDValue();
   8113 
   8114   // Check that the mask is a broadcast.
   8115   int BroadcastIdx = -1;
   8116   for (int M : Mask)
   8117     if (M >= 0 && BroadcastIdx == -1)
   8118       BroadcastIdx = M;
   8119     else if (M >= 0 && M != BroadcastIdx)
   8120       return SDValue();
   8121 
   8122   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
   8123                                             "a sorted mask where the broadcast "
   8124                                             "comes from V1.");
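           // For example (illustrative): a v4f32 mask <1, 1, 1, 1>, or <1, -1, 1, -1>
           // with undefs, is a broadcast of element 1, so BroadcastIdx == 1.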
   8125 
   8126   // Go up the chain of (vector) values to find a scalar load that we can
   8127   // combine with the broadcast.
   8128   for (;;) {
   8129     switch (V.getOpcode()) {
   8130     case ISD::CONCAT_VECTORS: {
   8131       int OperandSize = Mask.size() / V.getNumOperands();
   8132       V = V.getOperand(BroadcastIdx / OperandSize);
   8133       BroadcastIdx %= OperandSize;
   8134       continue;
   8135     }
   8136 
   8137     case ISD::INSERT_SUBVECTOR: {
   8138       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
   8139       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
   8140       if (!ConstantIdx)
   8141         break;
   8142 
   8143       int BeginIdx = (int)ConstantIdx->getZExtValue();
   8144       int EndIdx =
   8145           BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
   8146       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
   8147         BroadcastIdx -= BeginIdx;
   8148         V = VInner;
   8149       } else {
   8150         V = VOuter;
   8151       }
   8152       continue;
   8153     }
   8154     }
   8155     break;
   8156   }
   8157 
   8158   // Check if this is a broadcast of a scalar. We special case lowering
   8159   // for scalars so that we can more effectively fold with loads.
   8160   // First, look through bitcast: if the original value has a larger element
   8161   // type than the shuffle, the broadcast element is in essence truncated.
   8162   // Make that explicit to ease folding.
   8163   if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
   8164     if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
   8165             DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
   8166       return TruncBroadcast;
   8167 
   8168   // Also check the simpler case, where we can directly reuse the scalar.
   8169   if (V.getOpcode() == ISD::BUILD_VECTOR ||
   8170       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
   8171     V = V.getOperand(BroadcastIdx);
   8172 
   8173     // If the scalar isn't a load, we can't broadcast from it in AVX1.
   8174     // Only AVX2 has register broadcasts.
   8175     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
   8176       return SDValue();
   8177   } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
   8178     // If we are broadcasting a load that is only used by the shuffle
   8179     // then we can reduce the vector load to the broadcasted scalar load.
   8180     LoadSDNode *Ld = cast<LoadSDNode>(V);
   8181     SDValue BaseAddr = Ld->getOperand(1);
   8182     EVT AddrVT = BaseAddr.getValueType();
   8183     EVT SVT = VT.getScalarType();
   8184     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
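             // For example (illustrative): broadcasting element 2 of a v4f32 load
             // gives SVT = f32 and Offset = 8, so we emit a narrow scalar load from
             // BaseAddr + 8 and feed it straight into VBROADCAST.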
   8185     SDValue NewAddr = DAG.getNode(
   8186         ISD::ADD, DL, AddrVT, BaseAddr,
   8187         DAG.getConstant(Offset, DL, AddrVT));
   8188     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
   8189                     DAG.getMachineFunction().getMachineMemOperand(
   8190                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
   8191   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
   8192     // We can't broadcast from a vector register without AVX2, and we can only
   8193     // broadcast from the zero-element of a vector register.
   8194     return SDValue();
   8195   }
   8196 
   8197   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
   8198 }
   8199 
    8200 // Check whether we can use INSERTPS to perform the shuffle. We only use
   8201 // INSERTPS when the V1 elements are already in the correct locations
   8202 // because otherwise we can just always use two SHUFPS instructions which
   8203 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
   8204 // perform INSERTPS if a single V1 element is out of place and all V2
   8205 // elements are zeroable.
   8206 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
   8207                                             ArrayRef<int> Mask,
   8208                                             SelectionDAG &DAG) {
   8209   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
   8210   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   8211   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   8212   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   8213 
   8214   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   8215 
   8216   unsigned ZMask = 0;
   8217   int V1DstIndex = -1;
   8218   int V2DstIndex = -1;
   8219   bool V1UsedInPlace = false;
   8220 
   8221   for (int i = 0; i < 4; ++i) {
   8222     // Synthesize a zero mask from the zeroable elements (includes undefs).
   8223     if (Zeroable[i]) {
   8224       ZMask |= 1 << i;
   8225       continue;
   8226     }
   8227 
   8228     // Flag if we use any V1 inputs in place.
   8229     if (i == Mask[i]) {
   8230       V1UsedInPlace = true;
   8231       continue;
   8232     }
   8233 
   8234     // We can only insert a single non-zeroable element.
   8235     if (V1DstIndex != -1 || V2DstIndex != -1)
   8236       return SDValue();
   8237 
   8238     if (Mask[i] < 4) {
   8239       // V1 input out of place for insertion.
   8240       V1DstIndex = i;
   8241     } else {
   8242       // V2 input for insertion.
   8243       V2DstIndex = i;
   8244     }
   8245   }
   8246 
   8247   // Don't bother if we have no (non-zeroable) element for insertion.
   8248   if (V1DstIndex == -1 && V2DstIndex == -1)
   8249     return SDValue();
   8250 
   8251   // Determine element insertion src/dst indices. The src index is from the
   8252   // start of the inserted vector, not the start of the concatenated vector.
   8253   unsigned V2SrcIndex = 0;
   8254   if (V1DstIndex != -1) {
   8255     // If we have a V1 input out of place, we use V1 as the V2 element insertion
   8256     // and don't use the original V2 at all.
   8257     V2SrcIndex = Mask[V1DstIndex];
   8258     V2DstIndex = V1DstIndex;
   8259     V2 = V1;
   8260   } else {
   8261     V2SrcIndex = Mask[V2DstIndex] - 4;
   8262   }
   8263 
   8264   // If no V1 inputs are used in place, then the result is created only from
   8265   // the zero mask and the V2 insertion - so remove V1 dependency.
   8266   if (!V1UsedInPlace)
   8267     V1 = DAG.getUNDEF(MVT::v4f32);
   8268 
   8269   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
   8270   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
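           // For example (illustrative): the mask <0, 5, 2, 3> with nothing zeroable
           // gives V2DstIndex = 1 and V2SrcIndex = 1, so InsertPSMask = 0x50 (take
           // source lane 1, write destination lane 1, zero nothing).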
   8271 
   8272   // Insert the V2 element into the desired position.
   8273   SDLoc DL(Op);
   8274   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
   8275                      DAG.getConstant(InsertPSMask, DL, MVT::i8));
   8276 }
   8277 
   8278 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
   8279 /// UNPCK instruction.
   8280 ///
    8281 /// This specifically targets cases where we end up alternating between
   8282 /// the two inputs, and so can permute them into something that feeds a single
   8283 /// UNPCK instruction. Note that this routine only targets integer vectors
   8284 /// because for floating point vectors we have a generalized SHUFPS lowering
   8285 /// strategy that handles everything that doesn't *exactly* match an unpack,
   8286 /// making this clever lowering unnecessary.
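         ///
         /// For example (illustrative): a v4i32 mask <0, 6, 1, 7> is handled by
         /// shuffling V2's high dwords down into its low half and then emitting a
         /// single UNPCKL of V1 and the shuffled V2.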
   8287 static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
   8288                                                     SDValue V1, SDValue V2,
   8289                                                     ArrayRef<int> Mask,
   8290                                                     SelectionDAG &DAG) {
   8291   assert(!VT.isFloatingPoint() &&
   8292          "This routine only supports integer vectors.");
   8293   assert(!isSingleInputShuffleMask(Mask) &&
   8294          "This routine should only be used when blending two inputs.");
   8295   assert(Mask.size() >= 2 && "Single element masks are invalid.");
   8296 
   8297   int Size = Mask.size();
   8298 
   8299   int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
   8300     return M >= 0 && M % Size < Size / 2;
   8301   });
   8302   int NumHiInputs = std::count_if(
   8303       Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
   8304 
   8305   bool UnpackLo = NumLoInputs >= NumHiInputs;
   8306 
   8307   auto TryUnpack = [&](MVT UnpackVT, int Scale) {
   8308     SmallVector<int, 32> V1Mask(Mask.size(), -1);
   8309     SmallVector<int, 32> V2Mask(Mask.size(), -1);
   8310 
   8311     for (int i = 0; i < Size; ++i) {
   8312       if (Mask[i] < 0)
   8313         continue;
   8314 
   8315       // Each element of the unpack contains Scale elements from this mask.
   8316       int UnpackIdx = i / Scale;
   8317 
   8318       // We only handle the case where V1 feeds the first slots of the unpack.
   8319       // We rely on canonicalization to ensure this is the case.
   8320       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
   8321         return SDValue();
   8322 
   8323       // Setup the mask for this input. The indexing is tricky as we have to
   8324       // handle the unpack stride.
   8325       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
   8326       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
   8327           Mask[i] % Size;
   8328     }
   8329 
   8330     // If we will have to shuffle both inputs to use the unpack, check whether
   8331     // we can just unpack first and shuffle the result. If so, skip this unpack.
   8332     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
   8333         !isNoopShuffleMask(V2Mask))
   8334       return SDValue();
   8335 
   8336     // Shuffle the inputs into place.
   8337     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
   8338     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
   8339 
   8340     // Cast the inputs to the type we will use to unpack them.
   8341     V1 = DAG.getBitcast(UnpackVT, V1);
   8342     V2 = DAG.getBitcast(UnpackVT, V2);
   8343 
   8344     // Unpack the inputs and cast the result back to the desired type.
   8345     return DAG.getBitcast(
   8346         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
   8347                         UnpackVT, V1, V2));
   8348   };
   8349 
   8350   // We try each unpack from the largest to the smallest to try and find one
   8351   // that fits this mask.
   8352   int OrigNumElements = VT.getVectorNumElements();
   8353   int OrigScalarSize = VT.getScalarSizeInBits();
   8354   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
   8355     int Scale = ScalarSize / OrigScalarSize;
   8356     int NumElements = OrigNumElements / Scale;
   8357     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
   8358     if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
   8359       return Unpack;
   8360   }
   8361 
   8362   // If none of the unpack-rooted lowerings worked (or were profitable) try an
   8363   // initial unpack.
   8364   if (NumLoInputs == 0 || NumHiInputs == 0) {
   8365     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
   8366            "We have to have *some* inputs!");
   8367     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
   8368 
   8369     // FIXME: We could consider the total complexity of the permute of each
   8370     // possible unpacking. Or at the least we should consider how many
   8371     // half-crossings are created.
   8372     // FIXME: We could consider commuting the unpacks.
   8373 
   8374     SmallVector<int, 32> PermMask;
   8375     PermMask.assign(Size, -1);
   8376     for (int i = 0; i < Size; ++i) {
   8377       if (Mask[i] < 0)
   8378         continue;
   8379 
   8380       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
   8381 
   8382       PermMask[i] =
   8383           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
   8384     }
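             // For example (illustrative): the v4i32 mask <3, 7, 2, 6> has no inputs
             // from the low halves, so we UNPCKH first (producing <2, 6, 3, 7>) and
             // then permute that result with PermMask <2, 3, 0, 1>.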
   8385     return DAG.getVectorShuffle(
   8386         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
   8387                             DL, VT, V1, V2),
   8388         DAG.getUNDEF(VT), PermMask);
   8389   }
   8390 
   8391   return SDValue();
   8392 }
   8393 
   8394 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
   8395 ///
   8396 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
   8397 /// support for floating point shuffles but not integer shuffles. These
   8398 /// instructions will incur a domain crossing penalty on some chips though so
   8399 /// it is better to avoid lowering through this for integer vectors where
   8400 /// possible.
   8401 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   8402                                        const X86Subtarget *Subtarget,
   8403                                        SelectionDAG &DAG) {
   8404   SDLoc DL(Op);
   8405   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
   8406   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   8407   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   8408   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   8409   ArrayRef<int> Mask = SVOp->getMask();
   8410   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   8411 
   8412   if (isSingleInputShuffleMask(Mask)) {
   8413     // Use low duplicate instructions for masks that match their pattern.
   8414     if (Subtarget->hasSSE3())
   8415       if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
   8416         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
   8417 
   8418     // Straight shuffle of a single input vector. Simulate this by using the
    8419     // single input as both of the "inputs" to this instruction.
   8420     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
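             // For example (illustrative): the mask <1, 1> (broadcast the high
             // double) gives SHUFPDMask = 0b11.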
   8421 
   8422     if (Subtarget->hasAVX()) {
    8423       // If we have AVX, we can use VPERMILPD which will allow folding a load
   8424       // into the shuffle.
   8425       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
   8426                          DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   8427     }
   8428 
   8429     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
   8430                        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   8431   }
   8432   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
   8433   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
   8434 
   8435   // If we have a single input, insert that into V1 if we can do so cheaply.
   8436   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
   8437     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   8438             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
   8439       return Insertion;
   8440     // Try inverting the insertion since for v2 masks it is easy to do and we
   8441     // can't reliably sort the mask one way or the other.
   8442     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
   8443                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
   8444     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   8445             DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
   8446       return Insertion;
   8447   }
   8448 
   8449   // Try to use one of the special instruction patterns to handle two common
   8450   // blend patterns if a zero-blend above didn't work.
   8451   if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
   8452       isShuffleEquivalent(V1, V2, Mask, {1, 3}))
   8453     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
   8454       // We can either use a special instruction to load over the low double or
   8455       // to move just the low double.
   8456       return DAG.getNode(
   8457           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
   8458           DL, MVT::v2f64, V2,
   8459           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
   8460 
   8461   if (Subtarget->hasSSE41())
   8462     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
   8463                                                   Subtarget, DAG))
   8464       return Blend;
   8465 
   8466   // Use dedicated unpack instructions for masks that match their pattern.
   8467   if (SDValue V =
   8468           lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
   8469     return V;
   8470 
   8471   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
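           // For example (illustrative): the mask <0, 3> keeps V1's low double and
           // V2's high double, giving SHUFPDMask = 0b10.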
   8472   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
   8473                      DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   8474 }
   8475 
   8476 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
   8477 ///
   8478 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
   8479 /// the integer unit to minimize domain crossing penalties. However, for blends
   8480 /// it falls back to the floating point shuffle operation with appropriate bit
   8481 /// casting.
   8482 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   8483                                        const X86Subtarget *Subtarget,
   8484                                        SelectionDAG &DAG) {
   8485   SDLoc DL(Op);
   8486   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
   8487   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   8488   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   8489   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   8490   ArrayRef<int> Mask = SVOp->getMask();
   8491   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   8492 
   8493   if (isSingleInputShuffleMask(Mask)) {
   8494     // Check for being able to broadcast a single element.
   8495     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
   8496                                                           Mask, Subtarget, DAG))
   8497       return Broadcast;
   8498 
   8499     // Straight shuffle of a single input vector. For everything from SSE2
   8500     // onward this has a single fast instruction with no scary immediates.
   8501     // We have to map the mask as it is actually a v4i32 shuffle instruction.
   8502     V1 = DAG.getBitcast(MVT::v4i32, V1);
   8503     int WidenedMask[4] = {
   8504         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
   8505         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
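             // For example (illustrative): the v2i64 mask <1, 0> widens to the
             // v4i32 PSHUFD mask <2, 3, 0, 1>.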
   8506     return DAG.getBitcast(
   8507         MVT::v2i64,
   8508         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
   8509                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
   8510   }
   8511   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
   8512   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
   8513   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
   8514   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
   8515 
    8516   // If we have a blend of two PACKUS operations and the blend aligns with the
    8517   // low and high halves, we can just merge the PACKUS operations. This is
   8518   // particularly important as it lets us merge shuffles that this routine itself
   8519   // creates.
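           // For example (illustrative): if V1 = PACKUS(A, B), V2 = PACKUS(C, D) and
           // the mask is <0, 2>, the whole blend collapses to PACKUS(A, C).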
   8520   auto GetPackNode = [](SDValue V) {
   8521     while (V.getOpcode() == ISD::BITCAST)
   8522       V = V.getOperand(0);
   8523 
   8524     return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
   8525   };
   8526   if (SDValue V1Pack = GetPackNode(V1))
   8527     if (SDValue V2Pack = GetPackNode(V2))
   8528       return DAG.getBitcast(MVT::v2i64,
   8529                             DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
   8530                                         Mask[0] == 0 ? V1Pack.getOperand(0)
   8531                                                      : V1Pack.getOperand(1),
   8532                                         Mask[1] == 2 ? V2Pack.getOperand(0)
   8533                                                      : V2Pack.getOperand(1)));
   8534 
   8535   // Try to use shift instructions.
   8536   if (SDValue Shift =
   8537           lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
   8538     return Shift;
   8539 
   8540   // When loading a scalar and then shuffling it into a vector we can often do
   8541   // the insertion cheaply.
   8542   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   8543           DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
   8544     return Insertion;
   8545   // Try inverting the insertion since for v2 masks it is easy to do and we
   8546   // can't reliably sort the mask one way or the other.
   8547   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
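           // For example (illustrative): the canonicalized mask <1, 2> becomes
           // <3, 0> with the roles of V1 and V2 swapped.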
   8548   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   8549           DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
   8550     return Insertion;
   8551 
   8552   // We have different paths for blend lowering, but they all must use the
   8553   // *exact* same predicate.
   8554   bool IsBlendSupported = Subtarget->hasSSE41();
   8555   if (IsBlendSupported)
   8556     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
   8557                                                   Subtarget, DAG))
   8558       return Blend;
   8559 
   8560   // Use dedicated unpack instructions for masks that match their pattern.
   8561   if (SDValue V =
   8562           lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
   8563     return V;
   8564 
   8565   // Try to use byte rotation instructions.
   8566   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
   8567   if (Subtarget->hasSSSE3())
   8568     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   8569             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
   8570       return Rotate;
   8571 
   8572   // If we have direct support for blends, we should lower by decomposing into
   8573   // a permute. That will be faster than the domain cross.
   8574   if (IsBlendSupported)
   8575     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
   8576                                                       Mask, DAG);
   8577 
   8578   // We implement this with SHUFPD which is pretty lame because it will likely
   8579   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   8580   // However, all the alternatives are still more cycles and newer chips don't
   8581   // have this problem. It would be really nice if x86 had better shuffles here.
   8582   V1 = DAG.getBitcast(MVT::v2f64, V1);
   8583   V2 = DAG.getBitcast(MVT::v2f64, V2);
   8584   return DAG.getBitcast(MVT::v2i64,
   8585                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
   8586 }
   8587 
   8588 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
   8589 ///
   8590 /// This is used to disable more specialized lowerings when the shufps lowering
   8591 /// will happen to be efficient.
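         ///
         /// For example (illustrative): <0, 1, 6, 7> can be done with one SHUFPS
         /// (low half from V1, high half from V2), while <0, 4, 1, 5> cannot because
         /// each half mixes both inputs.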
   8592 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
   8593   // This routine only handles 128-bit shufps.
   8594   assert(Mask.size() == 4 && "Unsupported mask size!");
   8595 
   8596   // To lower with a single SHUFPS we need to have the low half and high half
   8597   // each requiring a single input.
   8598   if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
   8599     return false;
   8600   if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
   8601     return false;
   8602 
   8603   return true;
   8604 }
   8605 
   8606 /// \brief Lower a vector shuffle using the SHUFPS instruction.
   8607 ///
   8608 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
   8609 /// It makes no assumptions about whether this is the *best* lowering, it simply
   8610 /// uses it.
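         ///
         /// For example (illustrative): the mask <0, 1, 2, 7> first blends V2[3]
         /// with V1[2] into a temporary via one SHUFPS, then a second SHUFPS of V1
         /// and that temporary produces the final order.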
   8611 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
   8612                                             ArrayRef<int> Mask, SDValue V1,
   8613                                             SDValue V2, SelectionDAG &DAG) {
   8614   SDValue LowV = V1, HighV = V2;
   8615   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
   8616 
   8617   int NumV2Elements =
   8618       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
   8619 
   8620   if (NumV2Elements == 1) {
   8621     int V2Index =
   8622         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
   8623         Mask.begin();
   8624 
   8625     // Compute the index adjacent to V2Index and in the same half by toggling
   8626     // the low bit.
   8627     int V2AdjIndex = V2Index ^ 1;
   8628 
   8629     if (Mask[V2AdjIndex] == -1) {
   8630       // Handles all the cases where we have a single V2 element and an undef.
   8631       // This will only ever happen in the high lanes because we commute the
   8632       // vector otherwise.
   8633       if (V2Index < 2)
   8634         std::swap(LowV, HighV);
   8635       NewMask[V2Index] -= 4;
   8636     } else {
   8637       // Handle the case where the V2 element ends up adjacent to a V1 element.
   8638       // To make this work, blend them together as the first step.
   8639       int V1Index = V2AdjIndex;
   8640       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
   8641       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
   8642                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
   8643 
   8644       // Now proceed to reconstruct the final blend as we have the necessary
   8645       // high or low half formed.
   8646       if (V2Index < 2) {
   8647         LowV = V2;
   8648         HighV = V1;
   8649       } else {
   8650         HighV = V2;
   8651       }
   8652       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
   8653       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
   8654     }
   8655   } else if (NumV2Elements == 2) {
   8656     if (Mask[0] < 4 && Mask[1] < 4) {
   8657       // Handle the easy case where we have V1 in the low lanes and V2 in the
   8658       // high lanes.
   8659       NewMask[2] -= 4;
   8660       NewMask[3] -= 4;
   8661     } else if (Mask[2] < 4 && Mask[3] < 4) {
   8662       // We also handle the reversed case because this utility may get called
   8663       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
   8664       // arrange things in the right direction.
   8665       NewMask[0] -= 4;
   8666       NewMask[1] -= 4;
   8667       HighV = V1;
   8668       LowV = V2;
   8669     } else {
   8670       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
   8671       // trying to place elements directly, just blend them and set up the final
   8672       // shuffle to place them.
   8673 
   8674       // The first two blend mask elements are for V1, the second two are for
   8675       // V2.
   8676       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
   8677                           Mask[2] < 4 ? Mask[2] : Mask[3],
   8678                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
   8679                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
   8680       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
   8681                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
   8682 
   8683       // Now we do a normal shuffle of V1 by giving V1 as both operands to
   8684       // a blend.
   8685       LowV = HighV = V1;
   8686       NewMask[0] = Mask[0] < 4 ? 0 : 2;
   8687       NewMask[1] = Mask[0] < 4 ? 2 : 0;
   8688       NewMask[2] = Mask[2] < 4 ? 1 : 3;
   8689       NewMask[3] = Mask[2] < 4 ? 3 : 1;
   8690     }
   8691   }
   8692   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
   8693                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
   8694 }
   8695 
   8696 /// \brief Lower 4-lane 32-bit floating point shuffles.
   8697 ///
   8698 /// Uses instructions exclusively from the floating point unit to minimize
   8699 /// domain crossing penalties, as these are sufficient to implement all v4f32
   8700 /// shuffles.
   8701 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   8702                                        const X86Subtarget *Subtarget,
   8703                                        SelectionDAG &DAG) {
   8704   SDLoc DL(Op);
   8705   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
   8706   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   8707   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   8708   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   8709   ArrayRef<int> Mask = SVOp->getMask();
   8710   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   8711 
   8712   int NumV2Elements =
   8713       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
   8714 
   8715   if (NumV2Elements == 0) {
   8716     // Check for being able to broadcast a single element.
   8717     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
   8718                                                           Mask, Subtarget, DAG))
   8719       return Broadcast;
   8720 
   8721     // Use even/odd duplicate instructions for masks that match their pattern.
   8722     if (Subtarget->hasSSE3()) {
   8723       if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
   8724         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
   8725       if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
   8726         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
   8727     }
   8728 
   8729     if (Subtarget->hasAVX()) {
   8730       // If we have AVX, we can use VPERMILPS which will allow folding a load
   8731       // into the shuffle.
   8732       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
   8733                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   8734     }
   8735 
   8736     // Otherwise, use a straight shuffle of a single input vector. We pass the
   8737     // input vector to both operands to simulate this with a SHUFPS.
   8738     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
   8739                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   8740   }
   8741 
   8742   // There are special ways we can lower some single-element blends. However, we
   8743   // have custom ways we can lower more complex single-element blends below that
   8744   // we defer to if both this and BLENDPS fail to match, so restrict this to
   8745   // when the V2 input is targeting element 0 of the mask -- that is the fast
   8746   // case here.
   8747   if (NumV2Elements == 1 && Mask[0] >= 4)
   8748     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
   8749                                                          Mask, Subtarget, DAG))
   8750       return V;
   8751 
   8752   if (Subtarget->hasSSE41()) {
   8753     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
   8754                                                   Subtarget, DAG))
   8755       return Blend;
   8756 
   8757     // Use INSERTPS if we can complete the shuffle efficiently.
   8758     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
   8759       return V;
   8760 
   8761     if (!isSingleSHUFPSMask(Mask))
   8762       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
   8763               DL, MVT::v4f32, V1, V2, Mask, DAG))
   8764         return BlendPerm;
   8765   }
   8766 
   8767   // Use dedicated unpack instructions for masks that match their pattern.
   8768   if (SDValue V =
   8769           lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
   8770     return V;
   8771 
   8772   // Otherwise fall back to a SHUFPS lowering strategy.
   8773   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
   8774 }
   8775 
   8776 /// \brief Lower 4-lane i32 vector shuffles.
   8777 ///
   8778 /// We try to handle these with integer-domain shuffles where we can, but for
   8779 /// blends we use the floating point domain blend instructions.
   8780 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   8781                                        const X86Subtarget *Subtarget,
   8782                                        SelectionDAG &DAG) {
   8783   SDLoc DL(Op);
   8784   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
   8785   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   8786   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   8787   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   8788   ArrayRef<int> Mask = SVOp->getMask();
   8789   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   8790 
   8791   // Whenever we can lower this as a zext, that instruction is strictly faster
   8792   // than any alternative. It also allows us to fold memory operands into the
   8793   // shuffle in many cases.
   8794   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
   8795                                                          Mask, Subtarget, DAG))
   8796     return ZExt;
   8797 
   8798   int NumV2Elements =
   8799       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
   8800 
   8801   if (NumV2Elements == 0) {
   8802     // Check for being able to broadcast a single element.
   8803     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
   8804                                                           Mask, Subtarget, DAG))
   8805       return Broadcast;
   8806 
   8807     // Straight shuffle of a single input vector. For everything from SSE2
   8808     // onward this has a single fast instruction with no scary immediates.
   8809     // We coerce the shuffle pattern to be compatible with UNPCK instructions
   8810     // but we aren't actually going to use the UNPCK instruction because doing
   8811     // so prevents folding a load into this instruction or making a copy.
   8812     const int UnpackLoMask[] = {0, 0, 1, 1};
   8813     const int UnpackHiMask[] = {2, 2, 3, 3};
   8814     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
   8815       Mask = UnpackLoMask;
   8816     else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
   8817       Mask = UnpackHiMask;
   8818 
   8819     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
   8820                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   8821   }
   8822 
   8823   // Try to use shift instructions.
   8824   if (SDValue Shift =
   8825           lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
   8826     return Shift;
   8827 
   8828   // There are special ways we can lower some single-element blends.
   8829   if (NumV2Elements == 1)
   8830     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
   8831                                                          Mask, Subtarget, DAG))
   8832       return V;
   8833 
   8834   // We have different paths for blend lowering, but they all must use the
   8835   // *exact* same predicate.
   8836   bool IsBlendSupported = Subtarget->hasSSE41();
   8837   if (IsBlendSupported)
   8838     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
   8839                                                   Subtarget, DAG))
   8840       return Blend;
   8841 
   8842   if (SDValue Masked =
   8843           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
   8844     return Masked;
   8845 
   8846   // Use dedicated unpack instructions for masks that match their pattern.
   8847   if (SDValue V =
   8848           lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
   8849     return V;
   8850 
   8851   // Try to use byte rotation instructions.
   8852   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
   8853   if (Subtarget->hasSSSE3())
   8854     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   8855             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
   8856       return Rotate;
   8857 
   8858   // If we have direct support for blends, we should lower by decomposing into
   8859   // a permute. That will be faster than the domain cross.
   8860   if (IsBlendSupported)
   8861     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
   8862                                                       Mask, DAG);
   8863 
   8864   // Try to lower by permuting the inputs into an unpack instruction.
   8865   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
   8866                                                             V2, Mask, DAG))
   8867     return Unpack;
   8868 
   8869   // We implement this with SHUFPS because it can blend from two vectors.
   8870   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
    8871   // up the inputs, bypassing domain shift penalties that we would incur if we
   8872   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
   8873   // relevant.
   8874   return DAG.getBitcast(
   8875       MVT::v4i32,
   8876       DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
   8877                            DAG.getBitcast(MVT::v4f32, V2), Mask));
   8878 }
   8879 
   8880 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
   8881 /// shuffle lowering, and the most complex part.
   8882 ///
   8883 /// The lowering strategy is to try to form pairs of input lanes which are
   8884 /// targeted at the same half of the final vector, and then use a dword shuffle
   8885 /// to place them onto the right half, and finally unpack the paired lanes into
   8886 /// their final position.
   8887 ///
   8888 /// The exact breakdown of how to form these dword pairs and align them on the
   8889 /// correct sides is really tricky. See the comments within the function for
   8890 /// more of the details.
   8891 ///
   8892 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
   8893 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
   8894 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
   8895 /// vector, form the analogous 128-bit 8-element Mask.
   8896 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
   8897     SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
   8898     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   8899   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
   8900   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
   8901 
    8902   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
   8903   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
   8904   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
   8905 
   8906   SmallVector<int, 4> LoInputs;
   8907   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
   8908                [](int M) { return M >= 0; });
   8909   std::sort(LoInputs.begin(), LoInputs.end());
   8910   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
   8911   SmallVector<int, 4> HiInputs;
   8912   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
   8913                [](int M) { return M >= 0; });
   8914   std::sort(HiInputs.begin(), HiInputs.end());
   8915   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
   8916   int NumLToL =
   8917       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
   8918   int NumHToL = LoInputs.size() - NumLToL;
   8919   int NumLToH =
   8920       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
   8921   int NumHToH = HiInputs.size() - NumLToH;
   8922   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
   8923   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
   8924   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   8925   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
   8926 
   8927   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
   8928   // such inputs we can swap two of the dwords across the half mark and end up
   8929   // with <=2 inputs to each half in each half. Once there, we can fall through
   8930   // to the generic code below. For example:
   8931   //
   8932   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   8933   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
   8934   //
   8935   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
   8936   // and an existing 2-into-2 on the other half. In this case we may have to
   8937   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
   8938   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
   8939   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
   8940   // because any other situation (including a 3-into-1 or 1-into-3 in the other
   8941   // half than the one we target for fixing) will be fixed when we re-enter this
   8942   // path. We will also combine away any sequence of PSHUFD instructions that
   8943   // result into a single instruction. Here is an example of the tricky case:
   8944   //
   8945   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   8946   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
   8947   //
   8948   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
   8949   //
   8950   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
   8951   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
   8952   //
   8953   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
   8954   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
   8955   //
   8956   // The result is fine to be handled by the generic logic.
   8957   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
   8958                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
   8959                           int AOffset, int BOffset) {
   8960     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
   8961            "Must call this with A having 3 or 1 inputs from the A half.");
   8962     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
   8963            "Must call this with B having 1 or 3 inputs from the B half.");
   8964     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
   8965            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
   8966 
   8967     bool ThreeAInputs = AToAInputs.size() == 3;
   8968 
    8969     // Compute the index of the dword with only one word among the three
    8970     // inputs in a half by taking the sum of the half with three inputs and
    8971     // subtracting the sum of the actual three inputs. The difference is the
    8972     // remaining slot.
   8973     int ADWord, BDWord;
   8974     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
   8975     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
   8976     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
   8977     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
   8978     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
   8979     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
   8980     int TripleNonInputIdx =
   8981         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
   8982     TripleDWord = TripleNonInputIdx / 2;
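             // For example (illustrative): if the three inputs to the low half are
             // words 0, 1 and 3, then TripleInputSum = 6, TripleNonInputIdx = 2 and
             // the free slot lives in dword 1.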
   8983 
   8984     // We use xor with one to compute the adjacent DWord to whichever one the
   8985     // OneInput is in.
   8986     OneInputDWord = (OneInput / 2) ^ 1;
   8987 
   8988     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
   8989     // and BToA inputs. If there is also such a problem with the BToB and AToB
   8990     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
   8991     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
   8992     // is essential that we don't *create* a 3<-1 as then we might oscillate.
   8993     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
    8994       // Compute how many inputs will be flipped by swapping these DWords. We
    8995       // need to balance this to ensure we don't form a 3-1 shuffle in the other
    8996       // half.
   8998       int NumFlippedAToBInputs =
   8999           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
   9000           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
   9001       int NumFlippedBToBInputs =
   9002           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
   9003           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
   9004       if ((NumFlippedAToBInputs == 1 &&
   9005            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
   9006           (NumFlippedBToBInputs == 1 &&
   9007            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
   9008         // We choose whether to fix the A half or B half based on whether that
   9009         // half has zero flipped inputs. At zero, we may not be able to fix it
   9010         // with that half. We also bias towards fixing the B half because that
   9011         // will more commonly be the high half, and we have to bias one way.
   9012         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
   9013                                                        ArrayRef<int> Inputs) {
   9014           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
   9015           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
   9016                                          PinnedIdx ^ 1) != Inputs.end();
   9017           // Determine whether the free index is in the flipped dword or the
   9018           // unflipped dword based on where the pinned index is. We use this bit
   9019           // in an xor to conditionally select the adjacent dword.
   9020           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
   9021           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
   9022                                              FixFreeIdx) != Inputs.end();
   9023           if (IsFixIdxInput == IsFixFreeIdxInput)
   9024             FixFreeIdx += 1;
   9025           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
   9026                                         FixFreeIdx) != Inputs.end();
   9027           assert(IsFixIdxInput != IsFixFreeIdxInput &&
   9028                  "We need to be changing the number of flipped inputs!");
   9029           int PSHUFHalfMask[] = {0, 1, 2, 3};
   9030           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
   9031           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
   9032                           MVT::v8i16, V,
   9033                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
   9034 
   9035           for (int &M : Mask)
   9036             if (M != -1 && M == FixIdx)
   9037               M = FixFreeIdx;
   9038             else if (M != -1 && M == FixFreeIdx)
   9039               M = FixIdx;
   9040         };
   9041         if (NumFlippedBToBInputs != 0) {
   9042           int BPinnedIdx =
   9043               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
   9044           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
   9045         } else {
   9046           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
   9047           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
   9048           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
   9049         }
   9050       }
   9051     }
   9052 
   9053     int PSHUFDMask[] = {0, 1, 2, 3};
   9054     PSHUFDMask[ADWord] = BDWord;
   9055     PSHUFDMask[BDWord] = ADWord;
   9056     V = DAG.getBitcast(
   9057         VT,
   9058         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
   9059                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   9060 
   9061     // Adjust the mask to match the new locations of A and B.
   9062     for (int &M : Mask)
   9063       if (M != -1 && M/2 == ADWord)
   9064         M = 2 * BDWord + M % 2;
   9065       else if (M != -1 && M/2 == BDWord)
   9066         M = 2 * ADWord + M % 2;
   9067 
   9068     // Recurse back into this routine to re-compute state now that this isn't
   9069     // a 3 and 1 problem.
   9070     return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
   9071                                                      DAG);
   9072   };
   9073   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
   9074     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
   9075   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
   9076     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
   9077 
   9078   // At this point there are at most two inputs to the low and high halves from
   9079   // each half. That means the inputs can always be grouped into dwords and
   9080   // those dwords can then be moved to the correct half with a dword shuffle.
   9081   // We use at most one low and one high word shuffle to collect these paired
   9082   // inputs into dwords, and finally a dword shuffle to place them.
   9083   int PSHUFLMask[4] = {-1, -1, -1, -1};
   9084   int PSHUFHMask[4] = {-1, -1, -1, -1};
   9085   int PSHUFDMask[4] = {-1, -1, -1, -1};
   9086 
   9087   // First fix the masks for all the inputs that are staying in their
   9088   // original halves. This will then dictate the targets of the cross-half
   9089   // shuffles.
   9090   auto fixInPlaceInputs =
   9091       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
   9092                     MutableArrayRef<int> SourceHalfMask,
   9093                     MutableArrayRef<int> HalfMask, int HalfOffset) {
   9094     if (InPlaceInputs.empty())
   9095       return;
   9096     if (InPlaceInputs.size() == 1) {
   9097       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   9098           InPlaceInputs[0] - HalfOffset;
   9099       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
   9100       return;
   9101     }
   9102     if (IncomingInputs.empty()) {
   9103       // Just fix all of the in place inputs.
   9104       for (int Input : InPlaceInputs) {
   9105         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
   9106         PSHUFDMask[Input / 2] = Input / 2;
   9107       }
   9108       return;
   9109     }
   9110 
   9111     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
   9112     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   9113         InPlaceInputs[0] - HalfOffset;
   9114     // Put the second input next to the first so that they are packed into
   9115     // a dword. We find the adjacent index by toggling the low bit.
   9116     int AdjIndex = InPlaceInputs[0] ^ 1;
   9117     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
   9118     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
   9119     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
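             // For example (illustrative): if the two in-place inputs are words 1 and
             // 2 (with HalfOffset 0), AdjIndex = 1 ^ 1 = 0, so word 2 is pulled into
             // slot 0 and the pair lands together in dword 0.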
   9120   };
   9121   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
   9122   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
   9123 
   9124   // Now gather the cross-half inputs and place them into a free dword of
   9125   // their target half.
   9126   // FIXME: This operation could almost certainly be simplified dramatically to
   9127   // look more like the 3-1 fixing operation.
   9128   auto moveInputsToRightHalf = [&PSHUFDMask](
   9129       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
   9130       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
   9131       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
   9132       int DestOffset) {
   9133     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
   9134       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
   9135     };
   9136     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
   9137                                                int Word) {
   9138       int LowWord = Word & ~1;
   9139       int HighWord = Word | 1;
   9140       return isWordClobbered(SourceHalfMask, LowWord) ||
   9141              isWordClobbered(SourceHalfMask, HighWord);
   9142     };
   9143 
   9144     if (IncomingInputs.empty())
   9145       return;
   9146 
   9147     if (ExistingInputs.empty()) {
   9148       // Map any dwords with inputs from them into the right half.
   9149       for (int Input : IncomingInputs) {
   9150         // If the source half mask maps over the inputs, turn those into
   9151         // swaps and use the swapped lane.
   9152         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
   9153           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
   9154             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
   9155                 Input - SourceOffset;
   9156             // We have to swap the uses in our half mask in one sweep.
   9157             for (int &M : HalfMask)
   9158               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
   9159                 M = Input;
   9160               else if (M == Input)
   9161                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   9162           } else {
   9163             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
   9164                        Input - SourceOffset &&
   9165                    "Previous placement doesn't match!");
   9166           }
   9167           // Note that this correctly re-maps both when we do a swap and when
   9168           // we observe the other side of the swap above. We rely on that to
   9169           // avoid swapping the members of the input list directly.
   9170           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   9171         }
   9172 
   9173         // Map the input's dword into the correct half.
   9174         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
   9175           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
   9176         else
   9177           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
   9178                      Input / 2 &&
   9179                  "Previous placement doesn't match!");
   9180       }
   9181 
   9182       // And just directly shift any other-half mask elements to be same-half
   9183       // as we will have mirrored the dword containing the element into the
   9184       // same position within that half.
   9185       for (int &M : HalfMask)
   9186         if (M >= SourceOffset && M < SourceOffset + 4) {
   9187           M = M - SourceOffset + DestOffset;
   9188           assert(M >= 0 && "This should never wrap below zero!");
   9189         }
   9190       return;
   9191     }
   9192 
   9193     // Ensure we have the input in a viable dword of its current half. This
   9194     // is particularly tricky because the original position may be clobbered
   9195     // by inputs being moved and *staying* in that half.
   9196     if (IncomingInputs.size() == 1) {
   9197       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   9198         int InputFixed = std::find(std::begin(SourceHalfMask),
   9199                                    std::end(SourceHalfMask), -1) -
   9200                          std::begin(SourceHalfMask) + SourceOffset;
   9201         SourceHalfMask[InputFixed - SourceOffset] =
   9202             IncomingInputs[0] - SourceOffset;
   9203         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
   9204                      InputFixed);
   9205         IncomingInputs[0] = InputFixed;
   9206       }
   9207     } else if (IncomingInputs.size() == 2) {
   9208       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
   9209           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   9210         // We have two non-adjacent or clobbered inputs we need to extract from
   9211         // the source half. To do this, we need to map them into some adjacent
   9212         // dword slot in the source mask.
   9213         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
   9214                               IncomingInputs[1] - SourceOffset};
   9215 
   9216         // If there is a free slot in the source half mask adjacent to one of
   9217         // the inputs, place the other input in it. We use (Index XOR 1) to
   9218         // compute an adjacent index.
   9219         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
   9220             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
   9221           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
   9222           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
   9223           InputsFixed[1] = InputsFixed[0] ^ 1;
   9224         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
   9225                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
   9226           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
   9227           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
   9228           InputsFixed[0] = InputsFixed[1] ^ 1;
   9229         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
   9230                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
   9231           // The two inputs are in the same DWord but it is clobbered and the
   9232           // adjacent DWord isn't used at all. Move both inputs to the free
   9233           // slot.
   9234           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
   9235           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
   9236           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
   9237           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
   9238         } else {
   9239           // The only way we hit this point is if there is no clobbering
   9240           // (because there are no off-half inputs to this half) and there is no
   9241           // free slot adjacent to one of the inputs. In this case, we have to
   9242           // swap an input with a non-input.
   9243           for (int i = 0; i < 4; ++i)
   9244             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
   9245                    "We can't handle any clobbers here!");
   9246           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
   9247                  "Cannot have adjacent inputs here!");
   9248 
   9249           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
   9250           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
   9251 
   9252           // We also have to update the final source mask in this case because
   9253           // it may need to undo the above swap.
   9254           for (int &M : FinalSourceHalfMask)
   9255             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
   9256               M = InputsFixed[1] + SourceOffset;
   9257             else if (M == InputsFixed[1] + SourceOffset)
   9258               M = (InputsFixed[0] ^ 1) + SourceOffset;
   9259 
   9260           InputsFixed[1] = InputsFixed[0] ^ 1;
   9261         }
   9262 
   9263         // Point everything at the fixed inputs.
   9264         for (int &M : HalfMask)
   9265           if (M == IncomingInputs[0])
   9266             M = InputsFixed[0] + SourceOffset;
   9267           else if (M == IncomingInputs[1])
   9268             M = InputsFixed[1] + SourceOffset;
   9269 
   9270         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
   9271         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
   9272       }
   9273     } else {
   9274       llvm_unreachable("Unhandled input size!");
   9275     }
   9276 
   9277     // Now hoist the DWord down to the right half.
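             // The destination half owns PSHUFDMask slots DestOffset/2 and
             // DestOffset/2 + 1; prefer the first if it is still free, otherwise use
             // the second (the assert below checks that it really is free).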
   9278     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
   9279     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
   9280     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
   9281     for (int &M : HalfMask)
   9282       for (int Input : IncomingInputs)
   9283         if (M == Input)
   9284           M = FreeDWord * 2 + Input % 2;
   9285   };
   9286   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
   9287                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
   9288   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
   9289                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
   9290 
   9291   // Now enact all the shuffles we've computed to move the inputs into their
   9292   // target half.
   9293   if (!isNoopShuffleMask(PSHUFLMask))
   9294     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
   9295                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
   9296   if (!isNoopShuffleMask(PSHUFHMask))
   9297     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
   9298                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
   9299   if (!isNoopShuffleMask(PSHUFDMask))
   9300     V = DAG.getBitcast(
   9301         VT,
   9302         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
   9303                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   9304 
   9305   // At this point, each half should contain all its inputs, and we can then
   9306   // just shuffle them into their final position.
   9307   assert(std::count_if(LoMask.begin(), LoMask.end(),
   9308                        [](int M) { return M >= 4; }) == 0 &&
   9309          "Failed to lift all the high half inputs to the low mask!");
   9310   assert(std::count_if(HiMask.begin(), HiMask.end(),
   9311                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
   9312          "Failed to lift all the low half inputs to the high mask!");
   9313 
   9314   // Do a half shuffle for the low mask.
   9315   if (!isNoopShuffleMask(LoMask))
   9316     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
   9317                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
   9318 
   9319   // Do a half shuffle with the high mask after shifting its values down.
   9320   for (int &M : HiMask)
   9321     if (M >= 0)
   9322       M -= 4;
   9323   if (!isNoopShuffleMask(HiMask))
   9324     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
   9325                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
   9326 
   9327   return V;
   9328 }
   9329 
   9330 /// \brief Helper to form a PSHUFB-based shuffle+blend.
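         /// Each 16-byte input is shuffled with a PSHUFB whose control bytes either
         /// select a source byte or, with the high bit (0x80) set, zero the lane.
         /// When both inputs are used, the two shuffled results are OR'd together to
         /// form the blend.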
   9331 static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
   9332                                           SDValue V2, ArrayRef<int> Mask,
   9333                                           SelectionDAG &DAG, bool &V1InUse,
   9334                                           bool &V2InUse) {
   9335   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   9336   SDValue V1Mask[16];
   9337   SDValue V2Mask[16];
   9338   V1InUse = false;
   9339   V2InUse = false;
   9340 
   9341   int Size = Mask.size();
   9342   int Scale = 16 / Size;
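           // e.g. for a v8i16 shuffle, Size == 8 and Scale == 2, so each mask
           // element expands into two adjacent byte selectors below.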
   9343   for (int i = 0; i < 16; ++i) {
   9344     if (Mask[i / Scale] == -1) {
   9345       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
   9346     } else {
   9347       const int ZeroMask = 0x80;
   9348       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
   9349                                           : ZeroMask;
   9350       int V2Idx = Mask[i / Scale] < Size
   9351                       ? ZeroMask
   9352                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
   9353       if (Zeroable[i / Scale])
   9354         V1Idx = V2Idx = ZeroMask;
   9355       V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
   9356       V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
   9357       V1InUse |= (ZeroMask != V1Idx);
   9358       V2InUse |= (ZeroMask != V2Idx);
   9359     }
   9360   }
   9361 
   9362   if (V1InUse)
   9363     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
   9364                      DAG.getBitcast(MVT::v16i8, V1),
   9365                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
   9366   if (V2InUse)
   9367     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
   9368                      DAG.getBitcast(MVT::v16i8, V2),
   9369                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
   9370 
   9371   // If we need shuffled inputs from both, blend the two.
   9372   SDValue V;
   9373   if (V1InUse && V2InUse)
   9374     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
   9375   else
   9376     V = V1InUse ? V1 : V2;
   9377 
   9378   // Cast the result back to the correct type.
   9379   return DAG.getBitcast(VT, V);
   9380 }
   9381 
   9382 /// \brief Generic lowering of 8-lane i16 shuffles.
   9383 ///
   9384 /// This handles both single-input shuffles and combined shuffle/blends with
   9385 /// two inputs. The single input shuffles are immediately delegated to
   9386 /// a dedicated lowering routine.
   9387 ///
   9388 /// The blends are lowered in one of three fundamental ways. If there are few
   9389 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
   9390 /// of the input is significantly cheaper when lowered as an interleaving of
   9391 /// the two inputs, try to interleave them. Otherwise, blend the low and high
   9392 /// halves of the inputs separately (making them have relatively few inputs)
   9393 /// and then concatenate them.
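         ///
         /// For example, the two-input mask <0, 8, 1, 9, 2, 10, 3, 11> is an exact
         /// interleaving of the low halves and is matched by the dedicated UNPCK
         /// lowering below (a single PUNPCKLWD).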
   9394 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9395                                        const X86Subtarget *Subtarget,
   9396                                        SelectionDAG &DAG) {
   9397   SDLoc DL(Op);
   9398   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
   9399   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   9400   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   9401   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9402   ArrayRef<int> OrigMask = SVOp->getMask();
   9403   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
   9404                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
   9405   MutableArrayRef<int> Mask(MaskStorage);
   9406 
   9407   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   9408 
   9409   // Whenever we can lower this as a zext, that instruction is strictly faster
   9410   // than any alternative.
   9411   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   9412           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
   9413     return ZExt;
   9414 
   9415   auto isV1 = [](int M) { return M >= 0 && M < 8; };
   9416   (void)isV1;
   9417   auto isV2 = [](int M) { return M >= 8; };
   9418 
   9419   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
   9420 
   9421   if (NumV2Inputs == 0) {
   9422     // Check for being able to broadcast a single element.
   9423     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
   9424                                                           Mask, Subtarget, DAG))
   9425       return Broadcast;
   9426 
   9427     // Try to use shift instructions.
   9428     if (SDValue Shift =
   9429             lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
   9430       return Shift;
   9431 
   9432     // Use dedicated unpack instructions for masks that match their pattern.
   9433     if (SDValue V =
   9434             lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
   9435       return V;
   9436 
   9437     // Try to use byte rotation instructions.
   9438     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
   9439                                                         Mask, Subtarget, DAG))
   9440       return Rotate;
   9441 
   9442     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
   9443                                                      Subtarget, DAG);
   9444   }
   9445 
   9446   assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
   9447          "All single-input shuffles should be canonicalized to be V1-input "
   9448          "shuffles.");
   9449 
   9450   // Try to use shift instructions.
   9451   if (SDValue Shift =
   9452           lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
   9453     return Shift;
   9454 
   9455   // See if we can use SSE4A Extraction / Insertion.
   9456   if (Subtarget->hasSSE4A())
   9457     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
   9458       return V;
   9459 
   9460   // There are special ways we can lower some single-element blends.
   9461   if (NumV2Inputs == 1)
   9462     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
   9463                                                          Mask, Subtarget, DAG))
   9464       return V;
   9465 
   9466   // We have different paths for blend lowering, but they all must use the
   9467   // *exact* same predicate.
   9468   bool IsBlendSupported = Subtarget->hasSSE41();
   9469   if (IsBlendSupported)
   9470     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
   9471                                                   Subtarget, DAG))
   9472       return Blend;
   9473 
   9474   if (SDValue Masked =
   9475           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
   9476     return Masked;
   9477 
   9478   // Use dedicated unpack instructions for masks that match their pattern.
   9479   if (SDValue V =
   9480           lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
   9481     return V;
   9482 
   9483   // Try to use byte rotation instructions.
   9484   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   9485           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
   9486     return Rotate;
   9487 
   9488   if (SDValue BitBlend =
   9489           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
   9490     return BitBlend;
   9491 
   9492   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
   9493                                                             V2, Mask, DAG))
   9494     return Unpack;
   9495 
   9496   // If we can't directly blend but can use PSHUFB, that will be better as it
   9497   // can both shuffle and set up the inefficient blend.
   9498   if (!IsBlendSupported && Subtarget->hasSSSE3()) {
   9499     bool V1InUse, V2InUse;
   9500     return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
   9501                                       V1InUse, V2InUse);
   9502   }
   9503 
   9504   // We can always bit-blend if we have to so the fallback strategy is to
   9505   // decompose into single-input permutes and blends.
   9506   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
   9507                                                       Mask, DAG);
   9508 }
   9509 
   9510 /// \brief Check whether a compaction lowering can be done by dropping even
   9511 /// elements and compute how many times even elements must be dropped.
   9512 ///
    9513 /// This handles shuffles which take every (2^N)-th element. Example shuffle
    9514 /// masks, labeled by N, the number of times the even elements are dropped:
   9515 ///
   9516 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
   9517 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
   9518 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
   9519 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
   9520 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
   9521 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
   9522 ///
   9523 /// Any of these lanes can of course be undef.
   9524 ///
   9525 /// This routine only supports N <= 3.
   9526 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
    9527 /// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
   9528 ///
   9529 /// \returns N above, or the number of times even elements must be dropped if
   9530 /// there is such a number. Otherwise returns zero.
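         ///
         /// A result of N also tells the v16i8 lowering below how many PACKUSWB
         /// steps it will need after masking away the dropped bytes.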
   9531 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
   9532   // Figure out whether we're looping over two inputs or just one.
   9533   bool IsSingleInput = isSingleInputShuffleMask(Mask);
   9534 
   9535   // The modulus for the shuffle vector entries is based on whether this is
   9536   // a single input or not.
   9537   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
   9538   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
   9539          "We should only be called with masks with a power-of-2 size!");
   9540 
   9541   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
   9542 
   9543   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
   9544   // and 2^3 simultaneously. This is because we may have ambiguity with
   9545   // partially undef inputs.
   9546   bool ViableForN[3] = {true, true, true};
   9547 
   9548   for (int i = 0, e = Mask.size(); i < e; ++i) {
   9549     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
   9550     // want.
   9551     if (Mask[i] == -1)
   9552       continue;
   9553 
   9554     bool IsAnyViable = false;
   9555     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
   9556       if (ViableForN[j]) {
   9557         uint64_t N = j + 1;
   9558 
   9559         // The shuffle mask must be equal to (i * 2^N) % M.
   9560         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
   9561           IsAnyViable = true;
   9562         else
   9563           ViableForN[j] = false;
   9564       }
   9565     // Early exit if we exhaust the possible powers of two.
   9566     if (!IsAnyViable)
   9567       break;
   9568   }
   9569 
   9570   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
   9571     if (ViableForN[j])
   9572       return j + 1;
   9573 
   9574   // Return 0 as there is no viable power of two.
   9575   return 0;
   9576 }
   9577 
   9578 /// \brief Generic lowering of v16i8 shuffles.
   9579 ///
   9580 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
   9581 /// detect any complexity reducing interleaving. If that doesn't help, it uses
   9582 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
   9583 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
   9584 /// back together.
   9585 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9586                                        const X86Subtarget *Subtarget,
   9587                                        SelectionDAG &DAG) {
   9588   SDLoc DL(Op);
   9589   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
   9590   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   9591   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   9592   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9593   ArrayRef<int> Mask = SVOp->getMask();
   9594   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   9595 
   9596   // Try to use shift instructions.
   9597   if (SDValue Shift =
   9598           lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
   9599     return Shift;
   9600 
   9601   // Try to use byte rotation instructions.
   9602   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   9603           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
   9604     return Rotate;
   9605 
   9606   // Try to use a zext lowering.
   9607   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   9608           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
   9609     return ZExt;
   9610 
   9611   // See if we can use SSE4A Extraction / Insertion.
   9612   if (Subtarget->hasSSE4A())
   9613     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
   9614       return V;
   9615 
   9616   int NumV2Elements =
   9617       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
   9618 
   9619   // For single-input shuffles, there are some nicer lowering tricks we can use.
   9620   if (NumV2Elements == 0) {
   9621     // Check for being able to broadcast a single element.
   9622     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
   9623                                                           Mask, Subtarget, DAG))
   9624       return Broadcast;
   9625 
   9626     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
   9627     // Notably, this handles splat and partial-splat shuffles more efficiently.
   9628     // However, it only makes sense if the pre-duplication shuffle simplifies
   9629     // things significantly. Currently, this means we need to be able to
   9630     // express the pre-duplication shuffle as an i16 shuffle.
   9631     //
   9632     // FIXME: We should check for other patterns which can be widened into an
   9633     // i16 shuffle as well.
   9634     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
   9635       for (int i = 0; i < 16; i += 2)
   9636         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
   9637           return false;
   9638 
   9639       return true;
   9640     };
   9641     auto tryToWidenViaDuplication = [&]() -> SDValue {
   9642       if (!canWidenViaDuplication(Mask))
   9643         return SDValue();
   9644       SmallVector<int, 4> LoInputs;
   9645       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
   9646                    [](int M) { return M >= 0 && M < 8; });
   9647       std::sort(LoInputs.begin(), LoInputs.end());
   9648       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
   9649                      LoInputs.end());
   9650       SmallVector<int, 4> HiInputs;
   9651       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
   9652                    [](int M) { return M >= 8; });
   9653       std::sort(HiInputs.begin(), HiInputs.end());
   9654       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
   9655                      HiInputs.end());
   9656 
   9657       bool TargetLo = LoInputs.size() >= HiInputs.size();
   9658       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
   9659       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
   9660 
   9661       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
   9662       SmallDenseMap<int, int, 8> LaneMap;
   9663       for (int I : InPlaceInputs) {
   9664         PreDupI16Shuffle[I/2] = I/2;
   9665         LaneMap[I] = I;
   9666       }
   9667       int j = TargetLo ? 0 : 4, je = j + 4;
   9668       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
   9669         // Check if j is already a shuffle of this input. This happens when
   9670         // there are two adjacent bytes after we move the low one.
   9671         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
   9672           // If we haven't yet mapped the input, search for a slot into which
   9673           // we can map it.
   9674           while (j < je && PreDupI16Shuffle[j] != -1)
   9675             ++j;
   9676 
   9677           if (j == je)
   9678             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
   9679             return SDValue();
   9680 
   9681           // Map this input with the i16 shuffle.
   9682           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
   9683         }
   9684 
   9685         // Update the lane map based on the mapping we ended up with.
   9686         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
   9687       }
   9688       V1 = DAG.getBitcast(
   9689           MVT::v16i8,
   9690           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
   9691                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
   9692 
   9693       // Unpack the bytes to form the i16s that will be shuffled into place.
   9694       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
   9695                        MVT::v16i8, V1, V1);
   9696 
   9697       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   9698       for (int i = 0; i < 16; ++i)
   9699         if (Mask[i] != -1) {
   9700           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
   9701           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
   9702           if (PostDupI16Shuffle[i / 2] == -1)
   9703             PostDupI16Shuffle[i / 2] = MappedMask;
   9704           else
   9705             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
    9706                    "Conflicting entries in the original shuffle!");
   9707         }
   9708       return DAG.getBitcast(
   9709           MVT::v16i8,
   9710           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
   9711                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
   9712     };
   9713     if (SDValue V = tryToWidenViaDuplication())
   9714       return V;
   9715   }
   9716 
   9717   if (SDValue Masked =
   9718           lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
   9719     return Masked;
   9720 
   9721   // Use dedicated unpack instructions for masks that match their pattern.
   9722   if (SDValue V =
   9723           lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
   9724     return V;
   9725 
   9726   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
   9727   // with PSHUFB. It is important to do this before we attempt to generate any
   9728   // blends but after all of the single-input lowerings. If the single input
   9729   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
   9730   // want to preserve that and we can DAG combine any longer sequences into
   9731   // a PSHUFB in the end. But once we start blending from multiple inputs,
   9732   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
   9733   // and there are *very* few patterns that would actually be faster than the
   9734   // PSHUFB approach because of its ability to zero lanes.
   9735   //
   9736   // FIXME: The only exceptions to the above are blends which are exact
   9737   // interleavings with direct instructions supporting them. We currently don't
   9738   // handle those well here.
   9739   if (Subtarget->hasSSSE3()) {
   9740     bool V1InUse = false;
   9741     bool V2InUse = false;
   9742 
   9743     SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
   9744                                                 DAG, V1InUse, V2InUse);
   9745 
   9746     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    9747     // do so. This avoids using them to handle blends-with-zero, which is
    9748     // important because a single PSHUFB is significantly faster for that.
   9749     if (V1InUse && V2InUse) {
   9750       if (Subtarget->hasSSE41())
   9751         if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
   9752                                                       Mask, Subtarget, DAG))
   9753           return Blend;
   9754 
   9755       // We can use an unpack to do the blending rather than an or in some
    9756       // cases. Even though the OR may be (very marginally) more efficient, we
    9757       // prefer this lowering because there are common cases where part of
   9758       // the complexity of the shuffles goes away when we do the final blend as
   9759       // an unpack.
   9760       // FIXME: It might be worth trying to detect if the unpack-feeding
   9761       // shuffles will both be pshufb, in which case we shouldn't bother with
   9762       // this.
   9763       if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
   9764               DL, MVT::v16i8, V1, V2, Mask, DAG))
   9765         return Unpack;
   9766     }
   9767 
   9768     return PSHUFB;
   9769   }
   9770 
   9771   // There are special ways we can lower some single-element blends.
   9772   if (NumV2Elements == 1)
   9773     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
   9774                                                          Mask, Subtarget, DAG))
   9775       return V;
   9776 
   9777   if (SDValue BitBlend =
   9778           lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
   9779     return BitBlend;
   9780 
   9781   // Check whether a compaction lowering can be done. This handles shuffles
   9782   // which take every Nth element for some even N. See the helper function for
   9783   // details.
   9784   //
   9785   // We special case these as they can be particularly efficiently handled with
    9786   // the PACKUSWB instruction on x86 and they show up in common patterns of
   9787   // rearranging bytes to truncate wide elements.
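           //
           // For example, truncating two v8i16 inputs to bytes corresponds to the
           // mask <0, 2, 4, ..., 30>: after ANDing each input with 0x00FF, a single
           // PACKUSWB reassembles the low bytes of both inputs in order.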
   9788   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
    9789     // NumEvenDrops is the log2 of the power-of-two element stride. Another way
    9790     // of thinking about it is that we need to drop the even elements this many
   9791     // times to get the original input.
   9792     bool IsSingleInput = isSingleInputShuffleMask(Mask);
   9793 
   9794     // First we need to zero all the dropped bytes.
   9795     assert(NumEvenDrops <= 3 &&
   9796            "No support for dropping even elements more than 3 times.");
   9797     // We use the mask type to pick which bytes are preserved based on how many
   9798     // elements are dropped.
   9799     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
   9800     SDValue ByteClearMask = DAG.getBitcast(
   9801         MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
   9802     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
   9803     if (!IsSingleInput)
   9804       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
   9805 
   9806     // Now pack things back together.
   9807     V1 = DAG.getBitcast(MVT::v8i16, V1);
   9808     V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
   9809     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
   9810     for (int i = 1; i < NumEvenDrops; ++i) {
   9811       Result = DAG.getBitcast(MVT::v8i16, Result);
   9812       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
   9813     }
   9814 
   9815     return Result;
   9816   }
   9817 
   9818   // Handle multi-input cases by blending single-input shuffles.
   9819   if (NumV2Elements > 0)
   9820     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
   9821                                                       Mask, DAG);
   9822 
   9823   // The fallback path for single-input shuffles widens this into two v8i16
   9824   // vectors with unpacks, shuffles those, and then pulls them back together
   9825   // with a pack.
   9826   SDValue V = V1;
   9827 
   9828   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   9829   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   9830   for (int i = 0; i < 16; ++i)
   9831     if (Mask[i] >= 0)
   9832       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
   9833 
   9834   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
   9835 
   9836   SDValue VLoHalf, VHiHalf;
   9837   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
   9838   // them out and avoid using UNPCK{L,H} to extract the elements of V as
   9839   // i16s.
   9840   if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
   9841                    [](int M) { return M >= 0 && M % 2 == 1; }) &&
   9842       std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
   9843                    [](int M) { return M >= 0 && M % 2 == 1; })) {
   9844     // Use a mask to drop the high bytes.
   9845     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
   9846     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
   9847                      DAG.getConstant(0x00FF, DL, MVT::v8i16));
   9848 
   9849     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
   9850     VHiHalf = DAG.getUNDEF(MVT::v8i16);
   9851 
   9852     // Squash the masks to point directly into VLoHalf.
   9853     for (int &M : LoBlendMask)
   9854       if (M >= 0)
   9855         M /= 2;
   9856     for (int &M : HiBlendMask)
   9857       if (M >= 0)
   9858         M /= 2;
   9859   } else {
   9860     // Otherwise just unpack the low half of V into VLoHalf and the high half into
   9861     // VHiHalf so that we can blend them as i16s.
   9862     VLoHalf = DAG.getBitcast(
   9863         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
   9864     VHiHalf = DAG.getBitcast(
   9865         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
   9866   }
   9867 
   9868   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
   9869   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
   9870 
   9871   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
   9872 }
   9873 
   9874 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
   9875 ///
   9876 /// This routine breaks down the specific type of 128-bit shuffle and
   9877 /// dispatches to the lowering routines accordingly.
   9878 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9879                                         MVT VT, const X86Subtarget *Subtarget,
   9880                                         SelectionDAG &DAG) {
   9881   switch (VT.SimpleTy) {
   9882   case MVT::v2i64:
   9883     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9884   case MVT::v2f64:
   9885     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9886   case MVT::v4i32:
   9887     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9888   case MVT::v4f32:
   9889     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9890   case MVT::v8i16:
   9891     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9892   case MVT::v16i8:
   9893     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9894 
   9895   default:
   9896     llvm_unreachable("Unimplemented!");
   9897   }
   9898 }
   9899 
   9900 /// \brief Helper function to test whether a shuffle mask could be
   9901 /// simplified by widening the elements being shuffled.
   9902 ///
   9903 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
   9904 /// leaves it in an unspecified state.
   9905 ///
   9906 /// NOTE: This must handle normal vector shuffle masks and *target* vector
   9907 /// shuffle masks. The latter have the special property of a '-2' representing
    9908 /// a zeroed lane of a vector.
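         ///
         /// For example, the v4i32 mask <0, 1, 6, 7> widens to the v2i64 mask
         /// <0, 3>, while <1, 2, 5, 6> cannot be widened because its pairs straddle
         /// element boundaries.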
   9909 static bool canWidenShuffleElements(ArrayRef<int> Mask,
   9910                                     SmallVectorImpl<int> &WidenedMask) {
   9911   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    9912     // If both elements are undef, it's trivial.
   9913     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
   9914       WidenedMask.push_back(SM_SentinelUndef);
   9915       continue;
   9916     }
   9917 
   9918     // Check for an undef mask and a mask value properly aligned to fit with
   9919     // a pair of values. If we find such a case, use the non-undef mask's value.
   9920     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
   9921       WidenedMask.push_back(Mask[i + 1] / 2);
   9922       continue;
   9923     }
   9924     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
   9925       WidenedMask.push_back(Mask[i] / 2);
   9926       continue;
   9927     }
   9928 
   9929     // When zeroing, we need to spread the zeroing across both lanes to widen.
   9930     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
   9931       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
   9932           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
   9933         WidenedMask.push_back(SM_SentinelZero);
   9934         continue;
   9935       }
   9936       return false;
   9937     }
   9938 
   9939     // Finally check if the two mask values are adjacent and aligned with
   9940     // a pair.
   9941     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
   9942       WidenedMask.push_back(Mask[i] / 2);
   9943       continue;
   9944     }
   9945 
   9946     // Otherwise we can't safely widen the elements used in this shuffle.
   9947     return false;
   9948   }
   9949   assert(WidenedMask.size() == Mask.size() / 2 &&
   9950          "Incorrect size of mask after widening the elements!");
   9951 
   9952   return true;
   9953 }
   9954 
   9955 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
   9956 ///
   9957 /// This routine just extracts two subvectors, shuffles them independently, and
   9958 /// then concatenates them back together. This should work effectively with all
   9959 /// AVX vector shuffle types.
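         ///
         /// For example, a v8f32 shuffle is lowered as v4f32 shuffles of each half
         /// followed by a CONCAT_VECTORS, which typically selects to a VINSERTF128.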
   9960 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
   9961                                           SDValue V2, ArrayRef<int> Mask,
   9962                                           SelectionDAG &DAG) {
   9963   assert(VT.getSizeInBits() >= 256 &&
   9964          "Only for 256-bit or wider vector shuffles!");
   9965   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
   9966   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
   9967 
   9968   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
   9969   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
   9970 
   9971   int NumElements = VT.getVectorNumElements();
   9972   int SplitNumElements = NumElements / 2;
   9973   MVT ScalarVT = VT.getVectorElementType();
   9974   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
   9975 
   9976   // Rather than splitting build-vectors, just build two narrower build
   9977   // vectors. This helps shuffling with splats and zeros.
   9978   auto SplitVector = [&](SDValue V) {
   9979     while (V.getOpcode() == ISD::BITCAST)
   9980       V = V->getOperand(0);
   9981 
   9982     MVT OrigVT = V.getSimpleValueType();
   9983     int OrigNumElements = OrigVT.getVectorNumElements();
   9984     int OrigSplitNumElements = OrigNumElements / 2;
   9985     MVT OrigScalarVT = OrigVT.getVectorElementType();
   9986     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
   9987 
   9988     SDValue LoV, HiV;
   9989 
   9990     auto *BV = dyn_cast<BuildVectorSDNode>(V);
   9991     if (!BV) {
   9992       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
   9993                         DAG.getIntPtrConstant(0, DL));
   9994       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
   9995                         DAG.getIntPtrConstant(OrigSplitNumElements, DL));
   9996     } else {
   9997 
   9998       SmallVector<SDValue, 16> LoOps, HiOps;
   9999       for (int i = 0; i < OrigSplitNumElements; ++i) {
   10000         LoOps.push_back(BV->getOperand(i));
   10001         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
   10002       }
   10003       LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
   10004       HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
   10005     }
   10006     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
   10007                           DAG.getBitcast(SplitVT, HiV));
   10008   };
   10009 
   10010   SDValue LoV1, HiV1, LoV2, HiV2;
   10011   std::tie(LoV1, HiV1) = SplitVector(V1);
   10012   std::tie(LoV2, HiV2) = SplitVector(V2);
   10013 
   10014   // Now create two 4-way blends of these half-width vectors.
   10015   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
   10016     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
   10017     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
   10018     for (int i = 0; i < SplitNumElements; ++i) {
   10019       int M = HalfMask[i];
   10020       if (M >= NumElements) {
   10021         if (M >= NumElements + SplitNumElements)
   10022           UseHiV2 = true;
   10023         else
   10024           UseLoV2 = true;
   10025         V2BlendMask.push_back(M - NumElements);
   10026         V1BlendMask.push_back(-1);
   10027         BlendMask.push_back(SplitNumElements + i);
   10028       } else if (M >= 0) {
   10029         if (M >= SplitNumElements)
   10030           UseHiV1 = true;
   10031         else
   10032           UseLoV1 = true;
   10033         V2BlendMask.push_back(-1);
   10034         V1BlendMask.push_back(M);
   10035         BlendMask.push_back(i);
   10036       } else {
   10037         V2BlendMask.push_back(-1);
   10038         V1BlendMask.push_back(-1);
   10039         BlendMask.push_back(-1);
   10040       }
   10041     }
   10042 
   10043     // Because the lowering happens after all combining takes place, we need to
   10044     // manually combine these blend masks as much as possible so that we create
   10045     // a minimal number of high-level vector shuffle nodes.
   10046 
   10047     // First try just blending the halves of V1 or V2.
   10048     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
   10049       return DAG.getUNDEF(SplitVT);
   10050     if (!UseLoV2 && !UseHiV2)
   10051       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
   10052     if (!UseLoV1 && !UseHiV1)
   10053       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
   10054 
   10055     SDValue V1Blend, V2Blend;
   10056     if (UseLoV1 && UseHiV1) {
   10057       V1Blend =
   10058         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
   10059     } else {
   10060       // We only use half of V1 so map the usage down into the final blend mask.
   10061       V1Blend = UseLoV1 ? LoV1 : HiV1;
   10062       for (int i = 0; i < SplitNumElements; ++i)
   10063         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
   10064           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
   10065     }
   10066     if (UseLoV2 && UseHiV2) {
   10067       V2Blend =
   10068         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
   10069     } else {
   10070       // We only use half of V2 so map the usage down into the final blend mask.
   10071       V2Blend = UseLoV2 ? LoV2 : HiV2;
   10072       for (int i = 0; i < SplitNumElements; ++i)
   10073         if (BlendMask[i] >= SplitNumElements)
   10074           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
   10075     }
   10076     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
   10077   };
   10078   SDValue Lo = HalfBlend(LoMask);
   10079   SDValue Hi = HalfBlend(HiMask);
   10080   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
   10081 }
   10082 
   10083 /// \brief Either split a vector in halves or decompose the shuffles and the
   10084 /// blend.
   10085 ///
   10086 /// This is provided as a good fallback for many lowerings of non-single-input
   10087 /// shuffles with more than one 128-bit lane. In those cases, we want to select
   10088 /// between splitting the shuffle into 128-bit components and stitching those
   10089 /// back together vs. extracting the single-input shuffles and blending those
   10090 /// results.
   10091 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
   10092                                                 SDValue V2, ArrayRef<int> Mask,
   10093                                                 SelectionDAG &DAG) {
   10094   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
   10095                                             "lower single-input shuffles as it "
   10096                                             "could then recurse on itself.");
   10097   int Size = Mask.size();
   10098 
   10099   // If this can be modeled as a broadcast of two elements followed by a blend,
   10100   // prefer that lowering. This is especially important because broadcasts can
   10101   // often fold with memory operands.
   10102   auto DoBothBroadcast = [&] {
   10103     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
   10104     for (int M : Mask)
   10105       if (M >= Size) {
   10106         if (V2BroadcastIdx == -1)
   10107           V2BroadcastIdx = M - Size;
   10108         else if (M - Size != V2BroadcastIdx)
   10109           return false;
   10110       } else if (M >= 0) {
   10111         if (V1BroadcastIdx == -1)
   10112           V1BroadcastIdx = M;
   10113         else if (M != V1BroadcastIdx)
   10114           return false;
   10115       }
   10116     return true;
   10117   };
   10118   if (DoBothBroadcast())
   10119     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
   10120                                                       DAG);
   10121 
   10122   // If the inputs all stem from a single 128-bit lane of each input, then we
   10123   // split them rather than blending because the split will decompose to
   10124   // unusually few instructions.
   10125   int LaneCount = VT.getSizeInBits() / 128;
   10126   int LaneSize = Size / LaneCount;
   10127   SmallBitVector LaneInputs[2];
   10128   LaneInputs[0].resize(LaneCount, false);
   10129   LaneInputs[1].resize(LaneCount, false);
   10130   for (int i = 0; i < Size; ++i)
   10131     if (Mask[i] >= 0)
   10132       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
   10133   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
   10134     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   10135 
   10136   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
   10137   // that the decomposed single-input shuffles don't end up here.
   10138   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
   10139 }
   10140 
   10141 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
   10142 /// a permutation and blend of those lanes.
   10143 ///
   10144 /// This essentially blends the out-of-lane inputs to each lane into the lane
   10145 /// from a permuted copy of the vector. This lowering strategy results in four
   10146 /// instructions in the worst case for a single-input cross lane shuffle which
   10147 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
   10148 /// of. Special cases for each particular shuffle pattern should be handled
   10149 /// prior to trying this lowering.
   10150 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
   10151                                                        SDValue V1, SDValue V2,
   10152                                                        ArrayRef<int> Mask,
   10153                                                        SelectionDAG &DAG) {
   10154   // FIXME: This should probably be generalized for 512-bit vectors as well.
   10155   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
   10156   int LaneSize = Mask.size() / 2;
   10157 
   10158   // If there are only inputs from one 128-bit lane, splitting will in fact be
   10159   // less expensive. The flags track whether the given lane contains an element
   10160   // that crosses to another lane.
   10161   bool LaneCrossing[2] = {false, false};
   10162   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   10163     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
   10164       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
   10165   if (!LaneCrossing[0] || !LaneCrossing[1])
   10166     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   10167 
   10168   if (isSingleInputShuffleMask(Mask)) {
   10169     SmallVector<int, 32> FlippedBlendMask;
   10170     for (int i = 0, Size = Mask.size(); i < Size; ++i)
   10171       FlippedBlendMask.push_back(
   10172           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
   10173                                   ? Mask[i]
   10174                                   : Mask[i] % LaneSize +
   10175                                         (i / LaneSize) * LaneSize + Size));
   10176 
   10177     // Flip the vector, and blend the results which should now be in-lane. The
   10178     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
   10179     // 5 for the high source. The value 3 selects the high half of source 2 and
   10180     // the value 2 selects the low half of source 2. We only use source 2 to
   10181     // allow folding it into a memory operand.
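              // With PERMMask == 0x23 below, the low half of the result takes the
              // high half of source 2 (V1) and the high half takes the low half of
              // V1, i.e. Flipped is V1 with its 128-bit halves swapped.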
   10182     unsigned PERMMask = 3 | 2 << 4;
   10183     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
   10184                                   V1, DAG.getConstant(PERMMask, DL, MVT::i8));
   10185     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
   10186   }
   10187 
   10188   // This now reduces to two single-input shuffles of V1 and V2 which at worst
   10189   // will be handled by the above logic and a blend of the results, much like
   10190   // other patterns in AVX.
   10191   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
   10192 }
   10193 
   10194 /// \brief Handle lowering 2-lane 128-bit shuffles.
   10195 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
   10196                                         SDValue V2, ArrayRef<int> Mask,
   10197                                         const X86Subtarget *Subtarget,
   10198                                         SelectionDAG &DAG) {
    10199   // TODO: If minimizing size and one of the inputs is a zero vector and the
    10200   // zero vector has only one use, we could use a VPERM2X128 to save the
   10201   // instruction bytes needed to explicitly generate the zero vector.
   10202 
   10203   // Blends are faster and handle all the non-lane-crossing cases.
   10204   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
   10205                                                 Subtarget, DAG))
   10206     return Blend;
   10207 
   10208   bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
   10209   bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
   10210 
   10211   // If either input operand is a zero vector, use VPERM2X128 because its mask
   10212   // allows us to replace the zero input with an implicit zero.
   10213   if (!IsV1Zero && !IsV2Zero) {
   10214     // Check for patterns which can be matched with a single insert of a 128-bit
   10215     // subvector.
   10216     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
   10217     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
   10218       MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
   10219                                    VT.getVectorNumElements() / 2);
   10220       SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
   10221                                 DAG.getIntPtrConstant(0, DL));
   10222       SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
   10223                                 OnlyUsesV1 ? V1 : V2,
   10224                                 DAG.getIntPtrConstant(0, DL));
   10225       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
   10226     }
   10227   }
   10228 
   10229   // Otherwise form a 128-bit permutation. After accounting for undefs,
   10230   // convert the 64-bit shuffle mask selection values into 128-bit
   10231   // selection bits by dividing the indexes by 2 and shifting into positions
   10232   // defined by a vperm2*128 instruction's immediate control byte.
   10233 
   10234   // The immediate permute control byte looks like this:
   10235   //    [1:0] - select 128 bits from sources for low half of destination
   10236   //    [2]   - ignore
   10237   //    [3]   - zero low half of destination
   10238   //    [5:4] - select 128 bits from sources for high half of destination
   10239   //    [6]   - ignore
   10240   //    [7]   - zero high half of destination
   10241 
   10242   int MaskLO = Mask[0];
   10243   if (MaskLO == SM_SentinelUndef)
   10244     MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
   10245 
   10246   int MaskHI = Mask[2];
   10247   if (MaskHI == SM_SentinelUndef)
   10248     MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
   10249 
   10250   unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
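           // For example, the v4f64 mask <0, 1, 6, 7> gives MaskLO = 0 and MaskHI = 6,
           // so PermMask = 0 | (3 << 4) = 0x30: the low half of the result is the low
           // half of V1 and the high half is the high half of V2.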
   10251 
   10252   // If either input is a zero vector, replace it with an undef input.
   10253   // Shuffle mask values <  4 are selecting elements of V1.
   10254   // Shuffle mask values >= 4 are selecting elements of V2.
   10255   // Adjust each half of the permute mask by clearing the half that was
   10256   // selecting the zero vector and setting the zero mask bit.
   10257   if (IsV1Zero) {
   10258     V1 = DAG.getUNDEF(VT);
   10259     if (MaskLO < 4)
   10260       PermMask = (PermMask & 0xf0) | 0x08;
   10261     if (MaskHI < 4)
   10262       PermMask = (PermMask & 0x0f) | 0x80;
   10263   }
   10264   if (IsV2Zero) {
   10265     V2 = DAG.getUNDEF(VT);
   10266     if (MaskLO >= 4)
   10267       PermMask = (PermMask & 0xf0) | 0x08;
   10268     if (MaskHI >= 4)
   10269       PermMask = (PermMask & 0x0f) | 0x80;
   10270   }
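           // For example, with Mask <0, 1, 4, 5> and an all-zeros V2, PermMask starts
           // as 0x20 and becomes 0x80: the low half is taken from V1 and the high half
           // is zeroed by the instruction itself instead of reading the zero vector.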
   10271 
   10272   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
   10273                      DAG.getConstant(PermMask, DL, MVT::i8));
   10274 }
   10275 
   10276 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
   10277 /// shuffling each lane.
   10278 ///
   10279 /// This will only succeed when fixing the 128-bit lanes results in a
   10280 /// single-input, non-lane-crossing shuffle with a repeating shuffle mask in
   10281 /// each 128-bit lane. This handles many cases where we can blend away the
   10282 /// lane crossings early and then use simpler shuffles within each lane.
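         /// For example, the v8f32 mask <2, 3, 0, 1, 10, 11, 8, 9> first becomes the
         /// v4f64 lane mask <0, 1, 4, 5> (V1's low lane followed by V2's low lane) and
         /// then the repeating in-lane mask <2, 3, 0, 1, 6, 7, 4, 5>.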
   10283 ///
   10284 /// FIXME: It might be worthwhile at some point to support this without
   10285 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
   10286 /// in x86 only floating point has interesting non-repeating shuffles, and even
   10287 /// those are still *marginally* more expensive.
   10288 static SDValue lowerVectorShuffleByMerging128BitLanes(
   10289     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   10290     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   10291   assert(!isSingleInputShuffleMask(Mask) &&
   10292          "This is only useful with multiple inputs.");
   10293 
   10294   int Size = Mask.size();
   10295   int LaneSize = 128 / VT.getScalarSizeInBits();
   10296   int NumLanes = Size / LaneSize;
   10297   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
   10298 
   10299   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
   10300   // check whether the in-128-bit lane shuffles share a repeating pattern.
   10301   SmallVector<int, 4> Lanes;
   10302   Lanes.resize(NumLanes, -1);
   10303   SmallVector<int, 4> InLaneMask;
   10304   InLaneMask.resize(LaneSize, -1);
   10305   for (int i = 0; i < Size; ++i) {
   10306     if (Mask[i] < 0)
   10307       continue;
   10308 
   10309     int j = i / LaneSize;
   10310 
   10311     if (Lanes[j] < 0) {
   10312       // First entry we've seen for this lane.
   10313       Lanes[j] = Mask[i] / LaneSize;
   10314     } else if (Lanes[j] != Mask[i] / LaneSize) {
   10315       // This doesn't match the lane selected previously!
   10316       return SDValue();
   10317     }
   10318 
   10319     // Check that within each lane we have a consistent shuffle mask.
   10320     int k = i % LaneSize;
   10321     if (InLaneMask[k] < 0) {
   10322       InLaneMask[k] = Mask[i] % LaneSize;
   10323     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
   10324       // This doesn't fit a repeating in-lane mask.
   10325       return SDValue();
   10326     }
   10327   }
   10328 
   10329   // First shuffle the lanes into place.
   10330   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
   10331                                 VT.getSizeInBits() / 64);
   10332   SmallVector<int, 8> LaneMask;
   10333   LaneMask.resize(NumLanes * 2, -1);
   10334   for (int i = 0; i < NumLanes; ++i)
   10335     if (Lanes[i] >= 0) {
   10336       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
   10337       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
   10338     }
   10339 
   10340   V1 = DAG.getBitcast(LaneVT, V1);
   10341   V2 = DAG.getBitcast(LaneVT, V2);
   10342   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
   10343 
   10344   // Cast it back to the type we actually want.
   10345   LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
   10346 
   10347   // Now do a simple shuffle that isn't lane crossing.
   10348   SmallVector<int, 8> NewMask;
   10349   NewMask.resize(Size, -1);
   10350   for (int i = 0; i < Size; ++i)
   10351     if (Mask[i] >= 0)
   10352       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
   10353   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
   10354          "Must not introduce lane crosses at this point!");
   10355 
   10356   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
   10357 }
   10358 
   10359 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
   10360 /// given mask.
   10361 ///
   10362 /// This returns true if the elements from a particular input are already in
   10363 /// the slots required by the given mask and require no permutation.
   10364 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
   10365   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
   10366   int Size = Mask.size();
   10367   for (int i = 0; i < Size; ++i)
   10368     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
   10369       return false;
   10370 
   10371   return true;
   10372 }
   10373 
   10374 static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
   10375                                             ArrayRef<int> Mask, SDValue V1,
   10376                                             SDValue V2, SelectionDAG &DAG) {
   10377 
   10378   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ...
   10379   // Mask for V4F64: 0/1,  4/5,  2/3,  6/7, ...
   10380   assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
   10381   int NumElts = VT.getVectorNumElements();
   10382   bool ShufpdMask = true;
   10383   bool CommutableMask = true;
   10384   unsigned Immediate = 0;
   10385   for (int i = 0; i < NumElts; ++i) {
   10386     if (Mask[i] < 0)
   10387       continue;
   10388     int Val = (i & 6) + NumElts * (i & 1);
   10389     int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
   10390     if (Mask[i] < Val ||  Mask[i] > Val + 1)
   10391       ShufpdMask = false;
   10392     if (Mask[i] < CommutVal ||  Mask[i] > CommutVal + 1)
   10393       CommutableMask = false;
   10394     Immediate |= (Mask[i] % 2) << i;
   10395   }
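           // For example, the v4f64 mask <1, 5, 2, 7> matches the pattern above and
           // yields Immediate = 0b1011.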
   10396   if (ShufpdMask)
   10397     return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
   10398                        DAG.getConstant(Immediate, DL, MVT::i8));
   10399   if (CommutableMask)
   10400     return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
   10401                        DAG.getConstant(Immediate, DL, MVT::i8));
   10402   return SDValue();
   10403 }
   10404 
   10405 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
   10406 ///
   10407 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
   10408 /// isn't available.
   10409 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10410                                        const X86Subtarget *Subtarget,
   10411                                        SelectionDAG &DAG) {
   10412   SDLoc DL(Op);
   10413   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
   10414   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
   10415   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10416   ArrayRef<int> Mask = SVOp->getMask();
   10417   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   10418 
   10419   SmallVector<int, 4> WidenedMask;
   10420   if (canWidenShuffleElements(Mask, WidenedMask))
   10421     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
   10422                                     DAG);
   10423 
   10424   if (isSingleInputShuffleMask(Mask)) {
   10425     // Check for being able to broadcast a single element.
   10426     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
   10427                                                           Mask, Subtarget, DAG))
   10428       return Broadcast;
   10429 
   10430     // Use low duplicate instructions for masks that match their pattern.
   10431     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
   10432       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
   10433 
   10434     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
   10435       // Non-half-crossing single input shuffles can be lowered with an
   10436       // interleaved permutation.
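               // For example, the mask <1, 0, 3, 2>, which swaps the two elements
               // within each 128-bit lane, yields the immediate 0b0101.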
   10437       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
   10438                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
   10439       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
   10440                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
   10441     }
   10442 
   10443     // With AVX2 we have direct support for this permutation.
   10444     if (Subtarget->hasAVX2())
   10445       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
   10446                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   10447 
   10448     // Otherwise, fall back.
   10449     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
   10450                                                    DAG);
   10451   }
   10452 
   10453   // Use dedicated unpack instructions for masks that match their pattern.
   10454   if (SDValue V =
   10455           lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
   10456     return V;
   10457 
   10458   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
   10459                                                 Subtarget, DAG))
   10460     return Blend;
   10461 
   10462   // Check if the shuffle mask happens to exactly match the SHUFPD pattern.
   10463   if (SDValue Op =
   10464       lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
   10465     return Op;
   10466 
   10467   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   10468   // shuffle. However, if we have AVX2 and either input is already in place,
   10469   // we will be able to shuffle the other input across lanes in a single
   10470   // instruction, so skip this pattern.
   10471   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
   10472                                  isShuffleMaskInputInPlace(1, Mask))))
   10473     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   10474             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
   10475       return Result;
   10476 
   10477   // If we have AVX2 then we always want to lower with a blend because at v4 we
   10478   // can fully permute the elements.
   10479   if (Subtarget->hasAVX2())
   10480     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
   10481                                                       Mask, DAG);
   10482 
   10483   // Otherwise fall back on generic lowering.
   10484   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
   10485 }
   10486 
   10487 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
   10488 ///
   10489 /// This routine is only called when we have AVX2 and thus a reasonable
   10490 /// instruction set for v4i64 shuffling.
   10491 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10492                                        const X86Subtarget *Subtarget,
   10493                                        SelectionDAG &DAG) {
   10494   SDLoc DL(Op);
   10495   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
   10496   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
   10497   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10498   ArrayRef<int> Mask = SVOp->getMask();
   10499   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   10500   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
   10501 
   10502   SmallVector<int, 4> WidenedMask;
   10503   if (canWidenShuffleElements(Mask, WidenedMask))
   10504     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
   10505                                     DAG);
   10506 
   10507   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
   10508                                                 Subtarget, DAG))
   10509     return Blend;
   10510 
   10511   // Check for being able to broadcast a single element.
   10512   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
   10513                                                         Mask, Subtarget, DAG))
   10514     return Broadcast;
   10515 
   10516   // When the shuffle is mirrored between the 128-bit lanes of the vector, we
   10517   // can use lower latency instructions that will operate on both 128-bit lanes.
   10518   SmallVector<int, 2> RepeatedMask;
   10519   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
   10520     if (isSingleInputShuffleMask(Mask)) {
   10521       int PSHUFDMask[] = {-1, -1, -1, -1};
   10522       for (int i = 0; i < 2; ++i)
   10523         if (RepeatedMask[i] >= 0) {
   10524           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
   10525           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
   10526         }
   10527       return DAG.getBitcast(
   10528           MVT::v4i64,
   10529           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
   10530                       DAG.getBitcast(MVT::v8i32, V1),
   10531                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   10532     }
   10533   }
   10534 
   10535   // AVX2 provides a direct instruction for permuting a single input across
   10536   // lanes.
   10537   if (isSingleInputShuffleMask(Mask))
   10538     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
   10539                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   10540 
   10541   // Try to use shift instructions.
   10542   if (SDValue Shift =
   10543           lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
   10544     return Shift;
   10545 
   10546   // Use dedicated unpack instructions for masks that match their pattern.
   10547   if (SDValue V =
   10548           lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
   10549     return V;
   10550 
   10551   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   10552   // shuffle. However, if we have AVX2 and either input is already in place,
   10553   // we will be able to shuffle the other input across lanes in a single
   10554   // instruction, so skip this pattern.
   10555   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
   10556                                  isShuffleMaskInputInPlace(1, Mask))))
   10557     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   10558             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
   10559       return Result;
   10560 
   10561   // Otherwise fall back on generic blend lowering.
   10562   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
   10563                                                     Mask, DAG);
   10564 }
   10565 
   10566 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
   10567 ///
   10568 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
   10569 /// isn't available.
   10570 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10571                                        const X86Subtarget *Subtarget,
   10572                                        SelectionDAG &DAG) {
   10573   SDLoc DL(Op);
   10574   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
   10575   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
   10576   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10577   ArrayRef<int> Mask = SVOp->getMask();
   10578   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   10579 
   10580   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
   10581                                                 Subtarget, DAG))
   10582     return Blend;
   10583 
   10584   // Check for being able to broadcast a single element.
   10585   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
   10586                                                         Mask, Subtarget, DAG))
   10587     return Broadcast;
   10588 
   10589   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   10590   // options to efficiently lower the shuffle.
   10591   SmallVector<int, 4> RepeatedMask;
   10592   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
   10593     assert(RepeatedMask.size() == 4 &&
   10594            "Repeated masks must be half the mask width!");
   10595 
   10596     // Use even/odd duplicate instructions for masks that match their pattern.
   10597     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
   10598       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
   10599     if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
   10600       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
   10601 
   10602     if (isSingleInputShuffleMask(Mask))
   10603       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
   10604                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   10605 
   10606     // Use dedicated unpack instructions for masks that match their pattern.
   10607     if (SDValue V =
   10608             lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
   10609       return V;
   10610 
   10611     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
   10612     // have already handled any direct blends. We also need to squash the
   10613     // repeated mask into a simulated v4f32 mask.
   10614     for (int i = 0; i < 4; ++i)
   10615       if (RepeatedMask[i] >= 8)
   10616         RepeatedMask[i] -= 4;
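             // For example, the repeated mask <0, 1, 8, 9> becomes the simulated
             // v4f32 mask <0, 1, 4, 5>.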
   10617     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
   10618   }
   10619 
   10620   // If we have a single input shuffle with different shuffle patterns in the
   10621   // two 128-bit lanes use the variable mask to VPERMILPS.
   10622   if (isSingleInputShuffleMask(Mask)) {
   10623     SDValue VPermMask[8];
   10624     for (int i = 0; i < 8; ++i)
   10625       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
   10626                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
   10627     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
   10628       return DAG.getNode(
   10629           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
   10630           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
   10631 
   10632     if (Subtarget->hasAVX2())
   10633       return DAG.getNode(
   10634           X86ISD::VPERMV, DL, MVT::v8f32,
   10635           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
   10636 
   10637     // Otherwise, fall back.
   10638     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
   10639                                                    DAG);
   10640   }
   10641 
   10642   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   10643   // shuffle.
   10644   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   10645           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
   10646     return Result;
   10647 
   10648   // If we have AVX2 then we always want to lower with a blend because at v8 we
   10649   // can fully permute the elements.
   10650   if (Subtarget->hasAVX2())
   10651     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
   10652                                                       Mask, DAG);
   10653 
   10654   // Otherwise fall back on generic lowering.
   10655   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
   10656 }
   10657 
   10658 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
   10659 ///
   10660 /// This routine is only called when we have AVX2 and thus a reasonable
   10661 /// instruction set for v8i32 shuffling.
   10662 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10663                                        const X86Subtarget *Subtarget,
   10664                                        SelectionDAG &DAG) {
   10665   SDLoc DL(Op);
   10666   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
   10667   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
   10668   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10669   ArrayRef<int> Mask = SVOp->getMask();
   10670   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   10671   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
   10672 
   10673   // Whenever we can lower this as a zext, that instruction is strictly faster
   10674   // than any alternative. It also allows us to fold memory operands into the
   10675   // shuffle in many cases.
   10676   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
   10677                                                          Mask, Subtarget, DAG))
   10678     return ZExt;
   10679 
   10680   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
   10681                                                 Subtarget, DAG))
   10682     return Blend;
   10683 
   10684   // Check for being able to broadcast a single element.
   10685   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
   10686                                                         Mask, Subtarget, DAG))
   10687     return Broadcast;
   10688 
   10689   // If the shuffle mask is repeated in each 128-bit lane we can use more
   10690   // efficient instructions that mirror the shuffles across the two 128-bit
   10691   // lanes.
   10692   SmallVector<int, 4> RepeatedMask;
   10693   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
   10694     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
   10695     if (isSingleInputShuffleMask(Mask))
   10696       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
   10697                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   10698 
   10699     // Use dedicated unpack instructions for masks that match their pattern.
   10700     if (SDValue V =
   10701             lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
   10702       return V;
   10703   }
   10704 
   10705   // Try to use shift instructions.
   10706   if (SDValue Shift =
   10707           lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
   10708     return Shift;
   10709 
   10710   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   10711           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   10712     return Rotate;
   10713 
   10714   // If the shuffle patterns aren't repeated but it is a single input, directly
   10715   // generate a cross-lane VPERMD instruction.
   10716   if (isSingleInputShuffleMask(Mask)) {
   10717     SDValue VPermMask[8];
   10718     for (int i = 0; i < 8; ++i)
   10719       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
   10720                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
   10721     return DAG.getNode(
   10722         X86ISD::VPERMV, DL, MVT::v8i32,
   10723         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
   10724   }
   10725 
   10726   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   10727   // shuffle.
   10728   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   10729           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   10730     return Result;
   10731 
   10732   // Otherwise fall back on generic blend lowering.
   10733   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
   10734                                                     Mask, DAG);
   10735 }
   10736 
   10737 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
   10738 ///
   10739 /// This routine is only called when we have AVX2 and thus a reasonable
   10740 /// instruction set for v16i16 shuffling.
   10741 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10742                                         const X86Subtarget *Subtarget,
   10743                                         SelectionDAG &DAG) {
   10744   SDLoc DL(Op);
   10745   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
   10746   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
   10747   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10748   ArrayRef<int> Mask = SVOp->getMask();
   10749   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   10750   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
   10751 
   10752   // Whenever we can lower this as a zext, that instruction is strictly faster
   10753   // than any alternative. It also allows us to fold memory operands into the
   10754   // shuffle in many cases.
   10755   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
   10756                                                          Mask, Subtarget, DAG))
   10757     return ZExt;
   10758 
   10759   // Check for being able to broadcast a single element.
   10760   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
   10761                                                         Mask, Subtarget, DAG))
   10762     return Broadcast;
   10763 
   10764   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
   10765                                                 Subtarget, DAG))
   10766     return Blend;
   10767 
   10768   // Use dedicated unpack instructions for masks that match their pattern.
   10769   if (SDValue V =
   10770           lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
   10771     return V;
   10772 
   10773   // Try to use shift instructions.
   10774   if (SDValue Shift =
   10775           lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
   10776     return Shift;
   10777 
   10778   // Try to use byte rotation instructions.
   10779   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   10780           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   10781     return Rotate;
   10782 
   10783   if (isSingleInputShuffleMask(Mask)) {
   10784     // There are no generalized cross-lane shuffle operations available on i16
   10785     // element types.
   10786     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
   10787       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
   10788                                                      Mask, DAG);
   10789 
   10790     SmallVector<int, 8> RepeatedMask;
   10791     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
   10792       // As this is a single-input shuffle, the repeated mask should be
   10793       // a strictly valid v8i16 mask that we can pass through to the v8i16
   10794       // lowering to handle even the v16 case.
   10795       return lowerV8I16GeneralSingleInputVectorShuffle(
   10796           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
   10797     }
   10798 
   10799     SDValue PSHUFBMask[32];
   10800     for (int i = 0; i < 16; ++i) {
   10801       if (Mask[i] == -1) {
   10802         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
   10803         continue;
   10804       }
   10805 
   10806       int M = i < 8 ? Mask[i] : Mask[i] - 8;
   10807       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
   10808       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
   10809       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
   10810     }
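             // For example, the v16i16 mask entries 3 and 11 both expand to the
             // in-lane byte selectors 6 and 7.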
   10811     return DAG.getBitcast(MVT::v16i16,
   10812                           DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
   10813                                       DAG.getBitcast(MVT::v32i8, V1),
   10814                                       DAG.getNode(ISD::BUILD_VECTOR, DL,
   10815                                                   MVT::v32i8, PSHUFBMask)));
   10816   }
   10817 
   10818   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   10819   // shuffle.
   10820   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   10821           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   10822     return Result;
   10823 
   10824   // Otherwise fall back on generic lowering.
   10825   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
   10826 }
   10827 
   10828 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
   10829 ///
   10830 /// This routine is only called when we have AVX2 and thus a reasonable
   10831 /// instruction set for v32i8 shuffling.
   10832 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10833                                        const X86Subtarget *Subtarget,
   10834                                        SelectionDAG &DAG) {
   10835   SDLoc DL(Op);
   10836   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
   10837   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
   10838   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10839   ArrayRef<int> Mask = SVOp->getMask();
   10840   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   10841   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
   10842 
   10843   // Whenever we can lower this as a zext, that instruction is strictly faster
   10844   // than any alternative. It also allows us to fold memory operands into the
   10845   // shuffle in many cases.
   10846   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
   10847                                                          Mask, Subtarget, DAG))
   10848     return ZExt;
   10849 
   10850   // Check for being able to broadcast a single element.
   10851   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
   10852                                                         Mask, Subtarget, DAG))
   10853     return Broadcast;
   10854 
   10855   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
   10856                                                 Subtarget, DAG))
   10857     return Blend;
   10858 
   10859   // Use dedicated unpack instructions for masks that match their pattern.
   10860   if (SDValue V =
   10861           lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
   10862     return V;
   10863 
   10864   // Try to use shift instructions.
   10865   if (SDValue Shift =
   10866           lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
   10867     return Shift;
   10868 
   10869   // Try to use byte rotation instructions.
   10870   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   10871           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   10872     return Rotate;
   10873 
   10874   if (isSingleInputShuffleMask(Mask)) {
   10875     // There are no generalized cross-lane shuffle operations available on i8
   10876     // element types.
   10877     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
   10878       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
   10879                                                      Mask, DAG);
   10880 
   10881     SDValue PSHUFBMask[32];
   10882     for (int i = 0; i < 32; ++i)
   10883       PSHUFBMask[i] =
   10884           Mask[i] < 0
   10885               ? DAG.getUNDEF(MVT::i8)
   10886               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
   10887                                 MVT::i8);
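             // For example, the v32i8 mask entries 5 and 21 both become the in-lane
             // byte selector 5.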
   10888 
   10889     return DAG.getNode(
   10890         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
   10891         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
   10892   }
   10893 
   10894   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   10895   // shuffle.
   10896   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   10897           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   10898     return Result;
   10899 
   10900   // Otherwise fall back on generic lowering.
   10901   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
   10902 }
   10903 
   10904 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
   10905 ///
   10906 /// This routine either breaks down the specific type of a 256-bit x86 vector
   10907 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
   10908 /// together based on the available instructions.
   10909 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10910                                         MVT VT, const X86Subtarget *Subtarget,
   10911                                         SelectionDAG &DAG) {
   10912   SDLoc DL(Op);
   10913   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10914   ArrayRef<int> Mask = SVOp->getMask();
   10915 
   10916   // If we have a single input to the zero element, insert that into V1 if we
   10917   // can do so cheaply.
   10918   int NumElts = VT.getVectorNumElements();
   10919   int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
   10920     return M >= NumElts;
   10921   });
   10922 
   10923   if (NumV2Elements == 1 && Mask[0] >= NumElts)
   10924     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   10925                               DL, VT, V1, V2, Mask, Subtarget, DAG))
   10926       return Insertion;
   10927 
   10928   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
   10929   // can check for those subtargets here and avoid much of the subtarget
   10930   // querying in the per-vector-type lowering routines. With AVX1 we have
   10931   // essentially *zero* ability to manipulate a 256-bit vector with integer
   10932   // types. Since we'll use floating point types there eventually, just
   10933   // immediately cast everything to a float and operate entirely in that domain.
   10934   if (VT.isInteger() && !Subtarget->hasAVX2()) {
   10935     int ElementBits = VT.getScalarSizeInBits();
   10936     if (ElementBits < 32)
   10937       // No floating point type available, decompose into 128-bit vectors.
   10938       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   10939 
   10940     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
   10941                                 VT.getVectorNumElements());
   10942     V1 = DAG.getBitcast(FpVT, V1);
   10943     V2 = DAG.getBitcast(FpVT, V2);
   10944     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
   10945   }
   10946 
   10947   switch (VT.SimpleTy) {
   10948   case MVT::v4f64:
   10949     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10950   case MVT::v4i64:
   10951     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10952   case MVT::v8f32:
   10953     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10954   case MVT::v8i32:
   10955     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10956   case MVT::v16i16:
   10957     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10958   case MVT::v32i8:
   10959     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10960 
   10961   default:
   10962     llvm_unreachable("Not a valid 256-bit x86 vector type!");
   10963   }
   10964 }
   10965 
   10966 /// \brief Try to lower a vector shuffle as a shuffle of 128-bit lanes.
   10967 static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
   10968                                         ArrayRef<int> Mask,
   10969                                         SDValue V1, SDValue V2,
   10970                                         SelectionDAG &DAG) {
   10971   assert(VT.getScalarSizeInBits() == 64 &&
   10972          "Unexpected element type size for 128bit shuffle.");
   10973 
   10974   // Handling a 256-bit vector would require VLX, and most probably
   10975   // lowerV2X128VectorShuffle() is the better solution for that case.
   10976   assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle.");
   10977 
   10978   SmallVector<int, 4> WidenedMask;
   10979   if (!canWidenShuffleElements(Mask, WidenedMask))
   10980     return SDValue();
   10981 
   10982   // Form a 128-bit permutation.
   10983   // Convert the 64-bit shuffle mask selection values into 128-bit selection
   10984   // bits defined by a vshuf64x2 instruction's immediate control byte.
   10985   unsigned PermMask = 0, Imm = 0;
   10986   unsigned ControlBitsNum = WidenedMask.size() / 2;
   10987 
   10988   for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
   10989     if (WidenedMask[i] == SM_SentinelZero)
   10990       return SDValue();
   10991 
   10992     // Use first element in place of undef mask.
   10993     // Use the first lane (index 0) for undef mask elements.
   10994     PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
   10995   }
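           // For example, the widened mask <0, 2, 4, 6> encodes two selection bits per
           // 128-bit result lane and produces PermMask = 0x88.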
   10996 
   10997   return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
   10998                      DAG.getConstant(PermMask, DL, MVT::i8));
   10999 }
   11000 
   11001 static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
   11002                                            ArrayRef<int> Mask, SDValue V1,
   11003                                            SDValue V2, SelectionDAG &DAG) {
   11004 
   11005   assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
   11006 
   11007   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
   11008   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
   11009 
   11010   SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
   11011   if (isSingleInputShuffleMask(Mask))
   11012     return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
   11013 
   11014   return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
   11015 }
   11016 
   11017 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
   11018 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   11019                                        const X86Subtarget *Subtarget,
   11020                                        SelectionDAG &DAG) {
   11021   SDLoc DL(Op);
   11022   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
   11023   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
   11024   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   11025   ArrayRef<int> Mask = SVOp->getMask();
   11026   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   11027 
   11028   if (SDValue Shuf128 =
   11029           lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
   11030     return Shuf128;
   11031 
   11032   if (SDValue Unpck =
   11033           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
   11034     return Unpck;
   11035 
   11036   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
   11037 }
   11038 
   11039 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
   11040 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   11041                                         const X86Subtarget *Subtarget,
   11042                                         SelectionDAG &DAG) {
   11043   SDLoc DL(Op);
   11044   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
   11045   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
   11046   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   11047   ArrayRef<int> Mask = SVOp->getMask();
   11048   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   11049 
   11050   if (SDValue Unpck =
   11051           lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
   11052     return Unpck;
   11053 
   11054   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
   11055 }
   11056 
   11057 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
   11058 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   11059                                        const X86Subtarget *Subtarget,
   11060                                        SelectionDAG &DAG) {
   11061   SDLoc DL(Op);
   11062   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   11063   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   11064   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   11065   ArrayRef<int> Mask = SVOp->getMask();
   11066   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   11067 
   11068   if (SDValue Shuf128 =
   11069           lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
   11070     return Shuf128;
   11071 
   11072   if (SDValue Unpck =
   11073           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
   11074     return Unpck;
   11075 
   11076   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
   11077 }
   11078 
   11079 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
   11080 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   11081                                         const X86Subtarget *Subtarget,
   11082                                         SelectionDAG &DAG) {
   11083   SDLoc DL(Op);
   11084   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
   11085   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
   11086   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   11087   ArrayRef<int> Mask = SVOp->getMask();
   11088   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   11089 
   11090   if (SDValue Unpck =
   11091           lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
   11092     return Unpck;
   11093 
   11094   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
   11095 }
   11096 
   11097 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
   11098 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   11099                                         const X86Subtarget *Subtarget,
   11100                                         SelectionDAG &DAG) {
   11101   SDLoc DL(Op);
   11102   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
   11103   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
   11104   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   11105   ArrayRef<int> Mask = SVOp->getMask();
   11106   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   11107   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
   11108 
   11109   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
   11110 }
   11111 
   11112 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
   11113 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   11114                                        const X86Subtarget *Subtarget,
   11115                                        SelectionDAG &DAG) {
   11116   SDLoc DL(Op);
   11117   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
   11118   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
   11119   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   11120   ArrayRef<int> Mask = SVOp->getMask();
   11121   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
   11122   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
   11123 
   11124   // FIXME: Implement direct support for this type!
   11125   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
   11126 }
   11127 
   11128 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
   11129 ///
   11130 /// This routine either breaks down the specific type of a 512-bit x86 vector
   11131 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
   11132 /// together based on the available instructions.
   11133 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   11134                                         MVT VT, const X86Subtarget *Subtarget,
   11135                                         SelectionDAG &DAG) {
   11136   SDLoc DL(Op);
   11137   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   11138   ArrayRef<int> Mask = SVOp->getMask();
   11139   assert(Subtarget->hasAVX512() &&
   11140          "Cannot lower 512-bit vectors w/o basic ISA!");
   11141 
   11142   // Check for being able to broadcast a single element.
   11143   if (SDValue Broadcast =
   11144           lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
   11145     return Broadcast;
   11146 
   11147   // Dispatch to each element type for lowering. If we don't have support for
   11148   // specific element type shuffles at 512 bits, immediately split them and
   11149   // lower them. Each lowering routine of a given type is allowed to assume that
   11150   // the requisite ISA extensions for that element type are available.
   11151   switch (VT.SimpleTy) {
   11152   case MVT::v8f64:
   11153     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   11154   case MVT::v16f32:
   11155     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   11156   case MVT::v8i64:
   11157     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   11158   case MVT::v16i32:
   11159     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   11160   case MVT::v32i16:
   11161     if (Subtarget->hasBWI())
   11162       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
   11163     break;
   11164   case MVT::v64i8:
   11165     if (Subtarget->hasBWI())
   11166       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
   11167     break;
   11168 
   11169   default:
   11170     llvm_unreachable("Not a valid 512-bit x86 vector type!");
   11171   }
   11172 
   11173   // Otherwise fall back on splitting.
   11174   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   11175 }
   11176 
   11177 // Lower vXi1 vector shuffles.
   11178 // There is no dedicated instruction on AVX-512 that shuffles the masks.
   11179 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
   11180 // vector, shuffle it, and then truncate it back.
   11181 static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   11182                                       MVT VT, const X86Subtarget *Subtarget,
   11183                                       SelectionDAG &DAG) {
   11184   SDLoc DL(Op);
   11185   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   11186   ArrayRef<int> Mask = SVOp->getMask();
   11187   assert(Subtarget->hasAVX512() &&
   11188          "Cannot lower 512-bit vectors w/o basic ISA!");
   11189   MVT ExtVT;
   11190   switch (VT.SimpleTy) {
   11191   default:
   11192     llvm_unreachable("Expected a vector of i1 elements");
   11193   case MVT::v2i1:
   11194     ExtVT = MVT::v2i64;
   11195     break;
   11196   case MVT::v4i1:
   11197     ExtVT = MVT::v4i32;
   11198     break;
   11199   case MVT::v8i1:
   11200     ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
   11201     break;
   11202   case MVT::v16i1:
   11203     ExtVT = MVT::v16i32;
   11204     break;
   11205   case MVT::v32i1:
   11206     ExtVT = MVT::v32i16;
   11207     break;
   11208   case MVT::v64i1:
   11209     ExtVT = MVT::v64i8;
   11210     break;
   11211   }
   11212 
   11213   if (ISD::isBuildVectorAllZeros(V1.getNode()))
   11214     V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
   11215   else if (ISD::isBuildVectorAllOnes(V1.getNode()))
   11216     V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
   11217   else
   11218     V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
   11219 
   11220   if (V2.isUndef())
   11221     V2 = DAG.getUNDEF(ExtVT);
   11222   else if (ISD::isBuildVectorAllZeros(V2.getNode()))
   11223     V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
   11224   else if (ISD::isBuildVectorAllOnes(V2.getNode()))
   11225     V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
   11226   else
   11227     V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
   11228   return DAG.getNode(ISD::TRUNCATE, DL, VT,
   11229                      DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
   11230 }
   11231 /// \brief Top-level lowering for x86 vector shuffles.
   11232 ///
   11233 /// This handles decomposition, canonicalization, and lowering of all x86
   11234 /// vector shuffles. Most of the specific lowering strategies are encapsulated
   11235 /// above in helper routines. The canonicalization attempts to widen shuffles
   11236 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
   11237 /// s.t. only one of the two inputs needs to be tested, etc.
   11238 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   11239                                   SelectionDAG &DAG) {
   11240   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   11241   ArrayRef<int> Mask = SVOp->getMask();
   11242   SDValue V1 = Op.getOperand(0);
   11243   SDValue V2 = Op.getOperand(1);
   11244   MVT VT = Op.getSimpleValueType();
   11245   int NumElements = VT.getVectorNumElements();
   11246   SDLoc dl(Op);
   11247   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
   11248 
   11249   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
   11250          "Can't lower MMX shuffles");
   11251 
   11252   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   11253   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   11254   if (V1IsUndef && V2IsUndef)
   11255     return DAG.getUNDEF(VT);
   11256 
   11257   // When we create a shuffle node we put the UNDEF node to second operand,
   11258   // but in some cases the first operand may be transformed to UNDEF.
   11259   // In this case we should just commute the node.
   11260   if (V1IsUndef)
   11261     return DAG.getCommutedVectorShuffle(*SVOp);
   11262 
   11263   // Check for non-undef masks pointing at an undef vector and make the masks
   11264   // undef as well. This makes it easier to match the shuffle based solely on
   11265   // the mask.
   11266   if (V2IsUndef)
   11267     for (int M : Mask)
   11268       if (M >= NumElements) {
   11269         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
   11270         for (int &M : NewMask)
   11271           if (M >= NumElements)
   11272             M = -1;
   11273         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
   11274       }
   11275 
   11276   // We actually see shuffles that are entirely re-arrangements of a set of
   11277   // zero inputs. This mostly happens while decomposing complex shuffles into
   11278   // simple ones. Directly lower these as a buildvector of zeros.
   11279   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   11280   if (Zeroable.all())
   11281     return getZeroVector(VT, Subtarget, DAG, dl);
   11282 
   11283   // Try to collapse shuffles into using a vector type with fewer elements but
   11284   // wider element types. We cap this to not form integers or floating point
   11285   // elements wider than 64 bits, but it might be interesting to form i128
   11286   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
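           // For example, a v4i32 mask of <0, 1, 4, 5> widens to the v2i64 mask <0, 2>.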
   11287   SmallVector<int, 16> WidenedMask;
   11288   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
   11289       canWidenShuffleElements(Mask, WidenedMask)) {
   11290     MVT NewEltVT = VT.isFloatingPoint()
   11291                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
   11292                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
   11293     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
   11294     // Make sure that the new vector type is legal. For example, v2f64 isn't
   11295     // legal on SSE1.
   11296     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
   11297       V1 = DAG.getBitcast(NewVT, V1);
   11298       V2 = DAG.getBitcast(NewVT, V2);
   11299       return DAG.getBitcast(
   11300           VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
   11301     }
   11302   }
   11303 
   11304   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
   11305   for (int M : SVOp->getMask())
   11306     if (M < 0)
   11307       ++NumUndefElements;
   11308     else if (M < NumElements)
   11309       ++NumV1Elements;
   11310     else
   11311       ++NumV2Elements;
   11312 
   11313   // Commute the shuffle as needed such that more elements come from V1 than
   11314   // V2. This allows us to match the shuffle pattern strictly on how many
   11315   // elements come from V1 without handling the symmetric cases.
   11316   if (NumV2Elements > NumV1Elements)
   11317     return DAG.getCommutedVectorShuffle(*SVOp);
   11318 
   11319   // When the number of V1 and V2 elements is the same, try to minimize the
   11320   // number of uses of V2 in the low half of the vector. When that is tied,
   11321   // ensure that the sum of indices for V1 is equal to or lower than the sum of
   11322   // indices for V2. When those are equal, try to ensure that the number of odd
   11323   // indices for V1 is lower than the number of odd indices for V2.
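           // For example, the v4i32 mask <4, 5, 0, 1> has both of its V2 elements in
           // the low half and is therefore commuted to <0, 1, 4, 5>.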
   11324   if (NumV1Elements == NumV2Elements) {
   11325     int LowV1Elements = 0, LowV2Elements = 0;
   11326     for (int M : SVOp->getMask().slice(0, NumElements / 2))
   11327       if (M >= NumElements)
   11328         ++LowV2Elements;
   11329       else if (M >= 0)
   11330         ++LowV1Elements;
   11331     if (LowV2Elements > LowV1Elements) {
   11332       return DAG.getCommutedVectorShuffle(*SVOp);
   11333     } else if (LowV2Elements == LowV1Elements) {
   11334       int SumV1Indices = 0, SumV2Indices = 0;
   11335       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
   11336         if (SVOp->getMask()[i] >= NumElements)
   11337           SumV2Indices += i;
   11338         else if (SVOp->getMask()[i] >= 0)
   11339           SumV1Indices += i;
   11340       if (SumV2Indices < SumV1Indices) {
   11341         return DAG.getCommutedVectorShuffle(*SVOp);
   11342       } else if (SumV2Indices == SumV1Indices) {
   11343         int NumV1OddIndices = 0, NumV2OddIndices = 0;
   11344         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
   11345           if (SVOp->getMask()[i] >= NumElements)
   11346             NumV2OddIndices += i % 2;
   11347           else if (SVOp->getMask()[i] >= 0)
   11348             NumV1OddIndices += i % 2;
   11349         if (NumV2OddIndices < NumV1OddIndices)
   11350           return DAG.getCommutedVectorShuffle(*SVOp);
   11351       }
   11352     }
   11353   }
   11354 
   11355   // For each vector width, delegate to a specialized lowering routine.
   11356   if (VT.is128BitVector())
   11357     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
   11358 
   11359   if (VT.is256BitVector())
   11360     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
   11361 
   11362   if (VT.is512BitVector())
   11363     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
   11364 
   11365   if (Is1BitVector)
   11366     return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
   11367   llvm_unreachable("Unimplemented!");
   11368 }
   11369 
    11370 // This function assumes its argument is a BUILD_VECTOR of constants or
    11371 // undef SDNodes, i.e. ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
    11372 // true.
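          // For example, a v4i32 condition of <-1, 0, -1, 0> produces MaskValue
          // 0b1010: bit i is set when element i should come from the second
          // argument. For v16i16 (two lanes of eight) the low-lane bits are
          // mirrored into the high lane, i.e. the result is
          // MaskValue | (MaskValue << 8).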
   11373 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
   11374                                     unsigned &MaskValue) {
   11375   MaskValue = 0;
   11376   unsigned NumElems = BuildVector->getNumOperands();
   11377 
   11378   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
   11379   // We don't handle the >2 lanes case right now.
   11380   unsigned NumLanes = (NumElems - 1) / 8 + 1;
   11381   if (NumLanes > 2)
   11382     return false;
   11383 
   11384   unsigned NumElemsInLane = NumElems / NumLanes;
   11385 
    11386   // Blend for v16i16 should be symmetric for both lanes.
   11387   for (unsigned i = 0; i < NumElemsInLane; ++i) {
   11388     SDValue EltCond = BuildVector->getOperand(i);
   11389     SDValue SndLaneEltCond =
   11390         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
   11391 
   11392     int Lane1Cond = -1, Lane2Cond = -1;
   11393     if (isa<ConstantSDNode>(EltCond))
   11394       Lane1Cond = !isNullConstant(EltCond);
   11395     if (isa<ConstantSDNode>(SndLaneEltCond))
   11396       Lane2Cond = !isNullConstant(SndLaneEltCond);
   11397 
   11398     unsigned LaneMask = 0;
   11399     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
    11400       // Lane1Cond != 0 means we want the first argument.
    11401       // Lane1Cond == 0 means we want the second argument.
    11402       // The encoding of this argument is 0 for the first argument and 1
    11403       // for the second, so invert the condition.
   11404       LaneMask = !Lane1Cond << i;
   11405     else if (Lane1Cond < 0)
   11406       LaneMask = !Lane2Cond << i;
   11407     else
   11408       return false;
   11409 
   11410     MaskValue |= LaneMask;
   11411     if (NumLanes == 2)
   11412       MaskValue |= LaneMask << NumElemsInLane;
   11413   }
   11414   return true;
   11415 }
   11416 
   11417 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
   11418 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
   11419                                            const X86Subtarget *Subtarget,
   11420                                            SelectionDAG &DAG) {
   11421   SDValue Cond = Op.getOperand(0);
   11422   SDValue LHS = Op.getOperand(1);
   11423   SDValue RHS = Op.getOperand(2);
   11424   SDLoc dl(Op);
   11425   MVT VT = Op.getSimpleValueType();
   11426 
   11427   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
   11428     return SDValue();
   11429   auto *CondBV = cast<BuildVectorSDNode>(Cond);
   11430 
    11431   // Only non-legal VSELECTs reach this lowering; convert those into generic
    11432   // shuffles and re-use the shuffle lowering path for blends.
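            // For example, a v4i32 condition of <-1, 0, undef, -1> becomes the
            // shuffle mask <0, 5, -1, 3>: a true lane selects element i of LHS, a
            // false lane selects element i + Size of RHS, and a non-constant lane
            // becomes undef (-1).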
   11433   SmallVector<int, 32> Mask;
   11434   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
   11435     SDValue CondElt = CondBV->getOperand(i);
   11436     Mask.push_back(
   11437         isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
   11438                                      : -1);
   11439   }
   11440   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
   11441 }
   11442 
   11443 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   11444   // A vselect where all conditions and data are constants can be optimized into
   11445   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   11446   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
   11447       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
   11448       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
   11449     return SDValue();
   11450 
   11451   // Try to lower this to a blend-style vector shuffle. This can handle all
   11452   // constant condition cases.
   11453   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
   11454     return BlendOp;
   11455 
   11456   // Variable blends are only legal from SSE4.1 onward.
   11457   if (!Subtarget->hasSSE41())
   11458     return SDValue();
   11459 
    11460   // Only some types will be legal on some subtargets. If we can emit a legal
    11461   // VSELECT-matching blend, return Op; but if we need to expand, return
    11462   // a null value.
   11463   switch (Op.getSimpleValueType().SimpleTy) {
   11464   default:
   11465     // Most of the vector types have blends past SSE4.1.
   11466     return Op;
   11467 
   11468   case MVT::v32i8:
   11469     // The byte blends for AVX vectors were introduced only in AVX2.
   11470     if (Subtarget->hasAVX2())
   11471       return Op;
   11472 
   11473     return SDValue();
   11474 
   11475   case MVT::v8i16:
   11476   case MVT::v16i16:
   11477     // AVX-512 BWI and VLX features support VSELECT with i16 elements.
   11478     if (Subtarget->hasBWI() && Subtarget->hasVLX())
   11479       return Op;
   11480 
   11481     // FIXME: We should custom lower this by fixing the condition and using i8
   11482     // blends.
   11483     return SDValue();
   11484   }
   11485 }
   11486 
   11487 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   11488   MVT VT = Op.getSimpleValueType();
   11489   SDLoc dl(Op);
   11490 
   11491   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
   11492     return SDValue();
   11493 
   11494   if (VT.getSizeInBits() == 8) {
   11495     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
   11496                                   Op.getOperand(0), Op.getOperand(1));
   11497     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   11498                                   DAG.getValueType(VT));
   11499     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   11500   }
   11501 
   11502   if (VT.getSizeInBits() == 16) {
   11503     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
   11504     if (isNullConstant(Op.getOperand(1)))
   11505       return DAG.getNode(
   11506           ISD::TRUNCATE, dl, MVT::i16,
   11507           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   11508                       DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
   11509                       Op.getOperand(1)));
   11510     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
   11511                                   Op.getOperand(0), Op.getOperand(1));
   11512     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   11513                                   DAG.getValueType(VT));
   11514     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   11515   }
   11516 
   11517   if (VT == MVT::f32) {
   11518     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
   11519     // the result back to FR32 register. It's only worth matching if the
   11520     // result has a single use which is a store or a bitcast to i32.  And in
   11521     // the case of a store, it's not worth it if the index is a constant 0,
   11522     // because a MOVSSmr can be used instead, which is smaller and faster.
   11523     if (!Op.hasOneUse())
   11524       return SDValue();
   11525     SDNode *User = *Op.getNode()->use_begin();
   11526     if ((User->getOpcode() != ISD::STORE ||
   11527          isNullConstant(Op.getOperand(1))) &&
   11528         (User->getOpcode() != ISD::BITCAST ||
   11529          User->getValueType(0) != MVT::i32))
   11530       return SDValue();
   11531     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   11532                                   DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
   11533                                   Op.getOperand(1));
   11534     return DAG.getBitcast(MVT::f32, Extract);
   11535   }
   11536 
   11537   if (VT == MVT::i32 || VT == MVT::i64) {
    11538     // EXTRACTPS/PEXTRQ work with a constant index.
   11539     if (isa<ConstantSDNode>(Op.getOperand(1)))
   11540       return Op;
   11541   }
   11542   return SDValue();
   11543 }
   11544 
   11545 /// Extract one bit from mask vector, like v16i1 or v8i1.
   11546 /// AVX-512 feature.
   11547 SDValue
   11548 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
   11549   SDValue Vec = Op.getOperand(0);
   11550   SDLoc dl(Vec);
   11551   MVT VecVT = Vec.getSimpleValueType();
   11552   SDValue Idx = Op.getOperand(1);
   11553   MVT EltVT = Op.getSimpleValueType();
   11554 
   11555   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
   11556   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
   11557          "Unexpected vector type in ExtractBitFromMaskVector");
   11558 
    11559   // A variable index can't be handled in mask registers;
    11560   // extend the vector to VR512.
   11561   if (!isa<ConstantSDNode>(Idx)) {
   11562     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
   11563     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
   11564     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   11565                               ExtVT.getVectorElementType(), Ext, Idx);
   11566     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
   11567   }
   11568 
   11569   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   11570   const TargetRegisterClass* rc = getRegClassFor(VecVT);
   11571   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
   11572     rc = getRegClassFor(MVT::v16i1);
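            // Shift the requested bit up into the most significant bit of the mask
            // register and then shift it back down to bit 0, where it can be
            // extracted as element 0. E.g. for a 16-bit mask and IdxVal == 3 this
            // is a left shift by 12 followed by a right shift by 15.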
    11573   unsigned MaxShift = rc->getSize()*8 - 1;
    11574   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
    11575                     DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
    11576   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
    11577                     DAG.getConstant(MaxShift, dl, MVT::i8));
   11578   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
   11579                        DAG.getIntPtrConstant(0, dl));
   11580 }
   11581 
   11582 SDValue
   11583 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   11584                                            SelectionDAG &DAG) const {
   11585   SDLoc dl(Op);
   11586   SDValue Vec = Op.getOperand(0);
   11587   MVT VecVT = Vec.getSimpleValueType();
   11588   SDValue Idx = Op.getOperand(1);
   11589 
   11590   if (Op.getSimpleValueType() == MVT::i1)
   11591     return ExtractBitFromMaskVector(Op, DAG);
   11592 
   11593   if (!isa<ConstantSDNode>(Idx)) {
   11594     if (VecVT.is512BitVector() ||
   11595         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
   11596          VecVT.getVectorElementType().getSizeInBits() == 32)) {
   11597 
   11598       MVT MaskEltVT =
   11599         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
   11600       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
   11601                                     MaskEltVT.getSizeInBits());
   11602 
   11603       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
   11604       auto PtrVT = getPointerTy(DAG.getDataLayout());
   11605       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
   11606                                  getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
   11607                                  DAG.getConstant(0, dl, PtrVT));
   11608       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
   11609       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
   11610                          DAG.getConstant(0, dl, PtrVT));
   11611     }
   11612     return SDValue();
   11613   }
   11614 
    11615   // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
    11616   // subvector containing the element and then extract the element from it.
   11617   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
   11618 
   11619     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   11620     // Get the 128-bit vector.
   11621     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
   11622     MVT EltVT = VecVT.getVectorElementType();
   11623 
   11624     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
   11625     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
   11626 
   11627     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
   11628     // this can be done with a mask.
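              // E.g. extracting element 5 from a v8f32: the high 128-bit half
              // (elements 4-7) is extracted first, and then element 5 & 3 == 1 is
              // extracted from the resulting v4f32.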
   11629     IdxVal &= ElemsPerChunk - 1;
   11630     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
   11631                        DAG.getConstant(IdxVal, dl, MVT::i32));
   11632   }
   11633 
   11634   assert(VecVT.is128BitVector() && "Unexpected vector length");
   11635 
   11636   if (Subtarget->hasSSE41())
   11637     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
   11638       return Res;
   11639 
   11640   MVT VT = Op.getSimpleValueType();
   11641   // TODO: handle v16i8.
   11642   if (VT.getSizeInBits() == 16) {
   11643     SDValue Vec = Op.getOperand(0);
   11644     if (isNullConstant(Op.getOperand(1)))
   11645       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   11646                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   11647                                      DAG.getBitcast(MVT::v4i32, Vec),
   11648                                      Op.getOperand(1)));
    11649     // Transform it so it matches pextrw, which produces a 32-bit result.
   11650     MVT EltVT = MVT::i32;
   11651     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
   11652                                   Op.getOperand(0), Op.getOperand(1));
   11653     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
   11654                                   DAG.getValueType(VT));
   11655     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   11656   }
   11657 
   11658   if (VT.getSizeInBits() == 32) {
   11659     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   11660     if (Idx == 0)
   11661       return Op;
   11662 
   11663     // SHUFPS the element to the lowest double word, then movss.
   11664     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
   11665     MVT VVT = Op.getOperand(0).getSimpleValueType();
   11666     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   11667                                        DAG.getUNDEF(VVT), Mask);
   11668     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   11669                        DAG.getIntPtrConstant(0, dl));
   11670   }
   11671 
   11672   if (VT.getSizeInBits() == 64) {
   11673     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
   11674     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
   11675     //        to match extract_elt for f64.
   11676     if (isNullConstant(Op.getOperand(1)))
   11677       return Op;
   11678 
   11679     // UNPCKHPD the element to the lowest double word, then movsd.
   11680     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
   11681     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
   11682     int Mask[2] = { 1, -1 };
   11683     MVT VVT = Op.getOperand(0).getSimpleValueType();
   11684     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   11685                                        DAG.getUNDEF(VVT), Mask);
   11686     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   11687                        DAG.getIntPtrConstant(0, dl));
   11688   }
   11689 
   11690   return SDValue();
   11691 }
   11692 
   11693 /// Insert one bit to mask vector, like v16i1 or v8i1.
   11694 /// AVX-512 feature.
   11695 SDValue
   11696 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
   11697   SDLoc dl(Op);
   11698   SDValue Vec = Op.getOperand(0);
   11699   SDValue Elt = Op.getOperand(1);
   11700   SDValue Idx = Op.getOperand(2);
   11701   MVT VecVT = Vec.getSimpleValueType();
   11702 
   11703   if (!isa<ConstantSDNode>(Idx)) {
    11704     // Non-constant index. Extend the source and destination, insert the
    11705     // element, and then truncate the result.
   11706     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
   11707     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
   11708     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
   11709       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
   11710       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
   11711     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
   11712   }
   11713 
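            // Constant index: materialize the bit in element 0, shift it into
            // position IdxVal, and OR it into the existing mask vector. E.g. for
            // IdxVal == 2 the SCALAR_TO_VECTOR result is shifted left by 2 before
            // the OR.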
   11714   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   11715   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
   11716   if (IdxVal)
   11717     EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
   11718                            DAG.getConstant(IdxVal, dl, MVT::i8));
   11719   if (Vec.getOpcode() == ISD::UNDEF)
   11720     return EltInVec;
   11721   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
   11722 }
   11723 
   11724 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   11725                                                   SelectionDAG &DAG) const {
   11726   MVT VT = Op.getSimpleValueType();
   11727   MVT EltVT = VT.getVectorElementType();
   11728 
   11729   if (EltVT == MVT::i1)
   11730     return InsertBitToMaskVector(Op, DAG);
   11731 
   11732   SDLoc dl(Op);
   11733   SDValue N0 = Op.getOperand(0);
   11734   SDValue N1 = Op.getOperand(1);
   11735   SDValue N2 = Op.getOperand(2);
   11736   if (!isa<ConstantSDNode>(N2))
   11737     return SDValue();
   11738   auto *N2C = cast<ConstantSDNode>(N2);
   11739   unsigned IdxVal = N2C->getZExtValue();
   11740 
   11741   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
   11742   // into that, and then insert the subvector back into the result.
   11743   if (VT.is256BitVector() || VT.is512BitVector()) {
   11744     // With a 256-bit vector, we can insert into the zero element efficiently
   11745     // using a blend if we have AVX or AVX2 and the right data type.
   11746     if (VT.is256BitVector() && IdxVal == 0) {
   11747       // TODO: It is worthwhile to cast integer to floating point and back
   11748       // and incur a domain crossing penalty if that's what we'll end up
   11749       // doing anyway after extracting to a 128-bit vector.
   11750       if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
   11751           (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
   11752         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
   11753         N2 = DAG.getIntPtrConstant(1, dl);
   11754         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
   11755       }
   11756     }
   11757 
   11758     // Get the desired 128-bit vector chunk.
   11759     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
   11760 
   11761     // Insert the element into the desired chunk.
   11762     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
   11763     assert(isPowerOf2_32(NumEltsIn128));
   11764     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
   11765     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
   11766 
   11767     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
   11768                     DAG.getConstant(IdxIn128, dl, MVT::i32));
   11769 
   11770     // Insert the changed part back into the bigger vector
   11771     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
   11772   }
   11773   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
   11774 
   11775   if (Subtarget->hasSSE41()) {
   11776     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
   11777       unsigned Opc;
   11778       if (VT == MVT::v8i16) {
   11779         Opc = X86ISD::PINSRW;
   11780       } else {
   11781         assert(VT == MVT::v16i8);
   11782         Opc = X86ISD::PINSRB;
   11783       }
   11784 
    11785       // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
   11786       // argument.
   11787       if (N1.getValueType() != MVT::i32)
   11788         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   11789       if (N2.getValueType() != MVT::i32)
   11790         N2 = DAG.getIntPtrConstant(IdxVal, dl);
   11791       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
   11792     }
   11793 
   11794     if (EltVT == MVT::f32) {
   11795       // Bits [7:6] of the constant are the source select. This will always be
   11796       //   zero here. The DAG Combiner may combine an extract_elt index into
   11797       //   these bits. For example (insert (extract, 3), 2) could be matched by
   11798       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
   11799       // Bits [5:4] of the constant are the destination select. This is the
   11800       //   value of the incoming immediate.
   11801       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
   11802       //   combine either bitwise AND or insert of float 0.0 to set these bits.
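                // E.g. an insertion into element 2 with no zeroing uses the
                // immediate 0x20 (bits [5:4] == 2), which is what the
                // (IdxVal << 4) below produces.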
   11803 
   11804       bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
   11805       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
   11806         // If this is an insertion of 32-bits into the low 32-bits of
   11807         // a vector, we prefer to generate a blend with immediate rather
   11808         // than an insertps. Blends are simpler operations in hardware and so
   11809         // will always have equal or better performance than insertps.
   11810         // But if optimizing for size and there's a load folding opportunity,
   11811         // generate insertps because blendps does not have a 32-bit memory
   11812         // operand form.
   11813         N2 = DAG.getIntPtrConstant(1, dl);
   11814         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   11815         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
   11816       }
   11817       N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
    11818       // Create this as a scalar to vector.
   11819       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   11820       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
   11821     }
   11822 
   11823     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
   11824       // PINSR* works with constant index.
   11825       return Op;
   11826     }
   11827   }
   11828 
   11829   if (EltVT == MVT::i8)
   11830     return SDValue();
   11831 
   11832   if (EltVT.getSizeInBits() == 16) {
    11833     // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
   11834     // as its second argument.
   11835     if (N1.getValueType() != MVT::i32)
   11836       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   11837     if (N2.getValueType() != MVT::i32)
   11838       N2 = DAG.getIntPtrConstant(IdxVal, dl);
   11839     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
   11840   }
   11841   return SDValue();
   11842 }
   11843 
   11844 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   11845   SDLoc dl(Op);
   11846   MVT OpVT = Op.getSimpleValueType();
   11847 
    11848   // If the result is wider than 128 bits, first insert into a 128-bit
    11849   // vector and then insert that into the full-width vector.
   11850   if (!OpVT.is128BitVector()) {
   11851     // Insert into a 128-bit vector.
   11852     unsigned SizeFactor = OpVT.getSizeInBits()/128;
   11853     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
   11854                                  OpVT.getVectorNumElements() / SizeFactor);
   11855 
   11856     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
   11857 
   11858     // Insert the 128-bit vector.
   11859     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
   11860   }
   11861 
   11862   if (OpVT == MVT::v1i64 &&
   11863       Op.getOperand(0).getValueType() == MVT::i64)
   11864     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
   11865 
   11866   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
   11867   assert(OpVT.is128BitVector() && "Expected an SSE type!");
   11868   return DAG.getBitcast(
   11869       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
   11870 }
   11871 
   11872 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
   11873 // a simple subregister reference or explicit instructions to grab
   11874 // upper bits of a vector.
   11875 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   11876                                       SelectionDAG &DAG) {
   11877   SDLoc dl(Op);
   11878   SDValue In =  Op.getOperand(0);
   11879   SDValue Idx = Op.getOperand(1);
   11880   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   11881   MVT ResVT   = Op.getSimpleValueType();
   11882   MVT InVT    = In.getSimpleValueType();
   11883 
   11884   if (Subtarget->hasFp256()) {
   11885     if (ResVT.is128BitVector() &&
   11886         (InVT.is256BitVector() || InVT.is512BitVector()) &&
   11887         isa<ConstantSDNode>(Idx)) {
   11888       return Extract128BitVector(In, IdxVal, DAG, dl);
   11889     }
   11890     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
   11891         isa<ConstantSDNode>(Idx)) {
   11892       return Extract256BitVector(In, IdxVal, DAG, dl);
   11893     }
   11894   }
   11895   return SDValue();
   11896 }
   11897 
   11898 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
   11899 // simple superregister reference or explicit instructions to insert
   11900 // the upper bits of a vector.
   11901 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   11902                                      SelectionDAG &DAG) {
   11903   if (!Subtarget->hasAVX())
   11904     return SDValue();
   11905 
   11906   SDLoc dl(Op);
   11907   SDValue Vec = Op.getOperand(0);
   11908   SDValue SubVec = Op.getOperand(1);
   11909   SDValue Idx = Op.getOperand(2);
   11910 
   11911   if (!isa<ConstantSDNode>(Idx))
   11912     return SDValue();
   11913 
   11914   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   11915   MVT OpVT = Op.getSimpleValueType();
   11916   MVT SubVecVT = SubVec.getSimpleValueType();
   11917 
   11918   // Fold two 16-byte subvector loads into one 32-byte load:
   11919   // (insert_subvector (insert_subvector undef, (load addr), 0),
   11920   //                   (load addr + 16), Elts/2)
   11921   // --> load32 addr
   11922   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
   11923       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
   11924       OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
   11925     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
   11926     if (Idx2 && Idx2->getZExtValue() == 0) {
   11927       SDValue SubVec2 = Vec.getOperand(1);
   11928       // If needed, look through a bitcast to get to the load.
   11929       if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST)
   11930         SubVec2 = SubVec2.getOperand(0);
   11931 
   11932       if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
   11933         bool Fast;
   11934         unsigned Alignment = FirstLd->getAlignment();
   11935         unsigned AS = FirstLd->getAddressSpace();
   11936         const X86TargetLowering *TLI = Subtarget->getTargetLowering();
   11937         if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
   11938                                     OpVT, AS, Alignment, &Fast) && Fast) {
   11939           SDValue Ops[] = { SubVec2, SubVec };
   11940           if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
   11941             return Ld;
   11942         }
   11943       }
   11944     }
   11945   }
   11946 
   11947   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
   11948       SubVecVT.is128BitVector())
   11949     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
   11950 
   11951   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
   11952     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
   11953 
   11954   if (OpVT.getVectorElementType() == MVT::i1)
   11955     return Insert1BitVector(Op, DAG);
   11956 
   11957   return SDValue();
   11958 }
   11959 
    11960 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
    11961 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
    11962 // one of the above-mentioned nodes. It has to be wrapped because otherwise
    11963 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
    11964 // be used to form an addressing mode. These wrapped nodes will be selected
    11965 // into MOV32ri.
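          // For example, in 32-bit GOT-style PIC a constant-pool reference becomes
          // roughly (add GlobalBaseReg, (X86ISD::Wrapper tconstpool@GOTOFF)), which
          // is the "$g + Offset" form mentioned below.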
   11966 SDValue
   11967 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   11968   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   11969 
   11970   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   11971   // global base reg.
   11972   unsigned char OpFlag = 0;
   11973   unsigned WrapperKind = X86ISD::Wrapper;
   11974   CodeModel::Model M = DAG.getTarget().getCodeModel();
   11975 
   11976   if (Subtarget->isPICStyleRIPRel() &&
   11977       (M == CodeModel::Small || M == CodeModel::Kernel))
   11978     WrapperKind = X86ISD::WrapperRIP;
   11979   else if (Subtarget->isPICStyleGOT())
   11980     OpFlag = X86II::MO_GOTOFF;
   11981   else if (Subtarget->isPICStyleStubPIC())
   11982     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   11983 
   11984   auto PtrVT = getPointerTy(DAG.getDataLayout());
   11985   SDValue Result = DAG.getTargetConstantPool(
   11986       CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
   11987   SDLoc DL(CP);
   11988   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
   11989   // With PIC, the address is actually $g + Offset.
   11990   if (OpFlag) {
   11991     Result =
   11992         DAG.getNode(ISD::ADD, DL, PtrVT,
   11993                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
   11994   }
   11995 
   11996   return Result;
   11997 }
   11998 
   11999 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   12000   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   12001 
   12002   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   12003   // global base reg.
   12004   unsigned char OpFlag = 0;
   12005   unsigned WrapperKind = X86ISD::Wrapper;
   12006   CodeModel::Model M = DAG.getTarget().getCodeModel();
   12007 
   12008   if (Subtarget->isPICStyleRIPRel() &&
   12009       (M == CodeModel::Small || M == CodeModel::Kernel))
   12010     WrapperKind = X86ISD::WrapperRIP;
   12011   else if (Subtarget->isPICStyleGOT())
   12012     OpFlag = X86II::MO_GOTOFF;
   12013   else if (Subtarget->isPICStyleStubPIC())
   12014     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   12015 
   12016   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12017   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
   12018   SDLoc DL(JT);
   12019   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
   12020 
   12021   // With PIC, the address is actually $g + Offset.
   12022   if (OpFlag)
   12023     Result =
   12024         DAG.getNode(ISD::ADD, DL, PtrVT,
   12025                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
   12026 
   12027   return Result;
   12028 }
   12029 
   12030 SDValue
   12031 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   12032   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
   12033 
   12034   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   12035   // global base reg.
   12036   unsigned char OpFlag = 0;
   12037   unsigned WrapperKind = X86ISD::Wrapper;
   12038   CodeModel::Model M = DAG.getTarget().getCodeModel();
   12039 
   12040   if (Subtarget->isPICStyleRIPRel() &&
   12041       (M == CodeModel::Small || M == CodeModel::Kernel)) {
   12042     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
   12043       OpFlag = X86II::MO_GOTPCREL;
   12044     WrapperKind = X86ISD::WrapperRIP;
   12045   } else if (Subtarget->isPICStyleGOT()) {
   12046     OpFlag = X86II::MO_GOT;
   12047   } else if (Subtarget->isPICStyleStubPIC()) {
   12048     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
   12049   } else if (Subtarget->isPICStyleStubNoDynamic()) {
   12050     OpFlag = X86II::MO_DARWIN_NONLAZY;
   12051   }
   12052 
   12053   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12054   SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
   12055 
   12056   SDLoc DL(Op);
   12057   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
   12058 
   12059   // With PIC, the address is actually $g + Offset.
   12060   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
   12061       !Subtarget->is64Bit()) {
   12062     Result =
   12063         DAG.getNode(ISD::ADD, DL, PtrVT,
   12064                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
   12065   }
   12066 
   12067   // For symbols that require a load from a stub to get the address, emit the
   12068   // load.
   12069   if (isGlobalStubReference(OpFlag))
   12070     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
   12071                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
   12072                          false, false, false, 0);
   12073 
   12074   return Result;
   12075 }
   12076 
   12077 SDValue
   12078 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   12079   // Create the TargetBlockAddressAddress node.
   12080   unsigned char OpFlags =
   12081     Subtarget->ClassifyBlockAddressReference();
   12082   CodeModel::Model M = DAG.getTarget().getCodeModel();
   12083   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   12084   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   12085   SDLoc dl(Op);
   12086   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12087   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
   12088 
   12089   if (Subtarget->isPICStyleRIPRel() &&
   12090       (M == CodeModel::Small || M == CodeModel::Kernel))
   12091     Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
   12092   else
   12093     Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
   12094 
   12095   // With PIC, the address is actually $g + Offset.
   12096   if (isGlobalRelativeToPICBase(OpFlags)) {
   12097     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
   12098                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
   12099   }
   12100 
   12101   return Result;
   12102 }
   12103 
   12104 SDValue
   12105 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
   12106                                       int64_t Offset, SelectionDAG &DAG) const {
   12107   // Create the TargetGlobalAddress node, folding in the constant
   12108   // offset if it is legal.
   12109   unsigned char OpFlags =
   12110       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
   12111   CodeModel::Model M = DAG.getTarget().getCodeModel();
   12112   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12113   SDValue Result;
   12114   if (OpFlags == X86II::MO_NO_FLAG &&
   12115       X86::isOffsetSuitableForCodeModel(Offset, M)) {
   12116     // A direct static reference to a global.
   12117     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
   12118     Offset = 0;
   12119   } else {
   12120     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
   12121   }
   12122 
   12123   if (Subtarget->isPICStyleRIPRel() &&
   12124       (M == CodeModel::Small || M == CodeModel::Kernel))
   12125     Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
   12126   else
   12127     Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
   12128 
   12129   // With PIC, the address is actually $g + Offset.
   12130   if (isGlobalRelativeToPICBase(OpFlags)) {
   12131     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
   12132                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
   12133   }
   12134 
   12135   // For globals that require a load from a stub to get the address, emit the
   12136   // load.
   12137   if (isGlobalStubReference(OpFlags))
   12138     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
   12139                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
   12140                          false, false, false, 0);
   12141 
   12142   // If there was a non-zero offset that we didn't fold, create an explicit
   12143   // addition for it.
   12144   if (Offset != 0)
   12145     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
   12146                          DAG.getConstant(Offset, dl, PtrVT));
   12147 
   12148   return Result;
   12149 }
   12150 
   12151 SDValue
   12152 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   12153   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   12154   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
   12155   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
   12156 }
   12157 
   12158 static SDValue
   12159 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   12160            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
   12161            unsigned char OperandFlags, bool LocalDynamic = false) {
   12162   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   12163   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   12164   SDLoc dl(GA);
   12165   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   12166                                            GA->getValueType(0),
   12167                                            GA->getOffset(),
   12168                                            OperandFlags);
   12169 
   12170   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
   12171                                            : X86ISD::TLSADDR;
   12172 
   12173   if (InFlag) {
   12174     SDValue Ops[] = { Chain,  TGA, *InFlag };
   12175     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   12176   } else {
   12177     SDValue Ops[]  = { Chain, TGA };
   12178     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   12179   }
   12180 
    12181   // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
   12182   MFI->setAdjustsStack(true);
   12183   MFI->setHasCalls(true);
   12184 
   12185   SDValue Flag = Chain.getValue(1);
   12186   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
   12187 }
   12188 
   12189 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
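          // (roughly: load the PIC base into EBX, emit a TLSADDR node that is
          // selected to a leal x@TLSGD(,%ebx,1) followed by a call to
          // ___tls_get_addr@PLT, and read the result back from EAX).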
   12190 static SDValue
   12191 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   12192                                 const EVT PtrVT) {
   12193   SDValue InFlag;
   12194   SDLoc dl(GA);  // ? function entry point might be better
   12195   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   12196                                    DAG.getNode(X86ISD::GlobalBaseReg,
   12197                                                SDLoc(), PtrVT), InFlag);
   12198   InFlag = Chain.getValue(1);
   12199 
   12200   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
   12201 }
   12202 
   12203 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
   12204 static SDValue
   12205 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   12206                                 const EVT PtrVT) {
   12207   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
   12208                     X86::RAX, X86II::MO_TLSGD);
   12209 }
   12210 
   12211 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
   12212                                            SelectionDAG &DAG,
   12213                                            const EVT PtrVT,
   12214                                            bool is64Bit) {
   12215   SDLoc dl(GA);
   12216 
   12217   // Get the start address of the TLS block for this module.
   12218   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
   12219       .getInfo<X86MachineFunctionInfo>();
   12220   MFI->incNumLocalDynamicTLSAccesses();
   12221 
   12222   SDValue Base;
   12223   if (is64Bit) {
   12224     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
   12225                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
   12226   } else {
   12227     SDValue InFlag;
   12228     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   12229         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
   12230     InFlag = Chain.getValue(1);
   12231     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
   12232                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
   12233   }
   12234 
   12235   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
   12236   // of Base.
   12237 
   12238   // Build x@dtpoff.
   12239   unsigned char OperandFlags = X86II::MO_DTPOFF;
   12240   unsigned WrapperKind = X86ISD::Wrapper;
   12241   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   12242                                            GA->getValueType(0),
   12243                                            GA->getOffset(), OperandFlags);
   12244   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   12245 
   12246   // Add x@dtpoff with the base.
   12247   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
   12248 }
   12249 
   12250 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
   12251 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   12252                                    const EVT PtrVT, TLSModel::Model model,
   12253                                    bool is64Bit, bool isPIC) {
   12254   SDLoc dl(GA);
   12255 
   12256   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
   12257   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
   12258                                                          is64Bit ? 257 : 256));
   12259 
   12260   SDValue ThreadPointer =
   12261       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
   12262                   MachinePointerInfo(Ptr), false, false, false, 0);
   12263 
   12264   unsigned char OperandFlags = 0;
   12265   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
   12266   // initialexec.
   12267   unsigned WrapperKind = X86ISD::Wrapper;
   12268   if (model == TLSModel::LocalExec) {
   12269     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
   12270   } else if (model == TLSModel::InitialExec) {
   12271     if (is64Bit) {
   12272       OperandFlags = X86II::MO_GOTTPOFF;
   12273       WrapperKind = X86ISD::WrapperRIP;
   12274     } else {
   12275       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
   12276     }
   12277   } else {
   12278     llvm_unreachable("Unexpected model");
   12279   }
   12280 
   12281   // emit "addl x@ntpoff,%eax" (local exec)
   12282   // or "addl x@indntpoff,%eax" (initial exec)
    12283   // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
   12284   SDValue TGA =
   12285       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
   12286                                  GA->getOffset(), OperandFlags);
   12287   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   12288 
   12289   if (model == TLSModel::InitialExec) {
   12290     if (isPIC && !is64Bit) {
   12291       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
   12292                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
   12293                            Offset);
   12294     }
   12295 
   12296     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
   12297                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
   12298                          false, false, false, 0);
   12299   }
   12300 
   12301   // The address of the thread local variable is the add of the thread
   12302   // pointer with the offset of the variable.
   12303   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
   12304 }
   12305 
   12306 SDValue
   12307 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   12308 
   12309   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   12310 
   12311   // Cygwin uses emutls.
   12312   // FIXME: It may be EmulatedTLS-generic also for X86-Android.
   12313   if (Subtarget->isTargetWindowsCygwin())
   12314     return LowerToTLSEmulatedModel(GA, DAG);
   12315 
   12316   const GlobalValue *GV = GA->getGlobal();
   12317   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12318 
   12319   if (Subtarget->isTargetELF()) {
   12320     if (DAG.getTarget().Options.EmulatedTLS)
   12321       return LowerToTLSEmulatedModel(GA, DAG);
   12322     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
   12323     switch (model) {
   12324       case TLSModel::GeneralDynamic:
   12325         if (Subtarget->is64Bit())
   12326           return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
   12327         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
   12328       case TLSModel::LocalDynamic:
   12329         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
   12330                                            Subtarget->is64Bit());
   12331       case TLSModel::InitialExec:
   12332       case TLSModel::LocalExec:
   12333         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(),
   12334                                    DAG.getTarget().getRelocationModel() ==
   12335                                        Reloc::PIC_);
   12336     }
   12337     llvm_unreachable("Unknown TLS model.");
   12338   }
   12339 
   12340   if (Subtarget->isTargetDarwin()) {
   12341     // Darwin only has one model of TLS.  Lower to that.
   12342     unsigned char OpFlag = 0;
   12343     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
   12344                            X86ISD::WrapperRIP : X86ISD::Wrapper;
   12345 
   12346     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   12347     // global base reg.
   12348     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
   12349                  !Subtarget->is64Bit();
   12350     if (PIC32)
   12351       OpFlag = X86II::MO_TLVP_PIC_BASE;
   12352     else
   12353       OpFlag = X86II::MO_TLVP;
   12354     SDLoc DL(Op);
   12355     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
   12356                                                 GA->getValueType(0),
   12357                                                 GA->getOffset(), OpFlag);
   12358     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
   12359 
   12360     // With PIC32, the address is actually $g + Offset.
   12361     if (PIC32)
   12362       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
   12363                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
   12364                            Offset);
   12365 
    12366     // Lowering the machine ISD node will make sure everything ends up in the
    12367     // right location.
   12368     SDValue Chain = DAG.getEntryNode();
   12369     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   12370     SDValue Args[] = { Chain, Offset };
   12371     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
   12372 
    12373     // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
   12374     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   12375     MFI->setAdjustsStack(true);
   12376 
   12377     // And our return value (tls address) is in the standard call return value
   12378     // location.
   12379     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
   12380     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
   12381   }
   12382 
   12383   if (Subtarget->isTargetKnownWindowsMSVC() ||
   12384       Subtarget->isTargetWindowsGNU()) {
    12385     // Just use the implicit TLS architecture.
    12386     // We need to generate something similar to:
   12387     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
   12388     //                                  ; from TEB
    12389     //   mov     ecx, dword [rel _tls_index] ; Load index (from C runtime)
   12390     //   mov     rcx, qword [rdx+rcx*8]
   12391     //   mov     eax, .tls$:tlsvar
   12392     //   [rax+rcx] contains the address
   12393     // Windows 64bit: gs:0x58
   12394     // Windows 32bit: fs:__tls_array
   12395 
   12396     SDLoc dl(GA);
   12397     SDValue Chain = DAG.getEntryNode();
   12398 
   12399     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
   12400     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
   12401     // use its literal value of 0x2C.
   12402     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
   12403                                         ? Type::getInt8PtrTy(*DAG.getContext(),
   12404                                                              256)
   12405                                         : Type::getInt32PtrTy(*DAG.getContext(),
   12406                                                               257));
   12407 
   12408     SDValue TlsArray = Subtarget->is64Bit()
   12409                            ? DAG.getIntPtrConstant(0x58, dl)
   12410                            : (Subtarget->isTargetWindowsGNU()
   12411                                   ? DAG.getIntPtrConstant(0x2C, dl)
   12412                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
   12413 
   12414     SDValue ThreadPointer =
   12415         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
   12416                     false, false, 0);
   12417 
   12418     SDValue res;
   12419     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
   12420       res = ThreadPointer;
   12421     } else {
   12422       // Load the _tls_index variable
   12423       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
   12424       if (Subtarget->is64Bit())
   12425         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
   12426                              MachinePointerInfo(), MVT::i32, false, false,
   12427                              false, 0);
   12428       else
   12429         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
   12430                           false, false, 0);
   12431 
   12432       auto &DL = DAG.getDataLayout();
   12433       SDValue Scale =
   12434           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
   12435       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
   12436 
   12437       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
   12438     }
   12439 
   12440     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
   12441                       false, 0);
   12442 
    12443     // Get the offset of the start of the .tls section.
   12444     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   12445                                              GA->getValueType(0),
   12446                                              GA->getOffset(), X86II::MO_SECREL);
   12447     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
   12448 
   12449     // The address of the thread local variable is the add of the thread
   12450     // pointer with the offset of the variable.
   12451     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
   12452   }
   12453 
   12454   llvm_unreachable("TLS not implemented for this target.");
   12455 }
   12456 
   12457 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
   12458 /// and take a 2 x i32 value to shift plus a shift amount.
   12459 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   12460   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   12461   MVT VT = Op.getSimpleValueType();
   12462   unsigned VTBits = VT.getSizeInBits();
   12463   SDLoc dl(Op);
   12464   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   12465   SDValue ShOpLo = Op.getOperand(0);
   12466   SDValue ShOpHi = Op.getOperand(1);
   12467   SDValue ShAmt  = Op.getOperand(2);
    12468   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
    12469   // generic ISD nodes don't. Insert an AND to be safe; it's optimized away
    12470   // during isel.
   12471   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   12472                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
   12473   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
   12474                                      DAG.getConstant(VTBits - 1, dl, MVT::i8))
   12475                        : DAG.getConstant(0, dl, VT);
   12476 
   12477   SDValue Tmp2, Tmp3;
   12478   if (Op.getOpcode() == ISD::SHL_PARTS) {
   12479     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
   12480     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
   12481   } else {
   12482     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
   12483     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
   12484   }
   12485 
    12486   // If the shift amount is larger than or equal to the width of a part, we
    12487   // can't rely on the results of shld/shrd. Insert a test and select the
    12488   // appropriate values for large shift amounts.
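            // The test below checks (ShAmt & VTBits) != 0: for shift amounts below
            // 2 * VTBits, that bit is set exactly when the amount is at least VTBits,
            // in which case the shld/shrd result is overridden via the CMOVs.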
   12489   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   12490                                 DAG.getConstant(VTBits, dl, MVT::i8));
   12491   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   12492                              AndNode, DAG.getConstant(0, dl, MVT::i8));
   12493 
   12494   SDValue Hi, Lo;
   12495   SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
   12496   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
   12497   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
   12498 
   12499   if (Op.getOpcode() == ISD::SHL_PARTS) {
   12500     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
   12501     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   12502   } else {
   12503     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
   12504     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   12505   }
   12506 
   12507   SDValue Ops[2] = { Lo, Hi };
   12508   return DAG.getMergeValues(Ops, dl);
   12509 }
   12510 
   12511 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   12512                                            SelectionDAG &DAG) const {
   12513   SDValue Src = Op.getOperand(0);
   12514   MVT SrcVT = Src.getSimpleValueType();
   12515   MVT VT = Op.getSimpleValueType();
   12516   SDLoc dl(Op);
   12517 
   12518   if (SrcVT.isVector()) {
   12519     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
   12520       return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
   12521                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
   12522                          DAG.getUNDEF(SrcVT)));
   12523     }
   12524     if (SrcVT.getVectorElementType() == MVT::i1) {
   12525       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
   12526       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
   12527                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
   12528     }
   12529     return SDValue();
   12530   }
   12531 
   12532   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
   12533          "Unknown SINT_TO_FP to lower!");
   12534 
   12535   // These are really Legal; return the operand so the caller accepts it as
   12536   // Legal.
   12537   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
   12538     return Op;
   12539   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
   12540       Subtarget->is64Bit()) {
   12541     return Op;
   12542   }
   12543 
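            // Otherwise, spill the integer to a stack slot and convert it with an x87
            // FILD, which loads a signed integer from memory (see BuildFILD below).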
   12544   unsigned Size = SrcVT.getSizeInBits()/8;
   12545   MachineFunction &MF = DAG.getMachineFunction();
   12546   auto PtrVT = getPointerTy(MF.getDataLayout());
   12547   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
   12548   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   12549   SDValue Chain = DAG.getStore(
   12550       DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot,
   12551       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
   12552       false, 0);
   12553   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
   12554 }
   12555 
   12556 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
   12557                                      SDValue StackSlot,
   12558                                      SelectionDAG &DAG) const {
   12559   // Build the FILD
   12560   SDLoc DL(Op);
   12561   SDVTList Tys;
   12562   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
   12563   if (useSSE)
   12564     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
   12565   else
   12566     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
   12567 
   12568   unsigned ByteSize = SrcVT.getSizeInBits()/8;
   12569 
   12570   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
   12571   MachineMemOperand *MMO;
   12572   if (FI) {
   12573     int SSFI = FI->getIndex();
   12574     MMO = DAG.getMachineFunction().getMachineMemOperand(
   12575         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   12576         MachineMemOperand::MOLoad, ByteSize, ByteSize);
   12577   } else {
   12578     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
   12579     StackSlot = StackSlot.getOperand(1);
   12580   }
   12581   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   12582   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
   12583                                            X86ISD::FILD, DL,
   12584                                            Tys, Ops, SrcVT, MMO);
   12585 
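            // If the result is expected in an SSE register, the x87 FILD result is
            // stored back to memory (FST) and reloaded into an XMM register below.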
   12586   if (useSSE) {
   12587     Chain = Result.getValue(1);
   12588     SDValue InFlag = Result.getValue(2);
   12589 
   12590     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
   12591     // shouldn't be necessary except that RFP cannot be live across
   12592     // multiple blocks. When stackifier is fixed, they can be uncoupled.
   12593     MachineFunction &MF = DAG.getMachineFunction();
   12594     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
   12595     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
   12596     auto PtrVT = getPointerTy(MF.getDataLayout());
   12597     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   12598     Tys = DAG.getVTList(MVT::Other);
   12599     SDValue Ops[] = {
   12600       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
   12601     };
   12602     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
   12603         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   12604         MachineMemOperand::MOStore, SSFISize, SSFISize);
   12605 
   12606     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
   12607                                     Ops, Op.getValueType(), MMO);
   12608     Result = DAG.getLoad(
   12609         Op.getValueType(), DL, Chain, StackSlot,
   12610         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   12611         false, false, false, 0);
   12612   }
   12613 
   12614   return Result;
   12615 }
   12616 
   12617 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
   12618 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
   12619                                                SelectionDAG &DAG) const {
    12620   // This algorithm is not obvious. Here is what we're trying to output:
   12621   /*
   12622      movq       %rax,  %xmm0
   12623      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
   12624      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
   12625      #ifdef __SSE3__
   12626        haddpd   %xmm0, %xmm0
   12627      #else
   12628        pshufd   $0x4e, %xmm0, %xmm1
   12629        addpd    %xmm1, %xmm0
   12630      #endif
   12631   */
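            // Why the constants work: punpckldq pairs the two 32-bit halves of the
            // input with the exponent words 0x43300000 (2^52) and 0x45300000 (2^84),
            // giving the doubles (2^52 + lo32) and (2^84 + hi32 * 2^32).  Subtracting
            // c1 = { 2^52, 2^84 } leaves { lo32, hi32 * 2^32 }, and the horizontal
            // add recombines them into the full unsigned 64-bit value.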
   12632 
   12633   SDLoc dl(Op);
   12634   LLVMContext *Context = DAG.getContext();
   12635 
   12636   // Build some magic constants.
   12637   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
   12638   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   12639   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12640   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
   12641 
   12642   SmallVector<Constant*,2> CV1;
   12643   CV1.push_back(
   12644     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   12645                                       APInt(64, 0x4330000000000000ULL))));
   12646   CV1.push_back(
   12647     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   12648                                       APInt(64, 0x4530000000000000ULL))));
   12649   Constant *C1 = ConstantVector::get(CV1);
   12650   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
   12651 
   12652   // Load the 64-bit value into an XMM register.
   12653   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   12654                             Op.getOperand(0));
   12655   SDValue CLod0 =
   12656       DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
   12657                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   12658                   false, false, false, 16);
   12659   SDValue Unpck1 =
   12660       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
   12661 
   12662   SDValue CLod1 =
   12663       DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
   12664                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   12665                   false, false, false, 16);
   12666   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
   12667   // TODO: Are there any fast-math-flags to propagate here?
   12668   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   12669   SDValue Result;
   12670 
   12671   if (Subtarget->hasSSE3()) {
   12672     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
   12673     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   12674   } else {
   12675     SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
   12676     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
   12677                                            S2F, 0x4E, DAG);
   12678     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
   12679                          DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
   12680   }
   12681 
   12682   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
   12683                      DAG.getIntPtrConstant(0, dl));
   12684 }
   12685 
   12686 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
   12687 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
   12688                                                SelectionDAG &DAG) const {
   12689   SDLoc dl(Op);
   12690   // FP constant to bias correct the final result.
   12691   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
   12692                                    MVT::f64);
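            // The bias 0x4330000000000000 is 2^52.  OR-ing the 32-bit input into the
            // low mantissa bits of that double yields exactly 2^52 + x, so
            // subtracting the bias afterwards recovers x without rounding error.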
   12693 
   12694   // Load the 32-bit value into an XMM register.
   12695   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
   12696                              Op.getOperand(0));
   12697 
   12698   // Zero out the upper parts of the register.
   12699   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
   12700 
   12701   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   12702                      DAG.getBitcast(MVT::v2f64, Load),
   12703                      DAG.getIntPtrConstant(0, dl));
   12704 
   12705   // Or the load with the bias.
   12706   SDValue Or = DAG.getNode(
   12707       ISD::OR, dl, MVT::v2i64,
   12708       DAG.getBitcast(MVT::v2i64,
   12709                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
   12710       DAG.getBitcast(MVT::v2i64,
   12711                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
   12712   Or =
   12713       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   12714                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
   12715 
   12716   // Subtract the bias.
   12717   // TODO: Are there any fast-math-flags to propagate here?
   12718   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
   12719 
   12720   // Handle final rounding.
   12721   MVT DestVT = Op.getSimpleValueType();
   12722 
   12723   if (DestVT.bitsLT(MVT::f64))
   12724     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
   12725                        DAG.getIntPtrConstant(0, dl));
   12726   if (DestVT.bitsGT(MVT::f64))
   12727     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
   12728 
   12729   // Handle final rounding.
   12730   return Sub;
   12731 }
   12732 
   12733 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
   12734                                      const X86Subtarget &Subtarget) {
   12735   // The algorithm is the following:
   12736   // #ifdef __SSE4_1__
   12737   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
   12738   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
   12739   //                                 (uint4) 0x53000000, 0xaa);
   12740   // #else
   12741   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
   12742   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
   12743   // #endif
   12744   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   12745   //     return (float4) lo + fhi;
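            // Why this works: 0x4b000000 is 2^23 and 0x53000000 is 2^39 as floats,
            // so lo == 2^23 + (v & 0xffff) and hi == 2^39 + (v >> 16) * 2^16.  After
            // subtracting (2^39 + 2^23) from hi, the final add yields v, correctly
            // rounded to float precision.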
   12746 
   12747   // We shouldn't use it when unsafe-fp-math is enabled though: we might later
   12748   // reassociate the two FADDs, and if we do that, the algorithm fails
   12749   // spectacularly (PR24512).
   12750   // FIXME: If we ever have some kind of Machine FMF, this should be marked
   12751   // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
    12752   // there are also MachineCombiner reassociations happening on Machine IR.
   12753   if (DAG.getTarget().Options.UnsafeFPMath)
   12754     return SDValue();
   12755 
   12756   SDLoc DL(Op);
   12757   SDValue V = Op->getOperand(0);
   12758   MVT VecIntVT = V.getSimpleValueType();
   12759   bool Is128 = VecIntVT == MVT::v4i32;
   12760   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
    12761   // If we convert to something other than the supported type, e.g., to v4f64,
   12762   // abort early.
   12763   if (VecFloatVT != Op->getSimpleValueType(0))
   12764     return SDValue();
   12765 
   12766   unsigned NumElts = VecIntVT.getVectorNumElements();
   12767   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
   12768          "Unsupported custom type");
   12769   assert(NumElts <= 8 && "The size of the constant array must be fixed");
   12770 
    12771   // In the #ifdef/#else code, we have in common:
   12772   // - The vector of constants:
   12773   // -- 0x4b000000
   12774   // -- 0x53000000
   12775   // - A shift:
   12776   // -- v >> 16
   12777 
   12778   // Create the splat vector for 0x4b000000.
   12779   SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32);
   12780   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
   12781                            CstLow, CstLow, CstLow, CstLow};
   12782   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
   12783                                   makeArrayRef(&CstLowArray[0], NumElts));
   12784   // Create the splat vector for 0x53000000.
   12785   SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32);
   12786   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
   12787                             CstHigh, CstHigh, CstHigh, CstHigh};
   12788   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
   12789                                    makeArrayRef(&CstHighArray[0], NumElts));
   12790 
   12791   // Create the right shift.
   12792   SDValue CstShift = DAG.getConstant(16, DL, MVT::i32);
   12793   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
   12794                              CstShift, CstShift, CstShift, CstShift};
   12795   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
   12796                                     makeArrayRef(&CstShiftArray[0], NumElts));
   12797   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
   12798 
   12799   SDValue Low, High;
   12800   if (Subtarget.hasSSE41()) {
   12801     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
   12802     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
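              // Blend mask 0xaa (0b10101010) selects the odd 16-bit elements (the
              // high half of each i32) from the constant and the even ones from v.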
   12803     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
   12804     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
   12805     // Low will be bitcasted right away, so do not bother bitcasting back to its
   12806     // original type.
   12807     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
   12808                       VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
   12809     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
   12810     //                                 (uint4) 0x53000000, 0xaa);
   12811     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
   12812     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
   12813     // High will be bitcasted right away, so do not bother bitcasting back to
   12814     // its original type.
   12815     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
   12816                        VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
   12817   } else {
   12818     SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32);
   12819     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
   12820                                      CstMask, CstMask, CstMask);
   12821     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
   12822     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
   12823     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
   12824 
   12825     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
   12826     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
   12827   }
   12828 
   12829   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
   12830   SDValue CstFAdd = DAG.getConstantFP(
   12831       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32);
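            // 0xD3000080 is -(2^39 + 2^23) in IEEE single precision: sign bit set,
            // biased exponent 166 (2^39), mantissa 0x80 (which contributes 2^23).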
   12832   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
   12833                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
   12834   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
   12835                                    makeArrayRef(&CstFAddArray[0], NumElts));
   12836 
   12837   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   12838   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
   12839   // TODO: Are there any fast-math-flags to propagate here?
   12840   SDValue FHigh =
   12841       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
   12842   //     return (float4) lo + fhi;
   12843   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
   12844   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
   12845 }
   12846 
   12847 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
   12848                                                SelectionDAG &DAG) const {
   12849   SDValue N0 = Op.getOperand(0);
   12850   MVT SVT = N0.getSimpleValueType();
   12851   SDLoc dl(Op);
   12852 
   12853   switch (SVT.SimpleTy) {
   12854   default:
   12855     llvm_unreachable("Custom UINT_TO_FP is not supported!");
   12856   case MVT::v4i8:
   12857   case MVT::v4i16:
   12858   case MVT::v8i8:
   12859   case MVT::v8i16: {
   12860     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
   12861     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
   12862                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
   12863   }
   12864   case MVT::v4i32:
   12865   case MVT::v8i32:
   12866     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
   12867   case MVT::v16i8:
   12868   case MVT::v16i16:
   12869     assert(Subtarget->hasAVX512());
   12870     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
   12871                        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
   12872   }
   12873 }
   12874 
   12875 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   12876                                            SelectionDAG &DAG) const {
   12877   SDValue N0 = Op.getOperand(0);
   12878   SDLoc dl(Op);
   12879   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12880 
   12881   if (Op.getSimpleValueType().isVector())
   12882     return lowerUINT_TO_FP_vec(Op, DAG);
   12883 
   12884   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
   12885   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
   12886   // the optimization here.
   12887   if (DAG.SignBitIsZero(N0))
   12888     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
   12889 
   12890   MVT SrcVT = N0.getSimpleValueType();
   12891   MVT DstVT = Op.getSimpleValueType();
   12892 
   12893   if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
   12894       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) {
   12895     // Conversions from unsigned i32 to f32/f64 are legal,
   12896     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
   12897     return Op;
   12898   }
   12899 
   12900   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
   12901     return LowerUINT_TO_FP_i64(Op, DAG);
   12902   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
   12903     return LowerUINT_TO_FP_i32(Op, DAG);
   12904   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
   12905     return SDValue();
   12906 
   12907   // Make a 64-bit buffer, and use it to build an FILD.
   12908   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
   12909   if (SrcVT == MVT::i32) {
   12910     SDValue WordOff = DAG.getConstant(4, dl, PtrVT);
   12911     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff);
   12912     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   12913                                   StackSlot, MachinePointerInfo(),
   12914                                   false, false, 0);
   12915     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
   12916                                   OffsetSlot, MachinePointerInfo(),
   12917                                   false, false, 0);
   12918     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
   12919     return Fild;
   12920   }
   12921 
   12922   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
   12923   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   12924                                StackSlot, MachinePointerInfo(),
   12925                                false, false, 0);
   12926   // For i64 source, we need to add the appropriate power of 2 if the input
   12927   // was negative.  This is the same as the optimization in
    12928   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
   12929   // we must be careful to do the computation in x87 extended precision, not
   12930   // in SSE. (The generic code can't know it's OK to do this, or how to.)
   12931   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
   12932   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
   12933       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   12934       MachineMemOperand::MOLoad, 8, 8);
   12935 
   12936   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
   12937   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
   12938   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
   12939                                          MVT::i64, MMO);
   12940 
   12941   APInt FF(32, 0x5F800000ULL);
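            // 0x5F800000 is 2^64 as an IEEE single.  FILD treated the stored bits as
            // a signed i64, so if the original value had its sign bit set we add
            // 2^64 to correct the result; otherwise we add the 0.0 half of the pair.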
   12942 
   12943   // Check whether the sign bit is set.
   12944   SDValue SignSet = DAG.getSetCC(
   12945       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
   12946       Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
   12947 
    12948   // Build a 64-bit pair (0, FF) in the constant pool, with FF in the lo bits.
   12949   SDValue FudgePtr = DAG.getConstantPool(
   12950       ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
   12951 
   12952   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
   12953   SDValue Zero = DAG.getIntPtrConstant(0, dl);
   12954   SDValue Four = DAG.getIntPtrConstant(4, dl);
   12955   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
   12956                                Zero, Four);
   12957   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
   12958 
   12959   // Load the value out, extending it from f32 to f80.
   12960   // FIXME: Avoid the extend by constructing the right constant pool?
   12961   SDValue Fudge = DAG.getExtLoad(
   12962       ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
   12963       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
   12964       false, false, false, 4);
   12965   // Extend everything to 80 bits to force it to be done on x87.
   12966   // TODO: Are there any fast-math-flags to propagate here?
   12967   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
   12968   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
   12969                      DAG.getIntPtrConstant(0, dl));
   12970 }
   12971 
   12972 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
   12973 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
   12974 // just return an <SDValue(), SDValue()> pair.
   12975 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
   12976 // to i16, i32 or i64, and we lower it to a legal sequence.
   12977 // If lowered to the final integer result we return a <result, SDValue()> pair.
   12978 // Otherwise we lower it to a sequence ending with a FIST, return a
   12979 // <FIST, StackSlot> pair, and the caller is responsible for loading
   12980 // the final integer result from StackSlot.
   12981 std::pair<SDValue,SDValue>
   12982 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   12983                                    bool IsSigned, bool IsReplace) const {
   12984   SDLoc DL(Op);
   12985 
   12986   EVT DstTy = Op.getValueType();
   12987   EVT TheVT = Op.getOperand(0).getValueType();
   12988   auto PtrVT = getPointerTy(DAG.getDataLayout());
   12989 
   12990   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
   12991     // f16 must be promoted before using the lowering in this routine.
   12992     // fp128 does not use this lowering.
   12993     return std::make_pair(SDValue(), SDValue());
   12994   }
   12995 
   12996   // If using FIST to compute an unsigned i64, we'll need some fixup
   12997   // to handle values above the maximum signed i64.  A FIST is always
    12998   // used on the 32-bit subtarget, and also for f80 on a 64-bit target.
   12999   bool UnsignedFixup = !IsSigned &&
   13000                        DstTy == MVT::i64 &&
   13001                        (!Subtarget->is64Bit() ||
   13002                         !isScalarFPTypeInSSEReg(TheVT));
   13003 
   13004   if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) {
   13005     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    13006     // The low 32 bits of the FIST result will hold the correct uint32 result.
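              // Any uint32 fits in the non-negative range of an int64, so the signed
              // 64-bit conversion is exact and its low 32 bits are the desired value.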
   13007     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
   13008     DstTy = MVT::i64;
   13009   }
   13010 
   13011   assert(DstTy.getSimpleVT() <= MVT::i64 &&
   13012          DstTy.getSimpleVT() >= MVT::i16 &&
   13013          "Unknown FP_TO_INT to lower!");
   13014 
   13015   // These are really Legal.
   13016   if (DstTy == MVT::i32 &&
   13017       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   13018     return std::make_pair(SDValue(), SDValue());
   13019   if (Subtarget->is64Bit() &&
   13020       DstTy == MVT::i64 &&
   13021       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   13022     return std::make_pair(SDValue(), SDValue());
   13023 
   13024   // We lower FP->int64 into FISTP64 followed by a load from a temporary
   13025   // stack slot.
   13026   MachineFunction &MF = DAG.getMachineFunction();
   13027   unsigned MemSize = DstTy.getSizeInBits()/8;
   13028   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   13029   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   13030 
   13031   unsigned Opc;
   13032   switch (DstTy.getSimpleVT().SimpleTy) {
   13033   default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
   13034   case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
   13035   case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
   13036   case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
   13037   }
   13038 
   13039   SDValue Chain = DAG.getEntryNode();
   13040   SDValue Value = Op.getOperand(0);
   13041   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
   13042 
   13043   if (UnsignedFixup) {
   13044     //
   13045     // Conversion to unsigned i64 is implemented with a select,
   13046     // depending on whether the source value fits in the range
   13047     // of a signed i64.  Let Thresh be the FP equivalent of
   13048     // 0x8000000000000000ULL.
   13049     //
   13050     //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
   13051     //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
   13052     //  Fist-to-mem64 FistSrc
   13053     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
   13054     //  to XOR'ing the high 32 bits with Adjust.
   13055     //
   13056     // Being a power of 2, Thresh is exactly representable in all FP formats.
   13057     // For X87 we'd like to use the smallest FP type for this constant, but
   13058     // for DAG type consistency we have to match the FP operand type.
   13059 
   13060     APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
   13061     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
   13062     bool LosesInfo = false;
   13063     if (TheVT == MVT::f64)
   13064       // The rounding mode is irrelevant as the conversion should be exact.
   13065       Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
   13066                               &LosesInfo);
   13067     else if (TheVT == MVT::f80)
   13068       Status = Thresh.convert(APFloat::x87DoubleExtended,
   13069                               APFloat::rmNearestTiesToEven, &LosesInfo);
   13070 
   13071     assert(Status == APFloat::opOK && !LosesInfo &&
   13072            "FP conversion should have been exact");
   13073 
   13074     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
   13075 
   13076     SDValue Cmp = DAG.getSetCC(DL,
   13077                                getSetCCResultType(DAG.getDataLayout(),
   13078                                                   *DAG.getContext(), TheVT),
   13079                                Value, ThreshVal, ISD::SETLT);
   13080     Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
   13081                            DAG.getConstant(0, DL, MVT::i32),
   13082                            DAG.getConstant(0x80000000, DL, MVT::i32));
   13083     SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
   13084     Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
   13085                                               *DAG.getContext(), TheVT),
   13086                        Value, ThreshVal, ISD::SETLT);
   13087     Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
   13088   }
   13089 
    13090   // FIXME: This causes a redundant load/store if the SSE-class value is already
    13091   // in memory, such as when it is on the call stack.
   13092   if (isScalarFPTypeInSSEReg(TheVT)) {
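              // FIST converts the value on top of the x87 stack, so an SSE value is
              // first spilled to a stack slot and reloaded onto the x87 stack (FLD).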
   13093     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
   13094     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
   13095                          MachinePointerInfo::getFixedStack(MF, SSFI), false,
   13096                          false, 0);
   13097     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
   13098     SDValue Ops[] = {
   13099       Chain, StackSlot, DAG.getValueType(TheVT)
   13100     };
   13101 
   13102     MachineMemOperand *MMO =
   13103         MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
   13104                                 MachineMemOperand::MOLoad, MemSize, MemSize);
   13105     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
   13106     Chain = Value.getValue(1);
   13107     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   13108     StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   13109   }
   13110 
   13111   MachineMemOperand *MMO =
   13112       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
   13113                               MachineMemOperand::MOStore, MemSize, MemSize);
   13114 
   13115   if (UnsignedFixup) {
   13116 
   13117     // Insert the FIST, load its result as two i32's,
   13118     // and XOR the high i32 with Adjust.
   13119 
   13120     SDValue FistOps[] = { Chain, Value, StackSlot };
   13121     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   13122                                            FistOps, DstTy, MMO);
   13123 
   13124     SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
   13125                                 MachinePointerInfo(),
   13126                                 false, false, false, 0);
   13127     SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot,
   13128                                    DAG.getConstant(4, DL, PtrVT));
   13129 
   13130     SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
   13131                                  MachinePointerInfo(),
   13132                                  false, false, false, 0);
   13133     High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
   13134 
   13135     if (Subtarget->is64Bit()) {
   13136       // Join High32 and Low32 into a 64-bit result.
   13137       // (High32 << 32) | Low32
   13138       Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
   13139       High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
   13140       High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
   13141                            DAG.getConstant(32, DL, MVT::i8));
   13142       SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
   13143       return std::make_pair(Result, SDValue());
   13144     }
   13145 
   13146     SDValue ResultOps[] = { Low32, High32 };
   13147 
   13148     SDValue pair = IsReplace
   13149       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
   13150       : DAG.getMergeValues(ResultOps, DL);
   13151     return std::make_pair(pair, SDValue());
   13152   } else {
   13153     // Build the FP_TO_INT*_IN_MEM
   13154     SDValue Ops[] = { Chain, Value, StackSlot };
   13155     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   13156                                            Ops, DstTy, MMO);
   13157     return std::make_pair(FIST, StackSlot);
   13158   }
   13159 }
   13160 
   13161 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   13162                               const X86Subtarget *Subtarget) {
   13163   MVT VT = Op->getSimpleValueType(0);
   13164   SDValue In = Op->getOperand(0);
   13165   MVT InVT = In.getSimpleValueType();
   13166   SDLoc dl(Op);
   13167 
   13168   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
   13169     return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
   13170 
   13171   // Optimize vectors in AVX mode:
   13172   //
   13173   //   v8i16 -> v8i32
   13174   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
   13175   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
   13176   //   Concat upper and lower parts.
   13177   //
   13178   //   v4i32 -> v4i64
   13179   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
   13180   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
   13181   //   Concat upper and lower parts.
   13182   //
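            // Unpacking against a zero vector interleaves each element with a zero
            // element, which is exactly a zero-extension of that half; for ANY_EXTEND
            // an undef high half is sufficient.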
   13183 
   13184   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
   13185       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
   13186       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
   13187     return SDValue();
   13188 
   13189   if (Subtarget->hasInt256())
   13190     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
   13191 
   13192   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
   13193   SDValue Undef = DAG.getUNDEF(InVT);
   13194   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
   13195   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   13196   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   13197 
   13198   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
   13199                              VT.getVectorNumElements()/2);
   13200 
   13201   OpLo = DAG.getBitcast(HVT, OpLo);
   13202   OpHi = DAG.getBitcast(HVT, OpHi);
   13203 
   13204   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   13205 }
   13206 
    13207 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
   13208                   const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   13209   MVT VT = Op->getSimpleValueType(0);
   13210   SDValue In = Op->getOperand(0);
   13211   MVT InVT = In.getSimpleValueType();
   13212   SDLoc DL(Op);
   13213   unsigned int NumElts = VT.getVectorNumElements();
   13214   if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
   13215     return SDValue();
   13216 
   13217   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
   13218     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
   13219 
   13220   assert(InVT.getVectorElementType() == MVT::i1);
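            // Zero-extend a mask vector by selecting 1 or 0 per lane into a wide
            // integer vector, then truncate it to the requested element width if
            // necessary.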
   13221   MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
   13222   SDValue One =
   13223    DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
   13224   SDValue Zero =
   13225    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
   13226 
   13227   SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
   13228   if (VT.is512BitVector())
   13229     return V;
   13230   return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
   13231 }
   13232 
   13233 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   13234                                SelectionDAG &DAG) {
   13235   if (Subtarget->hasFp256())
   13236     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
   13237       return Res;
   13238 
   13239   return SDValue();
   13240 }
   13241 
   13242 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   13243                                 SelectionDAG &DAG) {
   13244   SDLoc DL(Op);
   13245   MVT VT = Op.getSimpleValueType();
   13246   SDValue In = Op.getOperand(0);
   13247   MVT SVT = In.getSimpleValueType();
   13248 
   13249   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
   13250     return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
   13251 
   13252   if (Subtarget->hasFp256())
   13253     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
   13254       return Res;
   13255 
   13256   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
   13257          VT.getVectorNumElements() != SVT.getVectorNumElements());
   13258   return SDValue();
   13259 }
   13260 
   13261 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   13262   SDLoc DL(Op);
   13263   MVT VT = Op.getSimpleValueType();
   13264   SDValue In = Op.getOperand(0);
   13265   MVT InVT = In.getSimpleValueType();
   13266 
   13267   if (VT == MVT::i1) {
   13268     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
   13269            "Invalid scalar TRUNCATE operation");
   13270     if (InVT.getSizeInBits() >= 32)
   13271       return SDValue();
   13272     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
   13273     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
   13274   }
   13275   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
   13276          "Invalid TRUNCATE operation");
   13277 
    13278   // Move vector to mask - truncate solution for SKX.
   13279   if (VT.getVectorElementType() == MVT::i1) {
   13280     if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
   13281         Subtarget->hasBWI())
   13282       return Op; // legal, will go to VPMOVB2M, VPMOVW2M
   13283     if ((InVT.is256BitVector() || InVT.is128BitVector())
   13284         && InVT.getScalarSizeInBits() <= 16 &&
   13285         Subtarget->hasBWI() && Subtarget->hasVLX())
   13286       return Op; // legal, will go to VPMOVB2M, VPMOVW2M
   13287     if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
   13288         Subtarget->hasDQI())
   13289       return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
   13290     if ((InVT.is256BitVector() || InVT.is128BitVector())
   13291         && InVT.getScalarSizeInBits() >= 32 &&
   13292         Subtarget->hasDQI() && Subtarget->hasVLX())
    13293       return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
    13294   }
   13295 
   13296   if (VT.getVectorElementType() == MVT::i1) {
   13297     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
   13298     unsigned NumElts = InVT.getVectorNumElements();
   13299     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
   13300     if (InVT.getSizeInBits() < 512) {
   13301       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
   13302       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
   13303       InVT = ExtVT;
   13304     }
   13305 
   13306     SDValue OneV =
   13307      DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT);
   13308     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
   13309     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
   13310   }
   13311 
   13312   // vpmovqb/w/d, vpmovdb/w, vpmovwb
   13313   if (Subtarget->hasAVX512()) {
   13314     // word to byte only under BWI
   13315     if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8
   13316       return DAG.getNode(X86ISD::VTRUNC, DL, VT,
   13317                          DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
   13318     return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
   13319   }
   13320   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
   13321     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
   13322     if (Subtarget->hasInt256()) {
   13323       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
   13324       In = DAG.getBitcast(MVT::v8i32, In);
   13325       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
   13326                                 ShufMask);
   13327       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
   13328                          DAG.getIntPtrConstant(0, DL));
   13329     }
   13330 
   13331     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   13332                                DAG.getIntPtrConstant(0, DL));
   13333     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   13334                                DAG.getIntPtrConstant(2, DL));
   13335     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
   13336     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
   13337     static const int ShufMask[] = {0, 2, 4, 6};
   13338     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
   13339   }
   13340 
   13341   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    13342     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
   13343     if (Subtarget->hasInt256()) {
   13344       In = DAG.getBitcast(MVT::v32i8, In);
   13345 
   13346       SmallVector<SDValue,32> pshufbMask;
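                // Within each 128-bit lane, keep bytes 0,1 4,5 8,9 12,13 (the low 16
                // bits of each i32) and zero the rest; a mask byte of 0x80 has its
                // top bit set, which makes PSHUFB write a zero byte.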
   13347       for (unsigned i = 0; i < 2; ++i) {
   13348         pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
   13349         pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
   13350         pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
   13351         pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
   13352         pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
   13353         pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
   13354         pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
   13355         pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
   13356         for (unsigned j = 0; j < 8; ++j)
   13357           pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
   13358       }
   13359       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
   13360       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
   13361       In = DAG.getBitcast(MVT::v4i64, In);
   13362 
   13363       static const int ShufMask[] = {0,  2,  -1,  -1};
   13364       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
   13365                                 &ShufMask[0]);
   13366       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   13367                        DAG.getIntPtrConstant(0, DL));
   13368       return DAG.getBitcast(VT, In);
   13369     }
   13370 
   13371     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   13372                                DAG.getIntPtrConstant(0, DL));
   13373 
   13374     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   13375                                DAG.getIntPtrConstant(4, DL));
   13376 
   13377     OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
   13378     OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
   13379 
   13380     // The PSHUFB mask:
   13381     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
   13382                                    -1, -1, -1, -1, -1, -1, -1, -1};
   13383 
   13384     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
   13385     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
   13386     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
   13387 
   13388     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
   13389     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
   13390 
   13391     // The MOVLHPS Mask:
   13392     static const int ShufMask2[] = {0, 1, 4, 5};
   13393     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
   13394     return DAG.getBitcast(MVT::v8i16, res);
   13395   }
   13396 
   13397   // Handle truncation of V256 to V128 using shuffles.
   13398   if (!VT.is128BitVector() || !InVT.is256BitVector())
   13399     return SDValue();
   13400 
   13401   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
   13402 
   13403   unsigned NumElems = VT.getVectorNumElements();
   13404   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
   13405 
   13406   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
   13407   // Prepare truncation shuffle mask
   13408   for (unsigned i = 0; i != NumElems; ++i)
   13409     MaskVec[i] = i * 2;
   13410   SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
   13411                                    DAG.getUNDEF(NVT), &MaskVec[0]);
   13412   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
   13413                      DAG.getIntPtrConstant(0, DL));
   13414 }
   13415 
   13416 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
   13417                                            SelectionDAG &DAG) const {
   13418   assert(!Op.getSimpleValueType().isVector());
   13419 
   13420   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   13421     /*IsSigned=*/ true, /*IsReplace=*/ false);
   13422   SDValue FIST = Vals.first, StackSlot = Vals.second;
   13423   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
   13424   if (!FIST.getNode())
   13425     return Op;
   13426 
   13427   if (StackSlot.getNode())
   13428     // Load the result.
   13429     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
   13430                        FIST, StackSlot, MachinePointerInfo(),
   13431                        false, false, false, 0);
   13432 
   13433   // The node is the result.
   13434   return FIST;
   13435 }
   13436 
   13437 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
   13438                                            SelectionDAG &DAG) const {
   13439   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   13440     /*IsSigned=*/ false, /*IsReplace=*/ false);
   13441   SDValue FIST = Vals.first, StackSlot = Vals.second;
   13442   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
   13443   if (!FIST.getNode())
   13444     return Op;
   13445 
   13446   if (StackSlot.getNode())
   13447     // Load the result.
   13448     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
   13449                        FIST, StackSlot, MachinePointerInfo(),
   13450                        false, false, false, 0);
   13451 
   13452   // The node is the result.
   13453   return FIST;
   13454 }
   13455 
   13456 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
   13457   SDLoc DL(Op);
   13458   MVT VT = Op.getSimpleValueType();
   13459   SDValue In = Op.getOperand(0);
   13460   MVT SVT = In.getSimpleValueType();
   13461 
   13462   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
   13463 
   13464   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
   13465                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
   13466                                  In, DAG.getUNDEF(SVT)));
   13467 }
   13468 
   13469 /// The only differences between FABS and FNEG are the mask and the logic op.
   13470 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
   13471 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
   13472   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
   13473          "Wrong opcode for lowering FABS or FNEG.");
   13474 
   13475   bool IsFABS = (Op.getOpcode() == ISD::FABS);
   13476 
   13477   // If this is a FABS and it has an FNEG user, bail out to fold the combination
   13478   // into an FNABS. We'll lower the FABS after that if it is still in use.
   13479   if (IsFABS)
   13480     for (SDNode *User : Op->uses())
   13481       if (User->getOpcode() == ISD::FNEG)
   13482         return Op;
   13483 
   13484   SDLoc dl(Op);
   13485   MVT VT = Op.getSimpleValueType();
   13486 
   13487   bool IsF128 = (VT == MVT::f128);
   13488 
   13489   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   13490   // decide if we should generate a 16-byte constant mask when we only need 4 or
   13491   // 8 bytes for the scalar case.
   13492 
   13493   MVT LogicVT;
   13494   MVT EltVT;
   13495   unsigned NumElts;
   13496 
   13497   if (VT.isVector()) {
   13498     LogicVT = VT;
   13499     EltVT = VT.getVectorElementType();
   13500     NumElts = VT.getVectorNumElements();
   13501   } else if (IsF128) {
   13502     // SSE instructions are used for optimized f128 logical operations.
   13503     LogicVT = MVT::f128;
   13504     EltVT = VT;
   13505     NumElts = 1;
   13506   } else {
   13507     // There are no scalar bitwise logical SSE/AVX instructions, so we
   13508     // generate a 16-byte vector constant and logic op even for the scalar case.
   13509     // Using a 16-byte mask allows folding the load of the mask with
   13510     // the logic op, so it can save (~4 bytes) on code size.
   13511     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
   13512     EltVT = VT;
   13513     NumElts = (VT == MVT::f64) ? 2 : 4;
   13514   }
   13515 
   13516   unsigned EltBits = EltVT.getSizeInBits();
   13517   LLVMContext *Context = DAG.getContext();
   13518   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
   13519   APInt MaskElt =
   13520     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
   13521   Constant *C = ConstantInt::get(*Context, MaskElt);
   13522   C = ConstantVector::getSplat(NumElts, C);
   13523   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   13524   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
   13525   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   13526   SDValue Mask =
   13527       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
   13528                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   13529                   false, false, false, Alignment);
   13530 
   13531   SDValue Op0 = Op.getOperand(0);
   13532   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
   13533   unsigned LogicOp =
   13534     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
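            // AND with 0x7f... clears the sign bit (FABS), XOR with 0x80... flips it
            // (FNEG), and OR with 0x80... sets it, folding FNEG(FABS(x)) into FNABS.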
   13535   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
   13536 
   13537   if (VT.isVector() || IsF128)
   13538     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
   13539 
   13540   // For the scalar case extend to a 128-bit vector, perform the logic op,
   13541   // and extract the scalar result back out.
   13542   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
   13543   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
   13544   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
   13545                      DAG.getIntPtrConstant(0, dl));
   13546 }
   13547 
   13548 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   13549   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   13550   LLVMContext *Context = DAG.getContext();
   13551   SDValue Op0 = Op.getOperand(0);
   13552   SDValue Op1 = Op.getOperand(1);
   13553   SDLoc dl(Op);
   13554   MVT VT = Op.getSimpleValueType();
   13555   MVT SrcVT = Op1.getSimpleValueType();
   13556   bool IsF128 = (VT == MVT::f128);
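            // copysign(Mag, Sgn) is computed as (Mag & ~SignMask) | (Sgn & SignMask),
            // using 128-bit logic ops so the constant masks can be load-folded.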
   13557 
    13558   // If the second operand is smaller, extend it first.
   13559   if (SrcVT.bitsLT(VT)) {
   13560     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
   13561     SrcVT = VT;
   13562   }
   13563   // And if it is bigger, shrink it first.
   13564   if (SrcVT.bitsGT(VT)) {
   13565     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
   13566     SrcVT = VT;
   13567   }
   13568 
   13569   // At this point the operands and the result should have the same
   13570   // type, and that won't be f80 since that is not custom lowered.
   13571   assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
   13572          "Unexpected type in LowerFCOPYSIGN");
   13573 
   13574   const fltSemantics &Sem =
   13575       VT == MVT::f64 ? APFloat::IEEEdouble :
   13576           (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
   13577   const unsigned SizeInBits = VT.getSizeInBits();
   13578 
   13579   SmallVector<Constant *, 4> CV(
   13580       VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
   13581       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
   13582 
   13583   // First, clear all bits but the sign bit from the second operand (sign).
   13584   CV[0] = ConstantFP::get(*Context,
   13585                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
   13586   Constant *C = ConstantVector::get(CV);
   13587   auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   13588   SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
   13589 
   13590   // Perform all logic operations as 16-byte vectors because there are no
   13591   // scalar FP logic instructions in SSE. This allows load folding of the
   13592   // constants into the logic instructions.
   13593   MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
   13594   SDValue Mask1 =
   13595       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
   13596                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   13597                   false, false, false, 16);
   13598   if (!IsF128)
   13599     Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
   13600   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
   13601 
   13602   // Next, clear the sign bit from the first operand (magnitude).
   13603   // If it's a constant, we can clear it here.
   13604   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
   13605     APFloat APF = Op0CN->getValueAPF();
   13606     // If the magnitude is a positive zero, the sign bit alone is enough.
   13607     if (APF.isPosZero())
   13608       return IsF128 ? SignBit :
   13609           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
   13610                       DAG.getIntPtrConstant(0, dl));
   13611     APF.clearSign();
   13612     CV[0] = ConstantFP::get(*Context, APF);
   13613   } else {
   13614     CV[0] = ConstantFP::get(
   13615         *Context,
   13616         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
   13617   }
   13618   C = ConstantVector::get(CV);
   13619   CPIdx = DAG.getConstantPool(C, PtrVT, 16);
   13620   SDValue Val =
   13621       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
   13622                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   13623                   false, false, false, 16);
   13624   // If the magnitude operand wasn't a constant, we need to AND out the sign.
   13625   if (!isa<ConstantFPSDNode>(Op0)) {
   13626     if (!IsF128)
   13627       Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
   13628     Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
   13629   }
   13630   // OR the magnitude value with the sign bit.
   13631   Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
   13632   return IsF128 ? Val :
   13633       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
   13634                   DAG.getIntPtrConstant(0, dl));
   13635 }
   13636 
   13637 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
   13638   SDValue N0 = Op.getOperand(0);
   13639   SDLoc dl(Op);
   13640   MVT VT = Op.getSimpleValueType();
   13641 
   13642   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
   13643   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
   13644                                   DAG.getConstant(1, dl, VT));
   13645   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT));
   13646 }
   13647 
   13648 // Check whether an OR'd tree is PTEST-able.
   13649 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
   13650                                       SelectionDAG &DAG) {
   13651   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
   13652 
   13653   if (!Subtarget->hasSSE41())
   13654     return SDValue();
   13655 
   13656   if (!Op->hasOneUse())
   13657     return SDValue();
   13658 
   13659   SDNode *N = Op.getNode();
   13660   SDLoc DL(N);
   13661 
   13662   SmallVector<SDValue, 8> Opnds;
   13663   DenseMap<SDValue, unsigned> VecInMap;
   13664   SmallVector<SDValue, 8> VecIns;
   13665   EVT VT = MVT::Other;
   13666 
   13667   // Recognize a special case where a vector is cast into a wide integer to
   13668   // test for all zeros.
   13669   Opnds.push_back(N->getOperand(0));
   13670   Opnds.push_back(N->getOperand(1));
   13671 
   13672   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
   13673     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
   13674     // BFS traverse all OR'd operands.
   13675     if (I->getOpcode() == ISD::OR) {
   13676       Opnds.push_back(I->getOperand(0));
   13677       Opnds.push_back(I->getOperand(1));
   13678       // Re-evaluate the number of nodes to be traversed.
   13679       e += 2; // 2 more nodes (LHS and RHS) are pushed.
   13680       continue;
   13681     }
   13682 
   13683     // Quit if this is not an EXTRACT_VECTOR_ELT.
   13684     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   13685       return SDValue();
   13686 
   13687     // Quit if the index is not a constant.
   13688     SDValue Idx = I->getOperand(1);
   13689     if (!isa<ConstantSDNode>(Idx))
   13690       return SDValue();
   13691 
   13692     SDValue ExtractedFromVec = I->getOperand(0);
   13693     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
   13694     if (M == VecInMap.end()) {
   13695       VT = ExtractedFromVec.getValueType();
   13696       // Quit if not 128/256-bit vector.
   13697       if (!VT.is128BitVector() && !VT.is256BitVector())
   13698         return SDValue();
   13699       // Quit if not the same type.
   13700       if (VecInMap.begin() != VecInMap.end() &&
   13701           VT != VecInMap.begin()->first.getValueType())
   13702         return SDValue();
   13703       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
   13704       VecIns.push_back(ExtractedFromVec);
   13705     }
   13706     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
   13707   }
   13708 
   13709   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   13710          "Not extracted from 128-/256-bit vector.");
   13711 
   13712   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
   13713 
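            // Each entry of VecInMap holds a bitmask of the lanes extracted from
            // that source vector.  Requiring the mask to equal FullMask below
            // guarantees that every element of each source vector feeds the OR
            // tree, so a PTEST of the whole vector matches the scalar OR result.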
   13714   for (DenseMap<SDValue, unsigned>::const_iterator
   13715         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
   13716     // Quit if not all elements are used.
   13717     if (I->second != FullMask)
   13718       return SDValue();
   13719   }
   13720 
   13721   MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
   13722 
   13723   // Cast all vectors into TestVT for PTEST.
   13724   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
   13725     VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
   13726 
   13727   // If more than one full vector is evaluated, OR them together before PTEST.
   13728   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
   13729     // Each iteration will OR 2 nodes and append the result until there is only
   13730     // 1 node left, i.e. the final OR'd value of all vectors.
   13731     SDValue LHS = VecIns[Slot];
   13732     SDValue RHS = VecIns[Slot + 1];
   13733     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
   13734   }
   13735 
   13736   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
   13737                      VecIns.back(), VecIns.back());
   13738 }
   13739 
   13740 /// \brief return true if \c Op has a use that doesn't just read flags.
   13741 static bool hasNonFlagsUse(SDValue Op) {
   13742   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
   13743        ++UI) {
   13744     SDNode *User = *UI;
   13745     unsigned UOpNo = UI.getOperandNo();
   13746     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
   13747       // Look past the truncate.
   13748       UOpNo = User->use_begin().getOperandNo();
   13749       User = *User->use_begin();
   13750     }
   13751 
   13752     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
   13753         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
   13754       return true;
   13755   }
   13756   return false;
   13757 }
   13758 
   13759 /// Emit nodes that will be selected as "test Op0,Op0", or something
   13760 /// equivalent.
   13761 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
   13762                                     SelectionDAG &DAG) const {
   13763   if (Op.getValueType() == MVT::i1) {
   13764     SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
   13765     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
   13766                        DAG.getConstant(0, dl, MVT::i8));
   13767   }
   13768   // CF and OF aren't always set the way we want. Determine which
   13769   // of these we need.
   13770   bool NeedCF = false;
   13771   bool NeedOF = false;
   13772   switch (X86CC) {
   13773   default: break;
   13774   case X86::COND_A: case X86::COND_AE:
   13775   case X86::COND_B: case X86::COND_BE:
   13776     NeedCF = true;
   13777     break;
   13778   case X86::COND_G: case X86::COND_GE:
   13779   case X86::COND_L: case X86::COND_LE:
   13780   case X86::COND_O: case X86::COND_NO: {
   13781     // Check if we really need to set the Overflow flag.
   13782     // If the operation has the NoSignedWrap flag set,
   13783     // the Overflow flag is not actually needed.
   13784     switch (Op->getOpcode()) {
   13785     case ISD::ADD:
   13786     case ISD::SUB:
   13787     case ISD::MUL:
   13788     case ISD::SHL: {
   13789       const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
   13790       if (BinNode->Flags.hasNoSignedWrap())
   13791         break;
   13792     }
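              // If the no-signed-wrap flag is absent, fall through to the default
              // case and conservatively request the overflow flag.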
   13793     default:
   13794       NeedOF = true;
   13795       break;
   13796     }
   13797     break;
   13798   }
   13799   }
   13800   // See if we can use the EFLAGS value from the operand instead of
   13801   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
   13802   // we prove that the arithmetic won't overflow, we can't use OF or CF.
   13803   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
   13804     // Emit a CMP with 0, which is the TEST pattern.
   13805     //if (Op.getValueType() == MVT::i1)
   13806     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
   13807     //                     DAG.getConstant(0, MVT::i1));
   13808     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   13809                        DAG.getConstant(0, dl, Op.getValueType()));
   13810   }
   13811   unsigned Opcode = 0;
   13812   unsigned NumOperands = 0;
   13813 
   13814   // Truncate operations may prevent the merge of the SETCC instruction
   13815   // and the arithmetic instruction before it. Attempt to truncate the operands
   13816   // of the arithmetic instruction and use a reduced bit-width instruction.
   13817   bool NeedTruncation = false;
   13818   SDValue ArithOp = Op;
   13819   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
   13820     SDValue Arith = Op->getOperand(0);
   13821     // Both the trunc and the arithmetic op need to have one user each.
   13822     if (Arith->hasOneUse())
   13823       switch (Arith.getOpcode()) {
   13824         default: break;
   13825         case ISD::ADD:
   13826         case ISD::SUB:
   13827         case ISD::AND:
   13828         case ISD::OR:
   13829         case ISD::XOR: {
   13830           NeedTruncation = true;
   13831           ArithOp = Arith;
   13832         }
   13833       }
   13834   }
   13835 
   13836   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
   13837   // which may be the operand of a truncate.  We use the variable 'Op', the
   13838   // non-truncated value, when we check for possible users.
   13839   switch (ArithOp.getOpcode()) {
   13840   case ISD::ADD:
   13841     // Due to an isel shortcoming, be conservative if this add is likely to be
   13842     // selected as part of a load-modify-store instruction. When the root node
   13843     // in a match is a store, isel doesn't know how to remap non-chain non-flag
   13844     // uses of other nodes in the match, such as the ADD in this case. This
   13845     // leads to the ADD being left around and reselected, with the result being
   13846     // two adds in the output.  Alas, even if none of our users are stores, that
   13847     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
   13848     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
   13849     // climbing the DAG back to the root, and it doesn't seem to be worth the
   13850     // effort.
   13851     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   13852          UE = Op.getNode()->use_end(); UI != UE; ++UI)
   13853       if (UI->getOpcode() != ISD::CopyToReg &&
   13854           UI->getOpcode() != ISD::SETCC &&
   13855           UI->getOpcode() != ISD::STORE)
   13856         goto default_case;
   13857 
   13858     if (ConstantSDNode *C =
   13859         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
   13860       // An add of one will be selected as an INC.
   13861       if (C->isOne() && !Subtarget->slowIncDec()) {
   13862         Opcode = X86ISD::INC;
   13863         NumOperands = 1;
   13864         break;
   13865       }
   13866 
   13867       // An add of negative one (subtract of one) will be selected as a DEC.
   13868       if (C->isAllOnesValue() && !Subtarget->slowIncDec()) {
   13869         Opcode = X86ISD::DEC;
   13870         NumOperands = 1;
   13871         break;
   13872       }
   13873     }
   13874 
   13875     // Otherwise use a regular EFLAGS-setting add.
   13876     Opcode = X86ISD::ADD;
   13877     NumOperands = 2;
   13878     break;
   13879   case ISD::SHL:
   13880   case ISD::SRL:
   13881     // If we have a constant logical shift that's only used in a comparison
   13882     // against zero turn it into an equivalent AND. This allows turning it into
   13883     // a TEST instruction later.
   13884     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
   13885         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
   13886       EVT VT = Op.getValueType();
   13887       unsigned BitWidth = VT.getSizeInBits();
   13888       unsigned ShAmt = Op->getConstantOperandVal(1);
   13889       if (ShAmt >= BitWidth) // Avoid undefined shifts.
   13890         break;
   13891       APInt Mask = ArithOp.getOpcode() == ISD::SRL
   13892                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
   13893                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
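                // For example, with i32 and ShAmt == 3:
                //   (X >> 3) == 0  <=>  (X & 0xFFFFFFF8) == 0
                //   (X << 3) == 0  <=>  (X & 0x1FFFFFFF) == 0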
   13894       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
   13895         break;
   13896       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
   13897                                 DAG.getConstant(Mask, dl, VT));
   13898       DAG.ReplaceAllUsesWith(Op, New);
   13899       Op = New;
   13900     }
   13901     break;
   13902 
   13903   case ISD::AND:
   13904     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
   13905     // because a TEST instruction will be better.
   13906     if (!hasNonFlagsUse(Op))
   13907       break;
   13908     // FALL THROUGH
   13909   case ISD::SUB:
   13910   case ISD::OR:
   13911   case ISD::XOR:
   13912     // Due to the ISEL shortcoming noted above, be conservative if this op is
   13913     // likely to be selected as part of a load-modify-store instruction.
   13914     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   13915            UE = Op.getNode()->use_end(); UI != UE; ++UI)
   13916       if (UI->getOpcode() == ISD::STORE)
   13917         goto default_case;
   13918 
   13919     // Otherwise use a regular EFLAGS-setting instruction.
   13920     switch (ArithOp.getOpcode()) {
   13921     default: llvm_unreachable("unexpected operator!");
   13922     case ISD::SUB: Opcode = X86ISD::SUB; break;
   13923     case ISD::XOR: Opcode = X86ISD::XOR; break;
   13924     case ISD::AND: Opcode = X86ISD::AND; break;
   13925     case ISD::OR: {
   13926       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
   13927         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
   13928         if (EFLAGS.getNode())
   13929           return EFLAGS;
   13930       }
   13931       Opcode = X86ISD::OR;
   13932       break;
   13933     }
   13934     }
   13935 
   13936     NumOperands = 2;
   13937     break;
   13938   case X86ISD::ADD:
   13939   case X86ISD::SUB:
   13940   case X86ISD::INC:
   13941   case X86ISD::DEC:
   13942   case X86ISD::OR:
   13943   case X86ISD::XOR:
   13944   case X86ISD::AND:
   13945     return SDValue(Op.getNode(), 1);
   13946   default:
   13947   default_case:
   13948     break;
   13949   }
   13950 
   13951   // If we found that truncation is beneficial, perform the truncation and
   13952   // update 'Op'.
   13953   if (NeedTruncation) {
   13954     EVT VT = Op.getValueType();
   13955     SDValue WideVal = Op->getOperand(0);
   13956     EVT WideVT = WideVal.getValueType();
   13957     unsigned ConvertedOp = 0;
   13958     // Use a target machine opcode to prevent further DAGCombine
   13959     // optimizations that may separate the arithmetic operations
   13960     // from the setcc node.
   13961     switch (WideVal.getOpcode()) {
   13962       default: break;
   13963       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
   13964       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
   13965       case ISD::AND: ConvertedOp = X86ISD::AND; break;
   13966       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
   13967       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
   13968     }
   13969 
   13970     if (ConvertedOp) {
   13971       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   13972       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
   13973         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
   13974         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
   13975         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
   13976       }
   13977     }
   13978   }
   13979 
   13980   if (Opcode == 0)
   13981     // Emit a CMP with 0, which is the TEST pattern.
   13982     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   13983                        DAG.getConstant(0, dl, Op.getValueType()));
   13984 
   13985   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   13986   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
   13987 
   13988   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
   13989   DAG.ReplaceAllUsesWith(Op, New);
   13990   return SDValue(New.getNode(), 1);
   13991 }
   13992 
   13993 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
   13994 /// equivalent.
   13995 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   13996                                    SDLoc dl, SelectionDAG &DAG) const {
   13997   if (isNullConstant(Op1))
   13998     return EmitTest(Op0, X86CC, dl, DAG);
   13999 
   14000   assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
   14001          "Unexpected comparison operation for MVT::i1 operands");
   14002 
   14003   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
   14004        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
   14005     // Do the comparison at i32 if it's smaller, besides the Atom case.
   14006     // This avoids subregister aliasing issues. Keep the smaller reference
   14007     // if we're optimizing for size, however, as that'll allow better folding
   14008     // of memory operations.
   14009     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
   14010         !DAG.getMachineFunction().getFunction()->optForMinSize() &&
   14011         !Subtarget->isAtom()) {
   14012       unsigned ExtendOp =
   14013           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
   14014       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
   14015       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
   14016     }
   14017     // Use SUB instead of CMP to enable CSE between SUB and CMP.
   14018     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
   14019     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
   14020                               Op0, Op1);
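              // The flags live in result #1 of the X86ISD::SUB node; result #0 is
              // the arithmetic value, which can be CSE'd with a real subtraction.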
   14021     return SDValue(Sub.getNode(), 1);
   14022   }
   14023   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
   14024 }
   14025 
   14026 /// Convert a comparison if required by the subtarget.
   14027 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
   14028                                                  SelectionDAG &DAG) const {
   14029   // If the subtarget does not support the FUCOMI instruction, floating-point
   14030   // comparisons have to be converted.
   14031   if (Subtarget->hasCMov() ||
   14032       Cmp.getOpcode() != X86ISD::CMP ||
   14033       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
   14034       !Cmp.getOperand(1).getValueType().isFloatingPoint())
   14035     return Cmp;
   14036 
   14037   // The instruction selector will select an FUCOM instruction instead of
   14038   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
   14039   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
   14040   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
   14041   SDLoc dl(Cmp);
   14042   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
   14043   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
   14044   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
   14045                             DAG.getConstant(8, dl, MVT::i8));
   14046   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
   14047 
   14048   // Some 64-bit targets lack SAHF support, but they do support FCOMI.
   14049   assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
   14050   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
   14051 }
   14052 
   14053 /// The minimum architected relative accuracy is 2^-12. We need one
   14054 /// Newton-Raphson step to have a good float result (24 bits of precision).
   14055 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
   14056                                             DAGCombinerInfo &DCI,
   14057                                             unsigned &RefinementSteps,
   14058                                             bool &UseOneConstNR) const {
   14059   EVT VT = Op.getValueType();
   14060   const char *RecipOp;
   14061 
   14062   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
   14063   // TODO: Add support for AVX512 (v16f32).
   14064   // It is likely not profitable to do this for f64 because a double-precision
   14065   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   14066   // instructions: convert to single, rsqrtss, convert back to double, refine
   14067   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   14068   // along with FMA, this could be a throughput win.
   14069   if (VT == MVT::f32 && Subtarget->hasSSE1())
   14070     RecipOp = "sqrtf";
   14071   else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
   14072            (VT == MVT::v8f32 && Subtarget->hasAVX()))
   14073     RecipOp = "vec-sqrtf";
   14074   else
   14075     return SDValue();
   14076 
   14077   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
   14078   if (!Recips.isEnabled(RecipOp))
   14079     return SDValue();
   14080 
   14081   RefinementSteps = Recips.getRefinementSteps(RecipOp);
   14082   UseOneConstNR = false;
   14083   return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
   14084 }
   14085 
   14086 /// The minimum architected relative accuracy is 2^-12. We need one
   14087 /// Newton-Raphson step to have a good float result (24 bits of precision).
   14088 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
   14089                                             DAGCombinerInfo &DCI,
   14090                                             unsigned &RefinementSteps) const {
   14091   EVT VT = Op.getValueType();
   14092   const char *RecipOp;
   14093 
   14094   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   14095   // TODO: Add support for AVX512 (v16f32).
   14096   // It is likely not profitable to do this for f64 because a double-precision
   14097   // reciprocal estimate with refinement on x86 prior to FMA requires
   14098   // 15 instructions: convert to single, rcpss, convert back to double, refine
   14099   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   14100   // along with FMA, this could be a throughput win.
   14101   if (VT == MVT::f32 && Subtarget->hasSSE1())
   14102     RecipOp = "divf";
   14103   else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
   14104            (VT == MVT::v8f32 && Subtarget->hasAVX()))
   14105     RecipOp = "vec-divf";
   14106   else
   14107     return SDValue();
   14108 
   14109   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
   14110   if (!Recips.isEnabled(RecipOp))
   14111     return SDValue();
   14112 
   14113   RefinementSteps = Recips.getRefinementSteps(RecipOp);
   14114   return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
   14115 }
   14116 
   14117 /// If we have at least two divisions that use the same divisor, convert to
   14118 /// multiplication by a reciprocal. This may need to be adjusted for a given
   14119 /// CPU if a division's cost is not at least twice the cost of a multiplication.
   14120 /// This is because we still need one division to calculate the reciprocal and
   14121 /// then we need two multiplies by that reciprocal as replacements for the
   14122 /// original divisions.
   14123 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
   14124   return 2;
   14125 }
   14126 
   14127 /// LowerToBT - The result of an 'and' is compared against zero. Turn it into a
   14128 /// BT node if possible.
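          /// For example, (X & (1 << N)) == 0 is lowered to BT X, N followed by SETAE,
          /// and the != 0 form to BT X, N followed by SETB.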
   14129 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
   14130                                      SDLoc dl, SelectionDAG &DAG) const {
   14131   SDValue Op0 = And.getOperand(0);
   14132   SDValue Op1 = And.getOperand(1);
   14133   if (Op0.getOpcode() == ISD::TRUNCATE)
   14134     Op0 = Op0.getOperand(0);
   14135   if (Op1.getOpcode() == ISD::TRUNCATE)
   14136     Op1 = Op1.getOperand(0);
   14137 
   14138   SDValue LHS, RHS;
   14139   if (Op1.getOpcode() == ISD::SHL)
   14140     std::swap(Op0, Op1);
   14141   if (Op0.getOpcode() == ISD::SHL) {
   14142     if (isOneConstant(Op0.getOperand(0))) {
   14143       // If we looked past a truncate, check that it's only truncating away
   14144       // known zeros.
   14145       unsigned BitWidth = Op0.getValueSizeInBits();
   14146       unsigned AndBitWidth = And.getValueSizeInBits();
   14147       if (BitWidth > AndBitWidth) {
   14148         APInt Zeros, Ones;
   14149         DAG.computeKnownBits(Op0, Zeros, Ones);
   14150         if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
   14151           return SDValue();
   14152       }
   14153       LHS = Op1;
   14154       RHS = Op0.getOperand(1);
   14155     }
   14156   } else if (Op1.getOpcode() == ISD::Constant) {
   14157     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
   14158     uint64_t AndRHSVal = AndRHS->getZExtValue();
   14159     SDValue AndLHS = Op0;
   14160 
   14161     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
   14162       LHS = AndLHS.getOperand(0);
   14163       RHS = AndLHS.getOperand(1);
   14164     }
   14165 
   14166     // Use BT if the immediate can't be encoded in a TEST instruction.
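              // AndRHSVal is known to be a power of two here, so Log2_64_Ceil gives
              // its exact bit index.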
   14167     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
   14168       LHS = AndLHS;
   14169       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
   14170     }
   14171   }
   14172 
   14173   if (LHS.getNode()) {
   14174     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
   14175     // instruction.  Since the shift amount is in-range-or-undefined, we know
   14176     // that doing a bittest on the i32 value is ok.  We extend to i32 because
   14177     // the encoding for the i16 version is larger than the i32 version.
   14178     // Also promote i16 to i32 for performance / code size reason.
   14179     if (LHS.getValueType() == MVT::i8 ||
   14180         LHS.getValueType() == MVT::i16)
   14181       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
   14182 
   14183     // If the operand types disagree, extend the shift amount to match.  Since
   14184     // BT ignores high bits (like shifts) we can use anyextend.
   14185     if (LHS.getValueType() != RHS.getValueType())
   14186       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
   14187 
   14188     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
   14189     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
   14190     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   14191                        DAG.getConstant(Cond, dl, MVT::i8), BT);
   14192   }
   14193 
   14194   return SDValue();
   14195 }
   14196 
   14197 /// \brief Turn an ISD::CondCode into a condition code value suitable for SSE
   14198 /// floating-point mask CMPs.
   14199 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
   14200                               SDValue &Op1) {
   14201   unsigned SSECC;
   14202   bool Swap = false;
   14203 
   14204   // SSE Condition code mapping:
   14205   //  0 - EQ
   14206   //  1 - LT
   14207   //  2 - LE
   14208   //  3 - UNORD
   14209   //  4 - NEQ
   14210   //  5 - NLT
   14211   //  6 - NLE
   14212   //  7 - ORD
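            //  8 - (sentinel, not a real immediate: used for SETUEQ/SETONE, which the
            //       callers expand into two comparisons)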
   14213   switch (SetCCOpcode) {
   14214   default: llvm_unreachable("Unexpected SETCC condition");
   14215   case ISD::SETOEQ:
   14216   case ISD::SETEQ:  SSECC = 0; break;
   14217   case ISD::SETOGT:
   14218   case ISD::SETGT:  Swap = true; // Fallthrough
   14219   case ISD::SETLT:
   14220   case ISD::SETOLT: SSECC = 1; break;
   14221   case ISD::SETOGE:
   14222   case ISD::SETGE:  Swap = true; // Fallthrough
   14223   case ISD::SETLE:
   14224   case ISD::SETOLE: SSECC = 2; break;
   14225   case ISD::SETUO:  SSECC = 3; break;
   14226   case ISD::SETUNE:
   14227   case ISD::SETNE:  SSECC = 4; break;
   14228   case ISD::SETULE: Swap = true; // Fallthrough
   14229   case ISD::SETUGE: SSECC = 5; break;
   14230   case ISD::SETULT: Swap = true; // Fallthrough
   14231   case ISD::SETUGT: SSECC = 6; break;
   14232   case ISD::SETO:   SSECC = 7; break;
   14233   case ISD::SETUEQ:
   14234   case ISD::SETONE: SSECC = 8; break;
   14235   }
   14236   if (Swap)
   14237     std::swap(Op0, Op1);
   14238 
   14239   return SSECC;
   14240 }
   14241 
   14242 // Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
   14243 // ones, and then concatenate the result back.
   14244 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
   14245   MVT VT = Op.getSimpleValueType();
   14246 
   14247   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
   14248          "Unsupported value type for operation");
   14249 
   14250   unsigned NumElems = VT.getVectorNumElements();
   14251   SDLoc dl(Op);
   14252   SDValue CC = Op.getOperand(2);
   14253 
   14254   // Extract the LHS vectors
   14255   SDValue LHS = Op.getOperand(0);
   14256   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
   14257   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
   14258 
   14259   // Extract the RHS vectors
   14260   SDValue RHS = Op.getOperand(1);
   14261   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
   14262   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
   14263 
   14264   // Issue the operation on the smaller types and concatenate the result back
   14265   MVT EltVT = VT.getVectorElementType();
   14266   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   14267   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   14268                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
   14269                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
   14270 }
   14271 
   14272 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
   14273   SDValue Op0 = Op.getOperand(0);
   14274   SDValue Op1 = Op.getOperand(1);
   14275   SDValue CC = Op.getOperand(2);
   14276   MVT VT = Op.getSimpleValueType();
   14277   SDLoc dl(Op);
   14278 
   14279   assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
   14280          "Unexpected type for boolean compare operation");
   14281   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   14282   SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
   14283                                DAG.getConstant(-1, dl, VT));
   14284   SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
   14285                                DAG.getConstant(-1, dl, VT));
   14286   switch (SetCCOpcode) {
   14287   default: llvm_unreachable("Unexpected SETCC condition");
   14288   case ISD::SETEQ:
   14289     // (x == y) -> ~(x ^ y)
   14290     return DAG.getNode(ISD::XOR, dl, VT,
   14291                        DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
   14292                        DAG.getConstant(-1, dl, VT));
   14293   case ISD::SETNE:
   14294     // (x != y) -> (x ^ y)
   14295     return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
   14296   case ISD::SETUGT:
   14297   case ISD::SETGT:
   14298     // (x > y) -> (x & ~y)
   14299     return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
   14300   case ISD::SETULT:
   14301   case ISD::SETLT:
   14302     // (x < y) -> (~x & y)
   14303     return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
   14304   case ISD::SETULE:
   14305   case ISD::SETLE:
   14306     // (x <= y) -> (~x | y)
   14307     return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
   14308   case ISD::SETUGE:
   14309   case ISD::SETGE:
   14310     // (x >= y) -> (x | ~y)
   14311     return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
   14312   }
   14313 }
   14314 
   14315 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
   14316                                      const X86Subtarget *Subtarget) {
   14317   SDValue Op0 = Op.getOperand(0);
   14318   SDValue Op1 = Op.getOperand(1);
   14319   SDValue CC = Op.getOperand(2);
   14320   MVT VT = Op.getSimpleValueType();
   14321   SDLoc dl(Op);
   14322 
   14323   assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 &&
   14324          Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
   14325          "Cannot set masked compare for this operation");
   14326 
   14327   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   14328   unsigned  Opc = 0;
   14329   bool Unsigned = false;
   14330   bool Swap = false;
   14331   unsigned SSECC;
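            // SSECC holds a comparison-predicate immediate (1 = LT, 2 = LE, 4 = NEQ,
            // 5 = NLT, 6 = NLE) for the CMPM/CMPMU nodes emitted below.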
   14332   switch (SetCCOpcode) {
   14333   default: llvm_unreachable("Unexpected SETCC condition");
   14334   case ISD::SETNE:  SSECC = 4; break;
   14335   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
   14336   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
   14337   case ISD::SETLT:  Swap = true; //fall-through
   14338   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
   14339   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
   14340   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
   14341   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
   14342   case ISD::SETULE: Unsigned = true; //fall-through
   14343   case ISD::SETLE:  SSECC = 2; break;
   14344   }
   14345 
   14346   if (Swap)
   14347     std::swap(Op0, Op1);
   14348   if (Opc)
   14349     return DAG.getNode(Opc, dl, VT, Op0, Op1);
   14350   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
   14351   return DAG.getNode(Opc, dl, VT, Op0, Op1,
   14352                      DAG.getConstant(SSECC, dl, MVT::i8));
   14353 }
   14354 
   14355 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
   14356 /// operand \p Op1.  If non-trivial (for example because it's not constant)
   14357 /// return an empty value.
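          /// For example, x <u <4,4,4,4> becomes x <=u <3,3,3,3>; any zero element
          /// would underflow, so the transformation is rejected in that case.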
   14358 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
   14359 {
   14360   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
   14361   if (!BV)
   14362     return SDValue();
   14363 
   14364   MVT VT = Op1.getSimpleValueType();
   14365   MVT EVT = VT.getVectorElementType();
   14366   unsigned n = VT.getVectorNumElements();
   14367   SmallVector<SDValue, 8> ULTOp1;
   14368 
   14369   for (unsigned i = 0; i < n; ++i) {
   14370     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
   14371     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
   14372       return SDValue();
   14373 
   14374     // Avoid underflow.
   14375     APInt Val = Elt->getAPIntValue();
   14376     if (Val == 0)
   14377       return SDValue();
   14378 
   14379     ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
   14380   }
   14381 
   14382   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
   14383 }
   14384 
   14385 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   14386                            SelectionDAG &DAG) {
   14387   SDValue Op0 = Op.getOperand(0);
   14388   SDValue Op1 = Op.getOperand(1);
   14389   SDValue CC = Op.getOperand(2);
   14390   MVT VT = Op.getSimpleValueType();
   14391   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   14392   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
   14393   SDLoc dl(Op);
   14394 
   14395   if (isFP) {
   14396 #ifndef NDEBUG
   14397     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
   14398     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
   14399 #endif
   14400 
   14401     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
   14402     unsigned Opc = X86ISD::CMPP;
   14403     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
   14404       assert(VT.getVectorNumElements() <= 16);
   14405       Opc = X86ISD::CMPM;
   14406     }
   14407     // In the two special cases we can't handle, emit two comparisons.
   14408     if (SSECC == 8) {
   14409       unsigned CC0, CC1;
   14410       unsigned CombineOpc;
   14411       if (SetCCOpcode == ISD::SETUEQ) {
   14412         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
   14413       } else {
   14414         assert(SetCCOpcode == ISD::SETONE);
   14415         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
   14416       }
   14417 
   14418       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   14419                                  DAG.getConstant(CC0, dl, MVT::i8));
   14420       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   14421                                  DAG.getConstant(CC1, dl, MVT::i8));
   14422       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
   14423     }
   14424     // Handle all other FP comparisons here.
   14425     return DAG.getNode(Opc, dl, VT, Op0, Op1,
   14426                        DAG.getConstant(SSECC, dl, MVT::i8));
   14427   }
   14428 
   14429   MVT VTOp0 = Op0.getSimpleValueType();
   14430   assert(VTOp0 == Op1.getSimpleValueType() &&
   14431          "Expected operands with same type!");
   14432   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
   14433          "Invalid number of packed elements for source and destination!");
   14434 
   14435   if (VT.is128BitVector() && VTOp0.is256BitVector()) {
   14436     // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
   14437     // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
   14438     // legalizer firstly checks if the first operand in input to the setcc has
   14439     // a legal type. If so, then it promotes the return type to that same type.
   14440     // Otherwise, the return type is promoted to the 'next legal type' which,
   14441     // for a vector of MVT::i1 is always a 128-bit integer vector type.
   14442     //
   14443     // We reach this code only if the following two conditions are met:
   14444     // 1. Both return type and operand type have been promoted to wider types
   14445     //    by the type legalizer.
   14446     // 2. The original operand type has been promoted to a 256-bit vector.
   14447     //
   14448     // Note that condition 2. only applies for AVX targets.
   14449     SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
   14450     return DAG.getZExtOrTrunc(NewOp, dl, VT);
   14451   }
   14452 
   14453   // The non-AVX512 code below works under the assumption that source and
   14454   // destination types are the same.
   14455   assert((Subtarget->hasAVX512() || (VT == VTOp0)) &&
   14456          "Value types for source and destination must be the same!");
   14457 
   14458   // Break 256-bit integer vector compare into smaller ones.
   14459   if (VT.is256BitVector() && !Subtarget->hasInt256())
   14460     return Lower256IntVSETCC(Op, DAG);
   14461 
   14462   MVT OpVT = Op1.getSimpleValueType();
   14463   if (OpVT.getVectorElementType() == MVT::i1)
   14464     return LowerBoolVSETCC_AVX512(Op, DAG);
   14465 
   14466   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
   14467   if (Subtarget->hasAVX512()) {
   14468     if (Op1.getSimpleValueType().is512BitVector() ||
   14469         (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
   14470         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
   14471       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
   14472 
   14473     // In the AVX-512 architecture setcc returns a mask with i1 elements,
   14474     // but there is no compare instruction for i8 and i16 elements in KNL.
   14475     // We are not talking about 512-bit operands in this case; those
   14476     // types are illegal.
   14477     if (MaskResult &&
   14478         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
   14479          OpVT.getVectorElementType().getSizeInBits() >= 8))
   14480       return DAG.getNode(ISD::TRUNCATE, dl, VT,
   14481                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
   14482   }
   14483 
   14484   // Lower using XOP integer comparisons.
   14485   if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
   14486        VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) {
   14487     // Translate compare code to XOP PCOM compare mode.
   14488     unsigned CmpMode = 0;
   14489     switch (SetCCOpcode) {
   14490     default: llvm_unreachable("Unexpected SETCC condition");
   14491     case ISD::SETULT:
   14492     case ISD::SETLT: CmpMode = 0x00; break;
   14493     case ISD::SETULE:
   14494     case ISD::SETLE: CmpMode = 0x01; break;
   14495     case ISD::SETUGT:
   14496     case ISD::SETGT: CmpMode = 0x02; break;
   14497     case ISD::SETUGE:
   14498     case ISD::SETGE: CmpMode = 0x03; break;
   14499     case ISD::SETEQ: CmpMode = 0x04; break;
   14500     case ISD::SETNE: CmpMode = 0x05; break;
   14501     }
   14502 
   14503     // Are we comparing unsigned or signed integers?
   14504     unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
   14505       ? X86ISD::VPCOMU : X86ISD::VPCOM;
   14506 
   14507     return DAG.getNode(Opc, dl, VT, Op0, Op1,
   14508                        DAG.getConstant(CmpMode, dl, MVT::i8));
   14509   }
   14510 
   14511   // We are handling one of the integer comparisons here.  Since SSE only has
   14512   // GT and EQ comparisons for integers, swapping operands and multiple
   14513   // operations may be required for some comparisons.
   14514   unsigned Opc;
   14515   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
   14516   bool Subus = false;
   14517 
   14518   switch (SetCCOpcode) {
   14519   default: llvm_unreachable("Unexpected SETCC condition");
   14520   case ISD::SETNE:  Invert = true;   // Fallthrough
   14521   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
   14522   case ISD::SETLT:  Swap = true;     // Fallthrough
   14523   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
   14524   case ISD::SETGE:  Swap = true;     // Fallthrough
   14525   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
   14526                     Invert = true; break;
   14527   case ISD::SETULT: Swap = true;     // Fallthrough
   14528   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
   14529                     FlipSigns = true; break;
   14530   case ISD::SETUGE: Swap = true;     // Fallthrough
   14531   case ISD::SETULE: Opc = X86ISD::PCMPGT;
   14532                     FlipSigns = true; Invert = true; break;
   14533   }
   14534 
   14535   // Special case: Use min/max operations for SETULE/SETUGE
   14536   MVT VET = VT.getVectorElementType();
   14537   bool hasMinMax =
   14538        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
   14539     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
   14540 
   14541   if (hasMinMax) {
   14542     switch (SetCCOpcode) {
   14543     default: break;
   14544     case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
   14545     case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
   14546     }
   14547 
   14548     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
   14549   }
   14550 
   14551   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
   14552   if (!MinMax && hasSubus) {
   14553     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
   14554     // Op0 u<= Op1:
   14555     //   t = psubus Op0, Op1
   14556     //   pcmpeq t, <0..0>
   14557     switch (SetCCOpcode) {
   14558     default: break;
   14559     case ISD::SETULT: {
   14560       // If the comparison is against a constant we can turn this into a
   14561       // setule.  With psubus, setule does not require a swap.  This is
   14562       // beneficial because the constant in the register is no longer
   14563       // clobbered as the destination, so it can be hoisted out of a loop.
   14564       // Only do this pre-AVX since vpcmp* is no longer destructive.
   14565       if (Subtarget->hasAVX())
   14566         break;
   14567       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
   14568       if (ULEOp1.getNode()) {
   14569         Op1 = ULEOp1;
   14570         Subus = true; Invert = false; Swap = false;
   14571       }
   14572       break;
   14573     }
   14574     // Psubus is better than flip-sign because it requires no inversion.
   14575     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
   14576     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
   14577     }
   14578 
   14579     if (Subus) {
   14580       Opc = X86ISD::SUBUS;
   14581       FlipSigns = false;
   14582     }
   14583   }
   14584 
   14585   if (Swap)
   14586     std::swap(Op0, Op1);
   14587 
   14588   // Check that the operation in question is available (most are plain SSE2,
   14589   // but PCMPGTQ and PCMPEQQ have different requirements).
   14590   if (VT == MVT::v2i64) {
   14591     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
   14592       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
   14593 
   14594       // First cast everything to the right type.
   14595       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
   14596       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
   14597 
   14598       // Since SSE has no unsigned integer comparisons, we need to flip the sign
   14599       // bits of the inputs before performing those operations. The lower
   14600       // compare is always unsigned.
   14601       SDValue SB;
   14602       if (FlipSigns) {
   14603         SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
   14604       } else {
   14605         SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
   14606         SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
   14607         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   14608                          Sign, Zero, Sign, Zero);
   14609       }
   14610       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
   14611       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
   14612 
   14613       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
   14614       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
   14615       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
   14616 
   14617       // Create masks for only the low parts/high parts of the 64 bit integers.
   14618       static const int MaskHi[] = { 1, 1, 3, 3 };
   14619       static const int MaskLo[] = { 0, 0, 2, 2 };
   14620       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
   14621       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
   14622       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
   14623 
   14624       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
   14625       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
   14626 
   14627       if (Invert)
   14628         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   14629 
   14630       return DAG.getBitcast(VT, Result);
   14631     }
   14632 
   14633     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
   14634       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
   14635       // pcmpeqd + pshufd + pand.
   14636       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
   14637 
   14638       // First cast everything to the right type.
   14639       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
   14640       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
   14641 
   14642       // Do the compare.
   14643       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
   14644 
   14645       // Make sure the lower and upper halves are both all-ones.
   14646       static const int Mask[] = { 1, 0, 3, 2 };
   14647       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
   14648       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
   14649 
   14650       if (Invert)
   14651         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   14652 
   14653       return DAG.getBitcast(VT, Result);
   14654     }
   14655   }
   14656 
   14657   // Since SSE has no unsigned integer comparisons, we need to flip the sign
   14658   // bits of the inputs before performing those operations.
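            // (XOR-ing in the sign bit maps unsigned order onto signed order, so the
            // signed PCMPGT below produces the unsigned comparison result.)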
   14659   if (FlipSigns) {
   14660     MVT EltVT = VT.getVectorElementType();
   14661     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
   14662                                  VT);
   14663     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
   14664     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
   14665   }
   14666 
   14667   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
   14668 
   14669   // If the logical-not of the result is required, perform that now.
   14670   if (Invert)
   14671     Result = DAG.getNOT(dl, Result, VT);
   14672 
   14673   if (MinMax)
   14674     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
   14675 
   14676   if (Subus)
   14677     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
   14678                          getZeroVector(VT, Subtarget, DAG, dl));
   14679 
   14680   return Result;
   14681 }
   14682 
   14683 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   14684 
   14685   MVT VT = Op.getSimpleValueType();
   14686 
   14687   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
   14688 
   14689   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
   14690          && "SetCC type must be 8-bit or 1-bit integer");
   14691   SDValue Op0 = Op.getOperand(0);
   14692   SDValue Op1 = Op.getOperand(1);
   14693   SDLoc dl(Op);
   14694   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   14695 
   14696   // Optimize to BT if possible.
   14697   // Lower (X & (1 << N)) == 0 to BT(X, N).
   14698   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
   14699   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
   14700   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
   14701       isNullConstant(Op1) &&
   14702       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   14703     if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
   14704       if (VT == MVT::i1)
   14705         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
   14706       return NewSetCC;
   14707     }
   14708   }
   14709 
   14710   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
   14711   // these.
   14712   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
   14713       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   14714 
   14715     // If the input is a setcc, then reuse the input setcc or use a new one with
   14716     // the inverted condition.
   14717     if (Op0.getOpcode() == X86ISD::SETCC) {
   14718       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
   14719       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
   14720       if (!Invert)
   14721         return Op0;
   14722 
   14723       CCode = X86::GetOppositeBranchCondition(CCode);
   14724       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   14725                                   DAG.getConstant(CCode, dl, MVT::i8),
   14726                                   Op0.getOperand(1));
   14727       if (VT == MVT::i1)
   14728         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
   14729       return SetCC;
   14730     }
   14731   }
   14732   if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) &&
   14733       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   14734 
   14735     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
   14736     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
   14737   }
   14738 
   14739   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
   14740   unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
   14741   if (X86CC == X86::COND_INVALID)
   14742     return SDValue();
   14743 
   14744   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
   14745   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
   14746   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   14747                               DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
   14748   if (VT == MVT::i1)
   14749     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
   14750   return SetCC;
   14751 }
   14752 
   14753 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
   14754   SDValue LHS = Op.getOperand(0);
   14755   SDValue RHS = Op.getOperand(1);
   14756   SDValue Carry = Op.getOperand(2);
   14757   SDValue Cond = Op.getOperand(3);
   14758   SDLoc DL(Op);
   14759 
   14760   assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
   14761   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
   14762 
   14763   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
   14764   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   14765   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
   14766   return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(),
   14767                      DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
   14768 }
   14769 
   14770 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
   14771 static bool isX86LogicalCmp(SDValue Op) {
   14772   unsigned Opc = Op.getNode()->getOpcode();
   14773   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
   14774       Opc == X86ISD::SAHF)
   14775     return true;
   14776   if (Op.getResNo() == 1 &&
   14777       (Opc == X86ISD::ADD ||
   14778        Opc == X86ISD::SUB ||
   14779        Opc == X86ISD::ADC ||
   14780        Opc == X86ISD::SBB ||
   14781        Opc == X86ISD::SMUL ||
   14782        Opc == X86ISD::UMUL ||
   14783        Opc == X86ISD::INC ||
   14784        Opc == X86ISD::DEC ||
   14785        Opc == X86ISD::OR ||
   14786        Opc == X86ISD::XOR ||
   14787        Opc == X86ISD::AND))
   14788     return true;
   14789 
   14790   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
   14791     return true;
   14792 
   14793   return false;
   14794 }
   14795 
   14796 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   14797   if (V.getOpcode() != ISD::TRUNCATE)
   14798     return false;
   14799 
   14800   SDValue VOp0 = V.getOperand(0);
   14801   unsigned InBits = VOp0.getValueSizeInBits();
   14802   unsigned Bits = V.getValueSizeInBits();
   14803   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
   14804 }
   14805 
   14806 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   14807   bool addTest = true;
   14808   SDValue Cond  = Op.getOperand(0);
   14809   SDValue Op1 = Op.getOperand(1);
   14810   SDValue Op2 = Op.getOperand(2);
   14811   SDLoc DL(Op);
   14812   MVT VT = Op1.getSimpleValueType();
   14813   SDValue CC;
   14814 
   14815   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
   14816   // are available or VBLENDV if AVX is available.
   14817   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
   14818   if (Cond.getOpcode() == ISD::SETCC &&
   14819       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
   14820        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
   14821       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
   14822     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
   14823     int SSECC = translateX86FSETCC(
   14824         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
   14825 
   14826     if (SSECC != 8) {
   14827       if (Subtarget->hasAVX512()) {
   14828         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
   14829                                   DAG.getConstant(SSECC, DL, MVT::i8));
   14830         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
   14831       }
   14832 
   14833       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
   14834                                 DAG.getConstant(SSECC, DL, MVT::i8));
   14835 
   14836       // If we have AVX, we can use a variable vector select (VBLENDV) instead
   14837       // of 3 logic instructions for size savings and potentially speed.
   14838       // Unfortunately, there is no scalar form of VBLENDV.
   14839 
   14840       // If either operand is a constant, don't try this. We can expect to
   14841       // optimize away at least one of the logic instructions later in that
   14842       // case, so that sequence would be faster than a variable blend.
   14843 
   14844       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
   14845       // uses XMM0 as the selection register. That may need just as many
   14846       // instructions as the AND/ANDN/OR sequence due to register moves, so
   14847       // don't bother.
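
               // For example, a scalar select such as
               //   select (fcmp olt %a, %b), %x, %y
               // lowers, roughly, to
               //   mask = cmpltsd a, b          ; all-ones if a < b, else zero
               //   res  = (mask & x) | (~mask & y)
               // or to a single vblendvps/vblendvpd on the widened vector when
               // AVX is available.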
   14848 
   14849       if (Subtarget->hasAVX() &&
   14850           !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
   14851 
   14852         // Convert to vectors, do a VSELECT, and convert back to scalar.
   14853         // All of the conversions should be optimized away.
   14854 
   14855         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
   14856         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
   14857         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
   14858         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
   14859 
   14860         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
   14861         VCmp = DAG.getBitcast(VCmpVT, VCmp);
   14862 
   14863         SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
   14864 
   14865         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
   14866                            VSel, DAG.getIntPtrConstant(0, DL));
   14867       }
   14868       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
   14869       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
   14870       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
   14871     }
   14872   }
   14873 
   14874   if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
   14875     SDValue Op1Scalar;
   14876     if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
   14877       Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
   14878     else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
   14879       Op1Scalar = Op1.getOperand(0);
   14880     SDValue Op2Scalar;
   14881     if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
   14882       Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
   14883     else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
   14884       Op2Scalar = Op2.getOperand(0);
   14885     if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
   14886       SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
   14887                                       Op1Scalar.getValueType(),
   14888                                       Cond, Op1Scalar, Op2Scalar);
   14889       if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
   14890         return DAG.getBitcast(VT, newSelect);
   14891       SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
   14892       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
   14893                          DAG.getIntPtrConstant(0, DL));
   14894     }
   14895   }
   14896 
   14897   if (VT == MVT::v4i1 || VT == MVT::v2i1) {
   14898     SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
   14899     Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
   14900                       DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
   14901     Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
   14902                       DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
   14903     SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
   14904                                     Cond, Op1, Op2);
   14905     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
   14906   }
   14907 
   14908   if (Cond.getOpcode() == ISD::SETCC) {
   14909     SDValue NewCond = LowerSETCC(Cond, DAG);
   14910     if (NewCond.getNode())
   14911       Cond = NewCond;
   14912   }
   14913 
   14914   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
   14915   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
   14916   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
   14917   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
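           //
           // For example (sketch), (select (seteq X, 0), -1, Y) becomes roughly:
           //   cmp $1, X        ; carry is set iff X == 0
           //   sbb R, R         ; R = (X == 0) ? -1 : 0
           //   or  Y, R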
   14918   if (Cond.getOpcode() == X86ISD::SETCC &&
   14919       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
   14920       isNullConstant(Cond.getOperand(1).getOperand(1))) {
   14921     SDValue Cmp = Cond.getOperand(1);
   14922 
   14923     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
   14924 
   14925     if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
   14926         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
   14927       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
   14928 
   14929       SDValue CmpOp0 = Cmp.getOperand(0);
   14930       // Apply further optimizations for special cases
   14931       // (select (x != 0), -1, 0) -> neg & sbb
   14932       // (select (x == 0), 0, -1) -> neg & sbb
   14933       if (isNullConstant(Y) &&
   14934             (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
   14935           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
   14936           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
   14937                                     DAG.getConstant(0, DL,
   14938                                                     CmpOp0.getValueType()),
   14939                                     CmpOp0);
   14940           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   14941                                     DAG.getConstant(X86::COND_B, DL, MVT::i8),
   14942                                     SDValue(Neg.getNode(), 1));
   14943           return Res;
   14944         }
   14945 
   14946       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
   14947                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
   14948       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   14949 
   14950       SDValue Res =   // Res = 0 or -1.
   14951         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   14952                     DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
   14953 
   14954       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
   14955         Res = DAG.getNOT(DL, Res, Res.getValueType());
   14956 
   14957       if (!isNullConstant(Op2))
   14958         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
   14959       return Res;
   14960     }
   14961   }
   14962 
   14963   // Look past (and (setcc_carry (cmp ...)), 1).
   14964   if (Cond.getOpcode() == ISD::AND &&
   14965       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
   14966       isOneConstant(Cond.getOperand(1)))
   14967     Cond = Cond.getOperand(0);
   14968 
   14969   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   14970   // setting operand in place of the X86ISD::SETCC.
   14971   unsigned CondOpcode = Cond.getOpcode();
   14972   if (CondOpcode == X86ISD::SETCC ||
   14973       CondOpcode == X86ISD::SETCC_CARRY) {
   14974     CC = Cond.getOperand(0);
   14975 
   14976     SDValue Cmp = Cond.getOperand(1);
   14977     unsigned Opc = Cmp.getOpcode();
   14978     MVT VT = Op.getSimpleValueType();
   14979 
   14980     bool IllegalFPCMov = false;
   14981     if (VT.isFloatingPoint() && !VT.isVector() &&
   14982         !isScalarFPTypeInSSEReg(VT))  // FPStack?
   14983       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
   14984 
   14985     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
   14986         Opc == X86ISD::BT) { // FIXME
   14987       Cond = Cmp;
   14988       addTest = false;
   14989     }
   14990   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   14991              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   14992              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   14993               Cond.getOperand(0).getValueType() != MVT::i8)) {
   14994     SDValue LHS = Cond.getOperand(0);
   14995     SDValue RHS = Cond.getOperand(1);
   14996     unsigned X86Opcode;
   14997     unsigned X86Cond;
   14998     SDVTList VTs;
   14999     switch (CondOpcode) {
   15000     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   15001     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   15002     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   15003     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   15004     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   15005     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   15006     default: llvm_unreachable("unexpected overflowing operator");
   15007     }
   15008     if (CondOpcode == ISD::UMULO)
   15009       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   15010                           MVT::i32);
   15011     else
   15012       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   15013 
   15014     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
   15015 
   15016     if (CondOpcode == ISD::UMULO)
   15017       Cond = X86Op.getValue(2);
   15018     else
   15019       Cond = X86Op.getValue(1);
   15020 
   15021     CC = DAG.getConstant(X86Cond, DL, MVT::i8);
   15022     addTest = false;
   15023   }
   15024 
   15025   if (addTest) {
   15026     // Look past the truncate if the high bits are known zero.
   15027     if (isTruncWithZeroHighBitsInput(Cond, DAG))
   15028       Cond = Cond.getOperand(0);
   15029 
   15030     // We know the result of AND is compared against zero. Try to match
   15031     // it to BT.
   15032     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   15033       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
   15034         CC = NewSetCC.getOperand(0);
   15035         Cond = NewSetCC.getOperand(1);
   15036         addTest = false;
   15037       }
   15038     }
   15039   }
   15040 
   15041   if (addTest) {
   15042     CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
   15043     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
   15044   }
   15045 
   15046   // a <  b ? -1 :  0 -> RES = ~setcc_carry
   15047   // a <  b ?  0 : -1 -> RES = setcc_carry
   15048   // a >= b ? -1 :  0 -> RES = setcc_carry
   15049   // a >= b ?  0 : -1 -> RES = ~setcc_carry
   15050   if (Cond.getOpcode() == X86ISD::SUB) {
   15051     Cond = ConvertCmpIfNecessary(Cond, DAG);
   15052     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
   15053 
   15054     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
   15055         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
   15056         (isNullConstant(Op1) || isNullConstant(Op2))) {
   15057       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   15058                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
   15059                                 Cond);
   15060       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
   15061         return DAG.getNOT(DL, Res, Res.getValueType());
   15062       return Res;
   15063     }
   15064   }
   15065 
   15066   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
   15067   // widen the cmov and push the truncate through. This avoids introducing a new
   15068   // branch during isel and doesn't add any extensions.
   15069   if (Op.getValueType() == MVT::i8 &&
   15070       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
   15071     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
   15072     if (T1.getValueType() == T2.getValueType() &&
   15073         // Blacklist CopyFromReg to avoid partial register stalls.
   15074         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
   15075       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
   15076       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
   15077       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
   15078     }
   15079   }
   15080 
   15081   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
   15082   // condition is true.
   15083   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
   15084   SDValue Ops[] = { Op2, Op1, CC, Cond };
   15085   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
   15086 }
   15087 
   15088 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
   15089                                        const X86Subtarget *Subtarget,
   15090                                        SelectionDAG &DAG) {
   15091   MVT VT = Op->getSimpleValueType(0);
   15092   SDValue In = Op->getOperand(0);
   15093   MVT InVT = In.getSimpleValueType();
   15094   MVT VTElt = VT.getVectorElementType();
   15095   MVT InVTElt = InVT.getVectorElementType();
   15096   SDLoc dl(Op);
   15097 
   15098   // SKX processor
   15099   if ((InVTElt == MVT::i1) &&
   15100       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
   15101         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
   15102 
   15103        ((Subtarget->hasBWI() && VT.is512BitVector() &&
   15104         VTElt.getSizeInBits() <= 16)) ||
   15105 
   15106        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
   15107         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
   15108 
   15109        ((Subtarget->hasDQI() && VT.is512BitVector() &&
   15110         VTElt.getSizeInBits() >= 32))))
   15111     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   15112 
   15113   unsigned NumElts = VT.getVectorNumElements();
   15114 
   15115   if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
   15116     return SDValue();
   15117 
   15118   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
   15119     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
   15120       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
   15121     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   15122   }
   15123 
   15124   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
   15125   MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
   15126   SDValue NegOne =
   15127    DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
   15128                    ExtVT);
   15129   SDValue Zero =
   15130    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
   15131 
   15132   SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
   15133   if (VT.is512BitVector())
   15134     return V;
   15135   return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
   15136 }
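
         // Illustrative sketch of the fallback path above: a v16i1 mask M
         // sign-extended to v16i32 is materialized as
         //   V = vselect M, <all-ones x 16>, <zero x 16>     (in v16i32)
         // while a v8i1 mask extends through v8i64 and is then truncated with
         // VTRUNC when the destination element type is narrower than i64.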
   15137 
   15138 static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
   15139                                              const X86Subtarget *Subtarget,
   15140                                              SelectionDAG &DAG) {
   15141   SDValue In = Op->getOperand(0);
   15142   MVT VT = Op->getSimpleValueType(0);
   15143   MVT InVT = In.getSimpleValueType();
   15144   assert(VT.getSizeInBits() == InVT.getSizeInBits());
   15145 
   15146   MVT InSVT = InVT.getVectorElementType();
   15147   assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits());
   15148 
   15149   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
   15150     return SDValue();
   15151   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
   15152     return SDValue();
   15153 
   15154   SDLoc dl(Op);
   15155 
   15156   // SSE41 targets can use the pmovsx* instructions directly.
   15157   if (Subtarget->hasSSE41())
   15158     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   15159 
   15160   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
   15161   SDValue Curr = In;
   15162   MVT CurrVT = InVT;
   15163 
   15164   // As SRAI is only available on i16/i32 types, we expand only up to i32
   15165   // and handle i64 separately.
   15166   while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
   15167     Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
   15168     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
   15169     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
   15170     Curr = DAG.getBitcast(CurrVT, Curr);
   15171   }
   15172 
   15173   SDValue SignExt = Curr;
   15174   if (CurrVT != InVT) {
   15175     unsigned SignExtShift =
   15176         CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
   15177     SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
   15178                           DAG.getConstant(SignExtShift, dl, MVT::i8));
   15179   }
   15180 
   15181   if (CurrVT == VT)
   15182     return SignExt;
   15183 
   15184   if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
   15185     SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
   15186                                DAG.getConstant(31, dl, MVT::i8));
   15187     SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
   15188     return DAG.getBitcast(VT, Ext);
   15189   }
   15190 
   15191   return SDValue();
   15192 }
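
         // For example, without SSE4.1 a sign_extend_vector_inreg from v16i8 to
         // v8i16 is emitted roughly as
         //   punpcklbw undef, x   ; each source byte becomes the high byte of a word
         //   psraw     $8         ; the arithmetic shift replicates the sign bit
         // and the v4i32 -> v2i64 case instead pairs each lane with a copy of
         // its sign (psrad $31) via the interleaving shuffle above.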
   15193 
   15194 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   15195                                 SelectionDAG &DAG) {
   15196   MVT VT = Op->getSimpleValueType(0);
   15197   SDValue In = Op->getOperand(0);
   15198   MVT InVT = In.getSimpleValueType();
   15199   SDLoc dl(Op);
   15200 
   15201   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
   15202     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
   15203 
   15204   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
   15205       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
   15206       (VT != MVT::v16i16 || InVT != MVT::v16i8))
   15207     return SDValue();
   15208 
   15209   if (Subtarget->hasInt256())
   15210     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   15211 
   15212   // Optimize vectors in AVX mode:
   15213   // sign extend v8i16 to v8i32 and
   15214   //             v4i32 to v4i64.
   15215   //
   15216   // Divide the input vector into two parts;
   15217   // for v4i32 the shuffle masks will be {0, 1, -1, -1} and {2, 3, -1, -1}.
   15218   // Use the vpmovsx instructions to extend v4i32 -> v2i64 and v8i16 -> v4i32,
   15219   // then concatenate the halves back to the original VT.
   15220 
   15221   unsigned NumElems = InVT.getVectorNumElements();
   15222   SDValue Undef = DAG.getUNDEF(InVT);
   15223 
   15224   SmallVector<int,8> ShufMask1(NumElems, -1);
   15225   for (unsigned i = 0; i != NumElems/2; ++i)
   15226     ShufMask1[i] = i;
   15227 
   15228   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
   15229 
   15230   SmallVector<int,8> ShufMask2(NumElems, -1);
   15231   for (unsigned i = 0; i != NumElems/2; ++i)
   15232     ShufMask2[i] = i + NumElems/2;
   15233 
   15234   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
   15235 
   15236   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
   15237                                 VT.getVectorNumElements()/2);
   15238 
   15239   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
   15240   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
   15241 
   15242   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   15243 }
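
         // Illustrative sketch of the AVX1 path above for v8i16 -> v8i32:
         //   lo = shuffle In, undef, <0, 1, 2, 3, -1, -1, -1, -1>
         //   hi = shuffle In, undef, <4, 5, 6, 7, -1, -1, -1, -1>
         //   lo = vpmovsxwd lo                ; v4i32
         //   hi = vpmovsxwd hi                ; v4i32
         //   result = concat_vectors lo, hi   ; v8i32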
   15244 
   15245 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
   15246 // may emit an illegal shuffle but the expansion is still better than scalar
   15247 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
   15248 // we'll emit a shuffle and an arithmetic shift.
   15249 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
   15250 // TODO: It is possible to support ZExt by zeroing the undef values during
   15251 // the shuffle phase or after the shuffle.
   15252 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
   15253                                  SelectionDAG &DAG) {
   15254   MVT RegVT = Op.getSimpleValueType();
   15255   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
   15256   assert(RegVT.isInteger() &&
   15257          "We only custom lower integer vector sext loads.");
   15258 
   15259   // Nothing useful we can do without SSE2 shuffles.
   15260   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
   15261 
   15262   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   15263   SDLoc dl(Ld);
   15264   EVT MemVT = Ld->getMemoryVT();
   15265   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   15266   unsigned RegSz = RegVT.getSizeInBits();
   15267 
   15268   ISD::LoadExtType Ext = Ld->getExtensionType();
   15269 
   15270   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
   15271          && "Only anyext and sext are currently implemented.");
   15272   assert(MemVT != RegVT && "Cannot extend to the same type");
   15273   assert(MemVT.isVector() && "Must load a vector from memory");
   15274 
   15275   unsigned NumElems = RegVT.getVectorNumElements();
   15276   unsigned MemSz = MemVT.getSizeInBits();
   15277   assert(RegSz > MemSz && "Register size must be greater than the mem size");
   15278 
   15279   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
   15280     // The only way in which we have a legal 256-bit vector result but not the
   15281     // integer 256-bit operations needed to directly lower a sextload is if we
   15282     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
   15283     // a 128-bit vector and a normal sign_extend to 256-bits that should get
   15284     // correctly legalized. We do this late to allow the canonical form of
   15285     // sextload to persist throughout the rest of the DAG combiner -- it wants
   15286     // to fold together any extensions it can, and so will fuse a sign_extend
   15287     // of an sextload into a sextload targeting a wider value.
   15288     SDValue Load;
   15289     if (MemSz == 128) {
   15290       // Just switch this to a normal load.
   15291       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
   15292                                        "it must be a legal 128-bit vector "
   15293                                        "type!");
   15294       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
   15295                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
   15296                   Ld->isInvariant(), Ld->getAlignment());
   15297     } else {
   15298       assert(MemSz < 128 &&
   15299              "Can't extend a type wider than 128 bits to a 256 bit vector!");
   15300       // Do an sext load to a 128-bit vector type. We want to use the same
   15301       // number of elements, but elements half as wide. This will end up being
   15302       // recursively lowered by this routine, but will succeed as we definitely
   15303       // have all the necessary features if we're using AVX1.
   15304       EVT HalfEltVT =
   15305           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
   15306       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
   15307       Load =
   15308           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
   15309                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
   15310                          Ld->isNonTemporal(), Ld->isInvariant(),
   15311                          Ld->getAlignment());
   15312     }
   15313 
   15314     // Replace chain users with the new chain.
   15315     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
   15316     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
   15317 
   15318     // Finally, do a normal sign-extend to the desired register.
   15319     return DAG.getSExtOrTrunc(Load, dl, RegVT);
   15320   }
   15321 
   15322   // All sizes must be a power of two.
   15323   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
   15324          "Non-power-of-two elements are not custom lowered!");
   15325 
   15326   // Attempt to load the original value using scalar loads.
   15327   // Find the largest scalar type that divides the total loaded size.
   15328   MVT SclrLoadTy = MVT::i8;
   15329   for (MVT Tp : MVT::integer_valuetypes()) {
   15330     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
   15331       SclrLoadTy = Tp;
   15332     }
   15333   }
   15334 
   15335   // On 32-bit systems we can't use 64-bit integer loads. Try an f64 load instead.
   15336   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
   15337       (64 <= MemSz))
   15338     SclrLoadTy = MVT::f64;
   15339 
   15340   // Calculate the number of scalar loads that we need to perform
   15341   // in order to load our vector from memory.
   15342   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
   15343 
   15344   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
   15345          "Can only lower sext loads with a single scalar load!");
   15346 
   15347   unsigned loadRegSize = RegSz;
   15348   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
   15349     loadRegSize = 128;
   15350 
   15351   // Represent our vector as a sequence of elements which are the
   15352   // largest scalar that we can load.
   15353   EVT LoadUnitVecVT = EVT::getVectorVT(
   15354       *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
   15355 
   15356   // Represent the data using the same element type that is stored in
   15357   // memory. In practice, we "widen" MemVT.
   15358   EVT WideVecVT =
   15359       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   15360                        loadRegSize / MemVT.getScalarSizeInBits());
   15361 
   15362   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
   15363          "Invalid vector type");
   15364 
   15365   // We can't shuffle using an illegal type.
   15366   assert(TLI.isTypeLegal(WideVecVT) &&
   15367          "We only lower types that form legal widened vector types");
   15368 
   15369   SmallVector<SDValue, 8> Chains;
   15370   SDValue Ptr = Ld->getBasePtr();
   15371   SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
   15372                                       TLI.getPointerTy(DAG.getDataLayout()));
   15373   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
   15374 
   15375   for (unsigned i = 0; i < NumLoads; ++i) {
   15376     // Perform a single load.
   15377     SDValue ScalarLoad =
   15378         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
   15379                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
   15380                     Ld->getAlignment());
   15381     Chains.push_back(ScalarLoad.getValue(1));
   15382     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
   15383     // another round of DAGCombining.
   15384     if (i == 0)
   15385       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
   15386     else
   15387       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
   15388                         ScalarLoad, DAG.getIntPtrConstant(i, dl));
   15389 
   15390     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   15391   }
   15392 
   15393   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   15394 
   15395   // Bitcast the loaded value to a vector of the original element type, in
   15396   // the size of the target vector type.
   15397   SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
   15398   unsigned SizeRatio = RegSz / MemSz;
   15399 
   15400   if (Ext == ISD::SEXTLOAD) {
   15401     // If we have SSE4.1, we can directly emit a VSEXT node.
   15402     if (Subtarget->hasSSE41()) {
   15403       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
   15404       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   15405       return Sext;
   15406     }
   15407 
   15408     // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
   15409     // lanes.
   15410     assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
   15411            "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
   15412 
   15413     SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
   15414     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   15415     return Shuff;
   15416   }
   15417 
   15418   // Redistribute the loaded elements into the different locations.
   15419   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   15420   for (unsigned i = 0; i != NumElems; ++i)
   15421     ShuffleVec[i * SizeRatio] = i;
   15422 
   15423   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
   15424                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
   15425 
   15426   // Bitcast to the requested type.
   15427   Shuff = DAG.getBitcast(RegVT, Shuff);
   15428   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   15429   return Shuff;
   15430 }
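
         // Illustrative sketch of the extload path above: an extload from
         // <4 x i8> in memory to a <4 x i32> register performs one scalar i32
         // load, places it in lane 0 via SCALAR_TO_VECTOR, bitcasts to v16i8,
         // shuffles the four loaded bytes to byte positions 0, 4, 8 and 12
         // (leaving the rest undef), and bitcasts to the requested v4i32.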
   15431 
   15432 // isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
   15433 // X86ISD::SETCC nodes, each of which has no other use apart from the
   15434 // AND / OR.
   15435 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
   15436   Opc = Op.getOpcode();
   15437   if (Opc != ISD::OR && Opc != ISD::AND)
   15438     return false;
   15439   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   15440           Op.getOperand(0).hasOneUse() &&
   15441           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
   15442           Op.getOperand(1).hasOneUse());
   15443 }
   15444 
   15445 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC
   15446 // and 1, and the SETCC node has a single use.
   15447 static bool isXor1OfSetCC(SDValue Op) {
   15448   if (Op.getOpcode() != ISD::XOR)
   15449     return false;
   15450   if (isOneConstant(Op.getOperand(1)))
   15451     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   15452            Op.getOperand(0).hasOneUse();
   15453   return false;
   15454 }
   15455 
   15456 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   15457   bool addTest = true;
   15458   SDValue Chain = Op.getOperand(0);
   15459   SDValue Cond  = Op.getOperand(1);
   15460   SDValue Dest  = Op.getOperand(2);
   15461   SDLoc dl(Op);
   15462   SDValue CC;
   15463   bool Inverted = false;
   15464 
   15465   if (Cond.getOpcode() == ISD::SETCC) {
   15466     // Check for setcc([su]{add,sub,mul}o == 0).
   15467     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
   15468         isNullConstant(Cond.getOperand(1)) &&
   15469         Cond.getOperand(0).getResNo() == 1 &&
   15470         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
   15471          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
   15472          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
   15473          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
   15474          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
   15475          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
   15476       Inverted = true;
   15477       Cond = Cond.getOperand(0);
   15478     } else {
   15479       SDValue NewCond = LowerSETCC(Cond, DAG);
   15480       if (NewCond.getNode())
   15481         Cond = NewCond;
   15482     }
   15483   }
   15484 #if 0
   15485   // FIXME: LowerXALUO doesn't handle these!!
   15486   else if (Cond.getOpcode() == X86ISD::ADD  ||
   15487            Cond.getOpcode() == X86ISD::SUB  ||
   15488            Cond.getOpcode() == X86ISD::SMUL ||
   15489            Cond.getOpcode() == X86ISD::UMUL)
   15490     Cond = LowerXALUO(Cond, DAG);
   15491 #endif
   15492 
   15493   // Look past (and (setcc_carry (cmp ...)), 1).
   15494   if (Cond.getOpcode() == ISD::AND &&
   15495       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
   15496       isOneConstant(Cond.getOperand(1)))
   15497     Cond = Cond.getOperand(0);
   15498 
   15499   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   15500   // setting operand in place of the X86ISD::SETCC.
   15501   unsigned CondOpcode = Cond.getOpcode();
   15502   if (CondOpcode == X86ISD::SETCC ||
   15503       CondOpcode == X86ISD::SETCC_CARRY) {
   15504     CC = Cond.getOperand(0);
   15505 
   15506     SDValue Cmp = Cond.getOperand(1);
   15507     unsigned Opc = Cmp.getOpcode();
   15508     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
   15509     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
   15510       Cond = Cmp;
   15511       addTest = false;
   15512     } else {
   15513       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
   15514       default: break;
   15515       case X86::COND_O:
   15516       case X86::COND_B:
   15517         // These can only come from an arithmetic instruction with overflow,
   15518         // e.g. SADDO, UADDO.
   15519         Cond = Cond.getNode()->getOperand(1);
   15520         addTest = false;
   15521         break;
   15522       }
   15523     }
   15524   }
   15525   CondOpcode = Cond.getOpcode();
   15526   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   15527       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   15528       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   15529        Cond.getOperand(0).getValueType() != MVT::i8)) {
   15530     SDValue LHS = Cond.getOperand(0);
   15531     SDValue RHS = Cond.getOperand(1);
   15532     unsigned X86Opcode;
   15533     unsigned X86Cond;
   15534     SDVTList VTs;
   15535     // Keep this in sync with LowerXALUO, otherwise we might create redundant
   15536     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
   15537     // X86ISD::INC).
   15538     switch (CondOpcode) {
   15539     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   15540     case ISD::SADDO:
   15541       if (isOneConstant(RHS)) {
   15542           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
   15543           break;
   15544         }
   15545       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   15546     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   15547     case ISD::SSUBO:
   15548       if (isOneConstant(RHS)) {
   15549           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
   15550           break;
   15551         }
   15552       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   15553     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   15554     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   15555     default: llvm_unreachable("unexpected overflowing operator");
   15556     }
   15557     if (Inverted)
   15558       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
   15559     if (CondOpcode == ISD::UMULO)
   15560       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   15561                           MVT::i32);
   15562     else
   15563       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   15564 
   15565     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
   15566 
   15567     if (CondOpcode == ISD::UMULO)
   15568       Cond = X86Op.getValue(2);
   15569     else
   15570       Cond = X86Op.getValue(1);
   15571 
   15572     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
   15573     addTest = false;
   15574   } else {
   15575     unsigned CondOpc;
   15576     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
   15577       SDValue Cmp = Cond.getOperand(0).getOperand(1);
   15578       if (CondOpc == ISD::OR) {
   15579         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
   15580         // two branches instead of an explicit OR instruction with a
   15581         // separate test.
   15582         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   15583             isX86LogicalCmp(Cmp)) {
   15584           CC = Cond.getOperand(0).getOperand(0);
   15585           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   15586                               Chain, Dest, CC, Cmp);
   15587           CC = Cond.getOperand(1).getOperand(0);
   15588           Cond = Cmp;
   15589           addTest = false;
   15590         }
   15591       } else { // ISD::AND
   15592         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
   15593         // two branches instead of an explicit AND instruction with a
   15594         // separate test. However, we only do this if this block doesn't
   15595         // have a fall-through edge, because this requires an explicit
   15596         // jmp when the condition is false.
   15597         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   15598             isX86LogicalCmp(Cmp) &&
   15599             Op.getNode()->hasOneUse()) {
   15600           X86::CondCode CCode =
   15601             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   15602           CCode = X86::GetOppositeBranchCondition(CCode);
   15603           CC = DAG.getConstant(CCode, dl, MVT::i8);
   15604           SDNode *User = *Op.getNode()->use_begin();
   15605           // Look for an unconditional branch following this conditional branch.
   15606           // We need this because we need to reverse the successors in order
   15607           // to implement FCMP_OEQ.
   15608           if (User->getOpcode() == ISD::BR) {
   15609             SDValue FalseBB = User->getOperand(1);
   15610             SDNode *NewBR =
   15611               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   15612             assert(NewBR == User);
   15613             (void)NewBR;
   15614             Dest = FalseBB;
   15615 
   15616             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   15617                                 Chain, Dest, CC, Cmp);
   15618             X86::CondCode CCode =
   15619               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
   15620             CCode = X86::GetOppositeBranchCondition(CCode);
   15621             CC = DAG.getConstant(CCode, dl, MVT::i8);
   15622             Cond = Cmp;
   15623             addTest = false;
   15624           }
   15625         }
   15626       }
   15627     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
   15628       // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
   15629       // It should be transformed by the DAG combiner, except when the condition
   15630       // is set by an arithmetic-with-overflow node.
   15631       X86::CondCode CCode =
   15632         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   15633       CCode = X86::GetOppositeBranchCondition(CCode);
   15634       CC = DAG.getConstant(CCode, dl, MVT::i8);
   15635       Cond = Cond.getOperand(0).getOperand(1);
   15636       addTest = false;
   15637     } else if (Cond.getOpcode() == ISD::SETCC &&
   15638                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
   15639       // For FCMP_OEQ, we can emit
   15640       // two branches instead of an explicit AND instruction with a
   15641       // separate test. However, we only do this if this block doesn't
   15642       // have a fall-through edge, because this requires an explicit
   15643       // jmp when the condition is false.
   15644       if (Op.getNode()->hasOneUse()) {
   15645         SDNode *User = *Op.getNode()->use_begin();
   15646         // Look for an unconditional branch following this conditional branch.
   15647         // We need this because we need to reverse the successors in order
   15648         // to implement FCMP_OEQ.
   15649         if (User->getOpcode() == ISD::BR) {
   15650           SDValue FalseBB = User->getOperand(1);
   15651           SDNode *NewBR =
   15652             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   15653           assert(NewBR == User);
   15654           (void)NewBR;
   15655           Dest = FalseBB;
   15656 
   15657           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   15658                                     Cond.getOperand(0), Cond.getOperand(1));
   15659           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   15660           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
   15661           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   15662                               Chain, Dest, CC, Cmp);
   15663           CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
   15664           Cond = Cmp;
   15665           addTest = false;
   15666         }
   15667       }
   15668     } else if (Cond.getOpcode() == ISD::SETCC &&
   15669                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
   15670       // For FCMP_UNE, we can emit
   15671       // two branches instead of an explicit AND instruction with a
   15672       // separate test. However, we only do this if this block doesn't
   15673       // have a fall-through edge, because this requires an explicit
   15674       // jmp when the condition is false.
   15675       if (Op.getNode()->hasOneUse()) {
   15676         SDNode *User = *Op.getNode()->use_begin();
   15677         // Look for an unconditional branch following this conditional branch.
   15678         // We need this because we need to reverse the successors in order
   15679         // to implement FCMP_UNE.
   15680         if (User->getOpcode() == ISD::BR) {
   15681           SDValue FalseBB = User->getOperand(1);
   15682           SDNode *NewBR =
   15683             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   15684           assert(NewBR == User);
   15685           (void)NewBR;
   15686 
   15687           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   15688                                     Cond.getOperand(0), Cond.getOperand(1));
   15689           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   15690           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
   15691           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   15692                               Chain, Dest, CC, Cmp);
   15693           CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
   15694           Cond = Cmp;
   15695           addTest = false;
   15696           Dest = FalseBB;
   15697         }
   15698       }
   15699     }
   15700   }
   15701 
   15702   if (addTest) {
   15703     // Look past the truncate if the high bits are known zero.
   15704     if (isTruncWithZeroHighBitsInput(Cond, DAG))
   15705         Cond = Cond.getOperand(0);
   15706 
   15707     // We know the result of AND is compared against zero. Try to match
   15708     // it to BT.
   15709     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   15710       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
   15711         CC = NewSetCC.getOperand(0);
   15712         Cond = NewSetCC.getOperand(1);
   15713         addTest = false;
   15714       }
   15715     }
   15716   }
   15717 
   15718   if (addTest) {
   15719     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
   15720     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
   15721     Cond = EmitTest(Cond, X86Cond, dl, DAG);
   15722   }
   15723   Cond = ConvertCmpIfNecessary(Cond, DAG);
   15724   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   15725                      Chain, Dest, CC, Cond);
   15726 }
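
         // Illustrative sketch of the SETOEQ handling above: a branch on
         // (fcmp oeq %a, %b) with no fall-through edge is emitted as two
         // branches around the unordered case, roughly:
         //   ucomisd %b, %a
         //   jne     .Lfalse
         //   jp      .Lfalse      ; NaN operands compare unordered
         //   jmp     .Ltrue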
   15727 
   15728 // Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
   15729 // Calls to _alloca are needed to probe the stack when allocating more than 4K
   15730 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
   15731 // that the guard pages used by the OS virtual memory manager are allocated in
   15732 // the correct sequence.
   15733 SDValue
   15734 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   15735                                            SelectionDAG &DAG) const {
   15736   MachineFunction &MF = DAG.getMachineFunction();
   15737   bool SplitStack = MF.shouldSplitStack();
   15738   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
   15739                SplitStack;
   15740   SDLoc dl(Op);
   15741 
   15742   // Get the inputs.
   15743   SDNode *Node = Op.getNode();
   15744   SDValue Chain = Op.getOperand(0);
   15745   SDValue Size  = Op.getOperand(1);
   15746   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
   15747   EVT VT = Node->getValueType(0);
   15748 
   15749   // Chain the dynamic stack allocation so that it doesn't modify the stack
   15750   // pointer when other instructions are using the stack.
   15751   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
   15752 
   15753   bool Is64Bit = Subtarget->is64Bit();
   15754   MVT SPTy = getPointerTy(DAG.getDataLayout());
   15755 
   15756   SDValue Result;
   15757   if (!Lower) {
   15758     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   15759     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
   15760     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
   15761                     " not tell us which reg is the stack pointer!");
   15762     EVT VT = Node->getValueType(0);
   15763     SDValue Tmp3 = Node->getOperand(2);
   15764 
   15765     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   15766     Chain = SP.getValue(1);
   15767     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
   15768     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   15769     unsigned StackAlign = TFI.getStackAlignment();
   15770     Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
   15771     if (Align > StackAlign)
   15772       Result = DAG.getNode(ISD::AND, dl, VT, Result,
   15773                          DAG.getConstant(-(uint64_t)Align, dl, VT));
   15774     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
   15775   } else if (SplitStack) {
   15776     MachineRegisterInfo &MRI = MF.getRegInfo();
   15777 
   15778     if (Is64Bit) {
   15779       // The 64-bit implementation of segmented stacks needs to clobber both r10
   15780       // and r11. This makes it impossible to use it along with nested parameters.
   15781       const Function *F = MF.getFunction();
   15782 
   15783       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
   15784            I != E; ++I)
   15785         if (I->hasNestAttr())
   15786           report_fatal_error("Cannot use segmented stacks with functions that "
   15787                              "have nested arguments.");
   15788     }
   15789 
   15790     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
   15791     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
   15792     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
   15793     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
   15794                                 DAG.getRegister(Vreg, SPTy));
   15795   } else {
   15796     SDValue Flag;
   15797     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
   15798 
   15799     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
   15800     Flag = Chain.getValue(1);
   15801     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   15802 
   15803     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
   15804 
   15805     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   15806     unsigned SPReg = RegInfo->getStackRegister();
   15807     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
   15808     Chain = SP.getValue(1);
   15809 
   15810     if (Align) {
   15811       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
   15812                        DAG.getConstant(-(uint64_t)Align, dl, VT));
   15813       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
   15814     }
   15815 
   15816     Result = SP;
   15817   }
   15818 
   15819   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
   15820                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
   15821 
   15822   SDValue Ops[2] = {Result, Chain};
   15823   return DAG.getMergeValues(Ops, dl);
   15824 }
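
         // Illustrative sketch of the Windows probing path above: the requested
         // size is copied into RAX (or EAX), a WIN_ALLOCA node (which expands to
         // a chkstk/_alloca stack probe) adjusts the stack, and the resulting
         // stack pointer is read back and, if necessary, re-aligned with an AND.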
   15825 
   15826 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   15827   MachineFunction &MF = DAG.getMachineFunction();
   15828   auto PtrVT = getPointerTy(MF.getDataLayout());
   15829   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   15830 
   15831   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   15832   SDLoc DL(Op);
   15833 
   15834   if (!Subtarget->is64Bit() ||
   15835       Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) {
   15836     // vastart just stores the address of the VarArgsFrameIndex slot into the
   15837     // memory location argument.
   15838     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   15839     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
   15840                         MachinePointerInfo(SV), false, false, 0);
   15841   }
   15842 
   15843   // __va_list_tag:
   15844   //   gp_offset         (0 - 6 * 8)
   15845   //   fp_offset         (48 - 48 + 8 * 16)
   15846   //   overflow_arg_area (points to parameters passed in memory).
   15847   //   reg_save_area
   15848   SmallVector<SDValue, 8> MemOps;
   15849   SDValue FIN = Op.getOperand(1);
   15850   // Store gp_offset
   15851   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
   15852                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
   15853                                                DL, MVT::i32),
   15854                                FIN, MachinePointerInfo(SV), false, false, 0);
   15855   MemOps.push_back(Store);
   15856 
   15857   // Store fp_offset
   15858   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
   15859   Store = DAG.getStore(Op.getOperand(0), DL,
   15860                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
   15861                                        MVT::i32),
   15862                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
   15863   MemOps.push_back(Store);
   15864 
   15865   // Store ptr to overflow_arg_area
   15866   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
   15867   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   15868   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
   15869                        MachinePointerInfo(SV, 8),
   15870                        false, false, 0);
   15871   MemOps.push_back(Store);
   15872 
   15873   // Store ptr to reg_save_area.
   15874   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
   15875       Subtarget->isTarget64BitLP64() ? 8 : 4, DL));
   15876   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
   15877   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
   15878       SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0);
   15879   MemOps.push_back(Store);
   15880   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
   15881 }
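
         // Illustrative sketch: on 64-bit SysV targets the code above fills in
         //   struct __va_list_tag {
         //     i32 gp_offset;          // stored at offset 0
         //     i32 fp_offset;          // stored at offset 4
         //     i8 *overflow_arg_area;  // stored at offset 8
         //     i8 *reg_save_area;      // stored at offset 16 (12 on x32)
         //   };
         // while 32-bit and Win64 targets store just the VarArgsFrameIndex address.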
   15882 
   15883 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   15884   assert(Subtarget->is64Bit() &&
   15885          "LowerVAARG only handles 64-bit va_arg!");
   15886   assert(Op.getNode()->getNumOperands() == 4);
   15887 
   15888   MachineFunction &MF = DAG.getMachineFunction();
   15889   if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()))
   15890     // The Win64 ABI uses char* instead of a structure.
   15891     return DAG.expandVAArg(Op.getNode());
   15892 
   15893   SDValue Chain = Op.getOperand(0);
   15894   SDValue SrcPtr = Op.getOperand(1);
   15895   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   15896   unsigned Align = Op.getConstantOperandVal(3);
   15897   SDLoc dl(Op);
   15898 
   15899   EVT ArgVT = Op.getNode()->getValueType(0);
   15900   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   15901   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
   15902   uint8_t ArgMode;
   15903 
   15904   // Decide which area this value should be read from.
   15905   // TODO: Implement the AMD64 ABI in its entirety. This simple
   15906   // selection mechanism works only for the basic types.
   15907   if (ArgVT == MVT::f80) {
   15908     llvm_unreachable("va_arg for f80 not yet implemented");
   15909   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
   15910     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
   15911   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
   15912     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
   15913   } else {
   15914     llvm_unreachable("Unhandled argument type in LowerVAARG");
   15915   }
   15916 
   15917   if (ArgMode == 2) {
   15918     // Sanity Check: Make sure using fp_offset makes sense.
   15919     assert(!Subtarget->useSoftFloat() &&
   15920            !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
   15921            Subtarget->hasSSE1());
   15922   }
   15923 
   15924   // Insert a VAARG_64 node into the DAG.
   15925   // VAARG_64 returns two values: the variable argument address and the chain.
   15926   SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
   15927                        DAG.getConstant(ArgMode, dl, MVT::i8),
   15928                        DAG.getConstant(Align, dl, MVT::i32)};
   15929   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
   15930   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
   15931                                           VTs, InstOps, MVT::i64,
   15932                                           MachinePointerInfo(SV),
   15933                                           /*Align=*/0,
   15934                                           /*Volatile=*/false,
   15935                                           /*ReadMem=*/true,
   15936                                           /*WriteMem=*/true);
   15937   Chain = VAARG.getValue(1);
   15938 
   15939   // Load the next argument and return it
   15940   return DAG.getLoad(ArgVT, dl,
   15941                      Chain,
   15942                      VAARG,
   15943                      MachinePointerInfo(),
   15944                      false, false, false, 0);
   15945 }
   15946 
   15947 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
   15948                            SelectionDAG &DAG) {
   15949   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
   15950   // where a va_list is still an i8*.
   15951   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
   15952   if (Subtarget->isCallingConvWin64(
   15953         DAG.getMachineFunction().getFunction()->getCallingConv()))
   15954     // Probably a Win64 va_copy.
   15955     return DAG.expandVACopy(Op.getNode());
   15956 
   15957   SDValue Chain = Op.getOperand(0);
   15958   SDValue DstPtr = Op.getOperand(1);
   15959   SDValue SrcPtr = Op.getOperand(2);
   15960   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   15961   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   15962   SDLoc DL(Op);
   15963 
   15964   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
   15965                        DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
   15966                        false, false,
   15967                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
   15968 }
   15969 
   15970 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
   15971 // amount is a constant. Takes immediate version of shift as input.
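          // For example, a VSHLI of the constant vector <1, 2, 3, 4> by 2 is folded
          // below into the build_vector <4, 8, 12, 16>, and a logical shift by an
          // amount >= the element width folds to a zero vector.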
   15972 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
   15973                                           SDValue SrcOp, uint64_t ShiftAmt,
   15974                                           SelectionDAG &DAG) {
   15975   MVT ElementType = VT.getVectorElementType();
   15976 
   15977   // Fold this packed shift into its first operand if ShiftAmt is 0.
   15978   if (ShiftAmt == 0)
   15979     return SrcOp;
   15980 
   15981   // Check for ShiftAmt >= element width
   15982   if (ShiftAmt >= ElementType.getSizeInBits()) {
   15983     if (Opc == X86ISD::VSRAI)
   15984       ShiftAmt = ElementType.getSizeInBits() - 1;
   15985     else
   15986       return DAG.getConstant(0, dl, VT);
   15987   }
   15988 
   15989   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
   15990          && "Unknown target vector shift-by-constant node");
   15991 
    15992   // Fold this packed vector shift into a build vector if SrcOp is a
    15993   // vector of Constants or UNDEFs and SrcOp's value type is the same as VT.
   15994   if (VT == SrcOp.getSimpleValueType() &&
   15995       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
   15996     SmallVector<SDValue, 8> Elts;
   15997     unsigned NumElts = SrcOp->getNumOperands();
   15998     ConstantSDNode *ND;
   15999 
   16000     switch(Opc) {
   16001     default: llvm_unreachable(nullptr);
   16002     case X86ISD::VSHLI:
   16003       for (unsigned i=0; i!=NumElts; ++i) {
   16004         SDValue CurrentOp = SrcOp->getOperand(i);
   16005         if (CurrentOp->getOpcode() == ISD::UNDEF) {
   16006           Elts.push_back(CurrentOp);
   16007           continue;
   16008         }
   16009         ND = cast<ConstantSDNode>(CurrentOp);
   16010         const APInt &C = ND->getAPIntValue();
   16011         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
   16012       }
   16013       break;
   16014     case X86ISD::VSRLI:
   16015       for (unsigned i=0; i!=NumElts; ++i) {
   16016         SDValue CurrentOp = SrcOp->getOperand(i);
   16017         if (CurrentOp->getOpcode() == ISD::UNDEF) {
   16018           Elts.push_back(CurrentOp);
   16019           continue;
   16020         }
   16021         ND = cast<ConstantSDNode>(CurrentOp);
   16022         const APInt &C = ND->getAPIntValue();
   16023         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
   16024       }
   16025       break;
   16026     case X86ISD::VSRAI:
   16027       for (unsigned i=0; i!=NumElts; ++i) {
   16028         SDValue CurrentOp = SrcOp->getOperand(i);
   16029         if (CurrentOp->getOpcode() == ISD::UNDEF) {
   16030           Elts.push_back(CurrentOp);
   16031           continue;
   16032         }
   16033         ND = cast<ConstantSDNode>(CurrentOp);
   16034         const APInt &C = ND->getAPIntValue();
   16035         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
   16036       }
   16037       break;
   16038     }
   16039 
   16040     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
   16041   }
   16042 
   16043   return DAG.getNode(Opc, dl, VT, SrcOp,
   16044                      DAG.getConstant(ShiftAmt, dl, MVT::i8));
   16045 }
   16046 
   16047 // getTargetVShiftNode - Handle vector element shifts where the shift amount
   16048 // may or may not be a constant. Takes immediate version of shift as input.
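          // For example, a non-constant i32 shift amount is lowered to the
          // non-immediate opcode (e.g. VSHL) with the amount placed in the low
          // element of a 128-bit vector, since SSE/AVX packed shifts read the
          // count from there.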
   16049 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
   16050                                    SDValue SrcOp, SDValue ShAmt,
   16051                                    SelectionDAG &DAG) {
   16052   MVT SVT = ShAmt.getSimpleValueType();
   16053   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
   16054 
   16055   // Catch shift-by-constant.
   16056   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
   16057     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
   16058                                       CShAmt->getZExtValue(), DAG);
   16059 
   16060   // Change opcode to non-immediate version
   16061   switch (Opc) {
   16062     default: llvm_unreachable("Unknown target vector shift node");
   16063     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
   16064     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
   16065     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
   16066   }
   16067 
   16068   const X86Subtarget &Subtarget =
   16069       static_cast<const X86Subtarget &>(DAG.getSubtarget());
   16070   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
   16071       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
   16072     // Let the shuffle legalizer expand this shift amount node.
   16073     SDValue Op0 = ShAmt.getOperand(0);
   16074     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
   16075     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
   16076   } else {
    16077     // Need to build a vector containing the shift amount.
    16078     // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
   16079     SmallVector<SDValue, 4> ShOps;
   16080     ShOps.push_back(ShAmt);
   16081     if (SVT == MVT::i32) {
   16082       ShOps.push_back(DAG.getConstant(0, dl, SVT));
   16083       ShOps.push_back(DAG.getUNDEF(SVT));
   16084     }
   16085     ShOps.push_back(DAG.getUNDEF(SVT));
   16086 
   16087     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
   16088     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
   16089   }
   16090 
   16091   // The return type has to be a 128-bit type with the same element
   16092   // type as the input type.
   16093   MVT EltVT = VT.getVectorElementType();
   16094   MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
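            // For a v16i16 shift, for instance, EltVT is i16 and ShVT is v8i16.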
   16095 
   16096   ShAmt = DAG.getBitcast(ShVT, ShAmt);
   16097   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
   16098 }
   16099 
   16100 /// \brief Return Mask with the necessary casting or extending
   16101 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
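          /// For example, an i8 mask used with MaskVT == v2i1 is bitcast to v8i1 and
          /// its low two elements are then taken with EXTRACT_SUBVECTOR, while with
          /// MaskVT == v8i1 the whole bitcast mask is used.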
   16102 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
   16103                            const X86Subtarget *Subtarget,
   16104                            SelectionDAG &DAG, SDLoc dl) {
   16105 
   16106   if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
   16107     // Mask should be extended
   16108     Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
   16109                        MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
   16110   }
   16111 
   16112   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) {
   16113     if (MaskVT == MVT::v64i1) {
   16114       assert(Subtarget->hasBWI() && "Expected AVX512BW target!");
    16115       // In 32-bit mode a bitcast of i64 is illegal; split the mask instead.
   16116       SDValue Lo, Hi;
   16117       Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
   16118                           DAG.getConstant(0, dl, MVT::i32));
   16119       Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
   16120                           DAG.getConstant(1, dl, MVT::i32));
   16121 
   16122       Lo = DAG.getBitcast(MVT::v32i1, Lo);
   16123       Hi = DAG.getBitcast(MVT::v32i1, Hi);
   16124 
   16125       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
   16126     } else {
    16127       // MaskVT requires fewer than 64 bits. Truncate the mask (which always
    16128       // succeeds here) and bitcast it.
   16129       MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
   16130       return DAG.getBitcast(MaskVT,
   16131                             DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
   16132     }
   16133 
   16134   } else {
   16135     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   16136                                      Mask.getSimpleValueType().getSizeInBits());
    16137     // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
    16138     // are extracted with EXTRACT_SUBVECTOR.
   16139     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   16140                        DAG.getBitcast(BitcastVT, Mask),
   16141                        DAG.getIntPtrConstant(0, dl));
   16142   }
   16143 }
   16144 
   16145 /// \brief Return (and \p Op, \p Mask) for compare instructions or
   16146 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
   16147 /// necessary casting or extending for \p Mask when lowering masking intrinsics
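          /// For example, a masked arithmetic intrinsic becomes roughly
          ///   (vselect Mask, Op, PreservedSrc)
          /// whereas a compare that already produces a mask result is combined as
          ///   (and Op, Mask).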
   16148 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
   16149                   SDValue PreservedSrc,
   16150                   const X86Subtarget *Subtarget,
   16151                   SelectionDAG &DAG) {
   16152   MVT VT = Op.getSimpleValueType();
   16153   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   16154   unsigned OpcodeSelect = ISD::VSELECT;
   16155   SDLoc dl(Op);
   16156 
   16157   if (isAllOnesConstant(Mask))
   16158     return Op;
   16159 
   16160   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   16161 
   16162   switch (Op.getOpcode()) {
   16163   default: break;
   16164   case X86ISD::PCMPEQM:
   16165   case X86ISD::PCMPGTM:
   16166   case X86ISD::CMPM:
   16167   case X86ISD::CMPMU:
   16168     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
   16169   case X86ISD::VFPCLASS:
    16170   case X86ISD::VFPCLASSS:
   16171     return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
   16172   case X86ISD::VTRUNC:
   16173   case X86ISD::VTRUNCS:
   16174   case X86ISD::VTRUNCUS:
    16175     // We can't use ISD::VSELECT here because it is not always "Legal"
    16176     // for the destination type. For example, vpmovqb requires only AVX512,
    16177     // while a vselect that operates on byte elements requires BWI.
   16178     OpcodeSelect = X86ISD::SELECT;
   16179     break;
   16180   }
   16181   if (PreservedSrc.getOpcode() == ISD::UNDEF)
   16182     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   16183   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
   16184 }
   16185 
   16186 /// \brief Creates an SDNode for a predicated scalar operation.
   16187 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
    16188 /// The mask comes in as MVT::i8 and is truncated
    16189 /// to MVT::i1 while lowering masking intrinsics.
    16190 /// The main difference between ScalarMaskingNode and VectorMaskingNode is that
    16191 /// it uses "X86select" instead of "vselect"; we just can't create a "vselect"
    16192 /// node for a scalar instruction.
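          /// For example, a masked scalar operation becomes roughly
          ///   (X86select (trunc Mask to i1), Op, PreservedSrc)
          /// where an undef \p PreservedSrc is first replaced by a zero vector.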
   16193 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
   16194                                     SDValue PreservedSrc,
   16195                                     const X86Subtarget *Subtarget,
   16196                                     SelectionDAG &DAG) {
   16197   if (isAllOnesConstant(Mask))
   16198     return Op;
   16199 
   16200   MVT VT = Op.getSimpleValueType();
   16201   SDLoc dl(Op);
   16202   // The mask should be of type MVT::i1
   16203   SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
   16204 
   16205   if (Op.getOpcode() == X86ISD::FSETCC)
   16206     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
   16207   if (Op.getOpcode() == X86ISD::VFPCLASS ||
   16208       Op.getOpcode() == X86ISD::VFPCLASSS)
   16209     return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
   16210 
   16211   if (PreservedSrc.getOpcode() == ISD::UNDEF)
   16212     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   16213   return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
   16214 }
   16215 
   16216 static int getSEHRegistrationNodeSize(const Function *Fn) {
   16217   if (!Fn->hasPersonalityFn())
   16218     report_fatal_error(
   16219         "querying registration node size for function without personality");
   16220   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
   16221   // WinEHStatePass for the full struct definition.
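            // That is 6 * 4 = 24 bytes for SEH and 4 * 4 = 16 bytes for C++ EH,
            // matching the values returned below.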
   16222   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
   16223   case EHPersonality::MSVC_X86SEH: return 24;
   16224   case EHPersonality::MSVC_CXX: return 16;
   16225   default: break;
   16226   }
   16227   report_fatal_error(
   16228       "can only recover FP for 32-bit MSVC EH personality functions");
   16229 }
   16230 
   16231 /// When the MSVC runtime transfers control to us, either to an outlined
   16232 /// function or when returning to a parent frame after catching an exception, we
   16233 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
   16234 /// Here's the math:
   16235 ///   RegNodeBase = EntryEBP - RegNodeSize
   16236 ///   ParentFP = RegNodeBase - ParentFrameOffset
   16237 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
   16238 /// subtracting the offset (negative on x86) takes us back to the parent FP.
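          /// For example, with a RegNodeSize of 24 and a ParentFrameOffset of, say,
          /// -16, an EntryEBP of 0x1000 gives RegNodeBase = 0x1000 - 24 = 0xfe8 and
          /// ParentFP = 0xfe8 - (-16) = 0xff8.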
   16239 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
   16240                                    SDValue EntryEBP) {
   16241   MachineFunction &MF = DAG.getMachineFunction();
   16242   SDLoc dl;
   16243 
   16244   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   16245   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   16246 
   16247   // It's possible that the parent function no longer has a personality function
   16248   // if the exceptional code was optimized away, in which case we just return
   16249   // the incoming EBP.
   16250   if (!Fn->hasPersonalityFn())
   16251     return EntryEBP;
   16252 
   16253   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
   16254   // registration, or the .set_setframe offset.
   16255   MCSymbol *OffsetSym =
   16256       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
   16257           GlobalValue::getRealLinkageName(Fn->getName()));
   16258   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
   16259   SDValue ParentFrameOffset =
   16260       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
   16261 
   16262   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
   16263   // prologue to RBP in the parent function.
   16264   const X86Subtarget &Subtarget =
   16265       static_cast<const X86Subtarget &>(DAG.getSubtarget());
   16266   if (Subtarget.is64Bit())
   16267     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
   16268 
   16269   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
   16270   // RegNodeBase = EntryEBP - RegNodeSize
   16271   // ParentFP = RegNodeBase - ParentFrameOffset
   16272   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
   16273                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
   16274   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
   16275 }
   16276 
   16277 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
   16278                                        SelectionDAG &DAG) {
   16279   SDLoc dl(Op);
   16280   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   16281   MVT VT = Op.getSimpleValueType();
   16282   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
   16283   if (IntrData) {
   16284     switch(IntrData->Type) {
   16285     case INTR_TYPE_1OP:
   16286       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
   16287     case INTR_TYPE_2OP:
   16288       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   16289         Op.getOperand(2));
   16290     case INTR_TYPE_2OP_IMM8:
   16291       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   16292                          DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
   16293     case INTR_TYPE_3OP:
   16294       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   16295         Op.getOperand(2), Op.getOperand(3));
   16296     case INTR_TYPE_4OP:
   16297       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   16298         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
   16299     case INTR_TYPE_1OP_MASK_RM: {
   16300       SDValue Src = Op.getOperand(1);
   16301       SDValue PassThru = Op.getOperand(2);
   16302       SDValue Mask = Op.getOperand(3);
   16303       SDValue RoundingMode;
    16304       // We always add a rounding mode to the node.
    16305       // If the rounding mode is not specified, we add the
    16306       // "current direction" mode.
   16307       if (Op.getNumOperands() == 4)
   16308         RoundingMode =
   16309           DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   16310       else
   16311         RoundingMode = Op.getOperand(4);
   16312       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   16313       if (IntrWithRoundingModeOpcode != 0)
   16314         if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
   16315             X86::STATIC_ROUNDING::CUR_DIRECTION)
   16316           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   16317                                       dl, Op.getValueType(), Src, RoundingMode),
   16318                                       Mask, PassThru, Subtarget, DAG);
   16319       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
   16320                                               RoundingMode),
   16321                                   Mask, PassThru, Subtarget, DAG);
   16322     }
   16323     case INTR_TYPE_1OP_MASK: {
   16324       SDValue Src = Op.getOperand(1);
   16325       SDValue PassThru = Op.getOperand(2);
   16326       SDValue Mask = Op.getOperand(3);
    16327       // We add the rounding mode to the node when
    16328       //   - the RM opcode is specified and
    16329       //   - RM is not "current direction".
   16330       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   16331       if (IntrWithRoundingModeOpcode != 0) {
   16332         SDValue Rnd = Op.getOperand(4);
   16333         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
   16334         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
   16335           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   16336                                       dl, Op.getValueType(),
   16337                                       Src, Rnd),
   16338                                       Mask, PassThru, Subtarget, DAG);
   16339         }
   16340       }
   16341       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
   16342                                   Mask, PassThru, Subtarget, DAG);
   16343     }
   16344     case INTR_TYPE_SCALAR_MASK: {
   16345       SDValue Src1 = Op.getOperand(1);
   16346       SDValue Src2 = Op.getOperand(2);
   16347       SDValue passThru = Op.getOperand(3);
   16348       SDValue Mask = Op.getOperand(4);
   16349       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
   16350                                   Mask, passThru, Subtarget, DAG);
   16351     }
   16352     case INTR_TYPE_SCALAR_MASK_RM: {
   16353       SDValue Src1 = Op.getOperand(1);
   16354       SDValue Src2 = Op.getOperand(2);
   16355       SDValue Src0 = Op.getOperand(3);
   16356       SDValue Mask = Op.getOperand(4);
    16357       // There are 2 kinds of intrinsics in this group:
    16358       // (1) With suppress-all-exceptions (SAE) or rounding mode - 6 operands.
    16359       // (2) With both rounding mode and SAE - 7 operands.
   16360       if (Op.getNumOperands() == 6) {
   16361         SDValue Sae  = Op.getOperand(5);
   16362         unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
   16363         return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
   16364                                                 Sae),
   16365                                     Mask, Src0, Subtarget, DAG);
   16366       }
   16367       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
   16368       SDValue RoundingMode  = Op.getOperand(5);
   16369       SDValue Sae  = Op.getOperand(6);
   16370       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
   16371                                               RoundingMode, Sae),
   16372                                   Mask, Src0, Subtarget, DAG);
   16373     }
   16374     case INTR_TYPE_2OP_MASK:
   16375     case INTR_TYPE_2OP_IMM8_MASK: {
   16376       SDValue Src1 = Op.getOperand(1);
   16377       SDValue Src2 = Op.getOperand(2);
   16378       SDValue PassThru = Op.getOperand(3);
   16379       SDValue Mask = Op.getOperand(4);
   16380 
   16381       if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
   16382         Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
   16383 
   16384       // We specify 2 possible opcodes for intrinsics with rounding modes.
    16385       // First, we check whether the intrinsic may have a non-default rounding
    16386       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
   16387       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   16388       if (IntrWithRoundingModeOpcode != 0) {
   16389         SDValue Rnd = Op.getOperand(5);
   16390         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
   16391         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
   16392           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   16393                                       dl, Op.getValueType(),
   16394                                       Src1, Src2, Rnd),
   16395                                       Mask, PassThru, Subtarget, DAG);
   16396         }
   16397       }
   16398       // TODO: Intrinsics should have fast-math-flags to propagate.
    16399       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
   16400                                   Mask, PassThru, Subtarget, DAG);
   16401     }
   16402     case INTR_TYPE_2OP_MASK_RM: {
   16403       SDValue Src1 = Op.getOperand(1);
   16404       SDValue Src2 = Op.getOperand(2);
   16405       SDValue PassThru = Op.getOperand(3);
   16406       SDValue Mask = Op.getOperand(4);
    16407       // We specify 2 possible modes for intrinsics: with and without rounding
    16408       // modes.
    16409       // First, we check whether the intrinsic has a rounding mode (6 operands);
    16410       // if not, we set the rounding mode to "current".
   16411       SDValue Rnd;
   16412       if (Op.getNumOperands() == 6)
   16413         Rnd = Op.getOperand(5);
   16414       else
   16415         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   16416       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   16417                                               Src1, Src2, Rnd),
   16418                                   Mask, PassThru, Subtarget, DAG);
   16419     }
   16420     case INTR_TYPE_3OP_SCALAR_MASK_RM: {
   16421       SDValue Src1 = Op.getOperand(1);
   16422       SDValue Src2 = Op.getOperand(2);
   16423       SDValue Src3 = Op.getOperand(3);
   16424       SDValue PassThru = Op.getOperand(4);
   16425       SDValue Mask = Op.getOperand(5);
   16426       SDValue Sae  = Op.getOperand(6);
   16427 
   16428       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
   16429                                               Src2, Src3, Sae),
   16430                                   Mask, PassThru, Subtarget, DAG);
   16431     }
   16432     case INTR_TYPE_3OP_MASK_RM: {
   16433       SDValue Src1 = Op.getOperand(1);
   16434       SDValue Src2 = Op.getOperand(2);
   16435       SDValue Imm = Op.getOperand(3);
   16436       SDValue PassThru = Op.getOperand(4);
   16437       SDValue Mask = Op.getOperand(5);
    16438       // We specify 2 possible modes for intrinsics: with and without rounding
    16439       // modes.
    16440       // First, we check whether the intrinsic has a rounding mode (7 operands);
    16441       // if not, we set the rounding mode to "current".
   16442       SDValue Rnd;
   16443       if (Op.getNumOperands() == 7)
   16444         Rnd = Op.getOperand(6);
   16445       else
   16446         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   16447       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   16448         Src1, Src2, Imm, Rnd),
   16449         Mask, PassThru, Subtarget, DAG);
   16450     }
   16451     case INTR_TYPE_3OP_IMM8_MASK:
   16452     case INTR_TYPE_3OP_MASK:
   16453     case INSERT_SUBVEC: {
   16454       SDValue Src1 = Op.getOperand(1);
   16455       SDValue Src2 = Op.getOperand(2);
   16456       SDValue Src3 = Op.getOperand(3);
   16457       SDValue PassThru = Op.getOperand(4);
   16458       SDValue Mask = Op.getOperand(5);
   16459 
   16460       if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
   16461         Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
   16462       else if (IntrData->Type == INSERT_SUBVEC) {
    16463         // The immediate should be adapted to ISD::INSERT_SUBVECTOR behavior.
   16464         assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
   16465         unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
   16466         Imm *= Src2.getSimpleValueType().getVectorNumElements();
   16467         Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
   16468       }
   16469 
   16470       // We specify 2 possible opcodes for intrinsics with rounding modes.
    16471       // First, we check whether the intrinsic may have a non-default rounding
    16472       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
   16473       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   16474       if (IntrWithRoundingModeOpcode != 0) {
   16475         SDValue Rnd = Op.getOperand(6);
   16476         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
   16477         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
   16478           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   16479                                       dl, Op.getValueType(),
   16480                                       Src1, Src2, Src3, Rnd),
   16481                                       Mask, PassThru, Subtarget, DAG);
   16482         }
   16483       }
   16484       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   16485                                               Src1, Src2, Src3),
   16486                                   Mask, PassThru, Subtarget, DAG);
   16487     }
   16488     case VPERM_3OP_MASKZ:
    16489     case VPERM_3OP_MASK: {
   16490       // Src2 is the PassThru
   16491       SDValue Src1 = Op.getOperand(1);
   16492       SDValue Src2 = Op.getOperand(2);
   16493       SDValue Src3 = Op.getOperand(3);
   16494       SDValue Mask = Op.getOperand(4);
   16495       MVT VT = Op.getSimpleValueType();
   16496       SDValue PassThru = SDValue();
   16497 
   16498       // set PassThru element
   16499       if (IntrData->Type == VPERM_3OP_MASKZ)
   16500         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
   16501       else
   16502         PassThru = DAG.getBitcast(VT, Src2);
   16503 
   16504       // Swap Src1 and Src2 in the node creation
   16505       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
   16506                                               dl, Op.getValueType(),
   16507                                               Src2, Src1, Src3),
   16508                                   Mask, PassThru, Subtarget, DAG);
   16509     }
   16510     case FMA_OP_MASK3:
   16511     case FMA_OP_MASKZ:
   16512     case FMA_OP_MASK: {
   16513       SDValue Src1 = Op.getOperand(1);
   16514       SDValue Src2 = Op.getOperand(2);
   16515       SDValue Src3 = Op.getOperand(3);
   16516       SDValue Mask = Op.getOperand(4);
   16517       MVT VT = Op.getSimpleValueType();
   16518       SDValue PassThru = SDValue();
   16519 
   16520       // set PassThru element
   16521       if (IntrData->Type == FMA_OP_MASKZ)
   16522         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
   16523       else if (IntrData->Type == FMA_OP_MASK3)
   16524         PassThru = Src3;
   16525       else
   16526         PassThru = Src1;
   16527 
   16528       // We specify 2 possible opcodes for intrinsics with rounding modes.
    16529       // First, we check whether the intrinsic may have a non-default rounding
    16530       // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
   16531       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   16532       if (IntrWithRoundingModeOpcode != 0) {
   16533         SDValue Rnd = Op.getOperand(5);
   16534         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
   16535             X86::STATIC_ROUNDING::CUR_DIRECTION)
   16536           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   16537                                                   dl, Op.getValueType(),
   16538                                                   Src1, Src2, Src3, Rnd),
   16539                                       Mask, PassThru, Subtarget, DAG);
   16540       }
   16541       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
   16542                                               dl, Op.getValueType(),
   16543                                               Src1, Src2, Src3),
   16544                                   Mask, PassThru, Subtarget, DAG);
   16545     }
   16546     case TERLOG_OP_MASK:
   16547     case TERLOG_OP_MASKZ: {
   16548       SDValue Src1 = Op.getOperand(1);
   16549       SDValue Src2 = Op.getOperand(2);
   16550       SDValue Src3 = Op.getOperand(3);
   16551       SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
   16552       SDValue Mask = Op.getOperand(5);
   16553       MVT VT = Op.getSimpleValueType();
   16554       SDValue PassThru = Src1;
   16555       // Set PassThru element.
   16556       if (IntrData->Type == TERLOG_OP_MASKZ)
   16557         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
   16558 
   16559       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   16560                                               Src1, Src2, Src3, Src4),
   16561                                   Mask, PassThru, Subtarget, DAG);
   16562     }
   16563     case FPCLASS: {
   16564       // FPclass intrinsics with mask
   16565        SDValue Src1 = Op.getOperand(1);
   16566        MVT VT = Src1.getSimpleValueType();
   16567        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   16568        SDValue Imm = Op.getOperand(2);
   16569        SDValue Mask = Op.getOperand(3);
   16570        MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   16571                                      Mask.getSimpleValueType().getSizeInBits());
   16572        SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
   16573        SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
   16574                                                  DAG.getTargetConstant(0, dl, MaskVT),
   16575                                                  Subtarget, DAG);
   16576        SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
   16577                                  DAG.getUNDEF(BitcastVT), FPclassMask,
   16578                                  DAG.getIntPtrConstant(0, dl));
   16579        return DAG.getBitcast(Op.getValueType(), Res);
   16580     }
   16581     case FPCLASSS: {
   16582       SDValue Src1 = Op.getOperand(1);
   16583       SDValue Imm = Op.getOperand(2);
   16584       SDValue Mask = Op.getOperand(3);
   16585       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
   16586       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
   16587         DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
   16588       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
   16589     }
   16590     case CMP_MASK:
   16591     case CMP_MASK_CC: {
   16592       // Comparison intrinsics with masks.
   16593       // Example of transformation:
   16594       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
   16595       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
   16596       // (i8 (bitcast
   16597       //   (v8i1 (insert_subvector undef,
   16598       //           (v2i1 (and (PCMPEQM %a, %b),
   16599       //                      (extract_subvector
   16600       //                         (v8i1 (bitcast %mask)), 0))), 0))))
   16601       MVT VT = Op.getOperand(1).getSimpleValueType();
   16602       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   16603       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
   16604       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   16605                                        Mask.getSimpleValueType().getSizeInBits());
   16606       SDValue Cmp;
   16607       if (IntrData->Type == CMP_MASK_CC) {
   16608         SDValue CC = Op.getOperand(3);
   16609         CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
   16610         // We specify 2 possible opcodes for intrinsics with rounding modes.
    16611         // First, we check whether the intrinsic may have a non-default rounding
    16612         // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
   16613         if (IntrData->Opc1 != 0) {
   16614           SDValue Rnd = Op.getOperand(5);
   16615           if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
   16616               X86::STATIC_ROUNDING::CUR_DIRECTION)
   16617             Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
   16618                               Op.getOperand(2), CC, Rnd);
   16619         }
   16620         //default rounding mode
   16621         if(!Cmp.getNode())
   16622             Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
   16623                               Op.getOperand(2), CC);
   16624 
   16625       } else {
   16626         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
   16627         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
   16628                           Op.getOperand(2));
   16629       }
   16630       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
   16631                                              DAG.getTargetConstant(0, dl,
   16632                                                                    MaskVT),
   16633                                              Subtarget, DAG);
   16634       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
   16635                                 DAG.getUNDEF(BitcastVT), CmpMask,
   16636                                 DAG.getIntPtrConstant(0, dl));
   16637       return DAG.getBitcast(Op.getValueType(), Res);
   16638     }
   16639     case CMP_MASK_SCALAR_CC: {
   16640       SDValue Src1 = Op.getOperand(1);
   16641       SDValue Src2 = Op.getOperand(2);
   16642       SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
   16643       SDValue Mask = Op.getOperand(4);
   16644 
   16645       SDValue Cmp;
   16646       if (IntrData->Opc1 != 0) {
   16647         SDValue Rnd = Op.getOperand(5);
   16648         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
   16649             X86::STATIC_ROUNDING::CUR_DIRECTION)
   16650           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
   16651       }
    16652       // Default rounding mode.
    16653       if (!Cmp.getNode())
   16654         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
   16655 
   16656       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
   16657                                              DAG.getTargetConstant(0, dl,
   16658                                                                    MVT::i1),
   16659                                              Subtarget, DAG);
   16660 
   16661       return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8,
   16662                          DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask),
   16663                          DAG.getValueType(MVT::i1));
   16664     }
   16665     case COMI: { // Comparison intrinsics
   16666       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
   16667       SDValue LHS = Op.getOperand(1);
   16668       SDValue RHS = Op.getOperand(2);
   16669       unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG);
   16670       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
   16671       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
   16672       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   16673                                   DAG.getConstant(X86CC, dl, MVT::i8), Cond);
   16674       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   16675     }
   16676     case COMI_RM: { // Comparison intrinsics with Sae
   16677       SDValue LHS = Op.getOperand(1);
   16678       SDValue RHS = Op.getOperand(2);
   16679       SDValue CC = Op.getOperand(3);
   16680       SDValue Sae = Op.getOperand(4);
   16681       auto ComiType = TranslateX86ConstCondToX86CC(CC);
   16682       // choose between ordered and unordered (comi/ucomi)
   16683       unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1;
   16684       SDValue Cond;
   16685       if (cast<ConstantSDNode>(Sae)->getZExtValue() !=
   16686                                            X86::STATIC_ROUNDING::CUR_DIRECTION)
   16687         Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae);
   16688       else
   16689         Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS);
   16690       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   16691         DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond);
   16692       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   16693     }
   16694     case VSHIFT:
   16695       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
   16696                                  Op.getOperand(1), Op.getOperand(2), DAG);
   16697     case VSHIFT_MASK:
   16698       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
   16699                                                       Op.getSimpleValueType(),
   16700                                                       Op.getOperand(1),
   16701                                                       Op.getOperand(2), DAG),
   16702                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
   16703                                   DAG);
   16704     case COMPRESS_EXPAND_IN_REG: {
   16705       SDValue Mask = Op.getOperand(3);
   16706       SDValue DataToCompress = Op.getOperand(1);
   16707       SDValue PassThru = Op.getOperand(2);
   16708       if (isAllOnesConstant(Mask)) // return data as is
   16709         return Op.getOperand(1);
   16710 
   16711       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   16712                                               DataToCompress),
   16713                                   Mask, PassThru, Subtarget, DAG);
   16714     }
   16715     case BROADCASTM: {
   16716       SDValue Mask = Op.getOperand(1);
   16717       MVT MaskVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits());
   16718       Mask = DAG.getBitcast(MaskVT, Mask);
   16719       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
   16720     }
   16721     case BLEND: {
   16722       SDValue Mask = Op.getOperand(3);
   16723       MVT VT = Op.getSimpleValueType();
   16724       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   16725       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   16726       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
   16727                          Op.getOperand(2));
   16728     }
   16729     case KUNPCK: {
   16730       MVT VT = Op.getSimpleValueType();
   16731       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
   16732 
   16733       SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
   16734       SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
   16735       // Arguments should be swapped.
   16736       SDValue Res = DAG.getNode(IntrData->Opc0, dl,
   16737                                 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
   16738                                 Src2, Src1);
   16739       return DAG.getBitcast(VT, Res);
   16740     }
   16741     default:
   16742       break;
   16743     }
   16744   }
   16745 
   16746   switch (IntNo) {
   16747   default: return SDValue();    // Don't custom lower most intrinsics.
   16748 
   16749   case Intrinsic::x86_avx2_permd:
   16750   case Intrinsic::x86_avx2_permps:
   16751     // Operands intentionally swapped. Mask is last operand to intrinsic,
   16752     // but second operand for node/instruction.
   16753     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
   16754                        Op.getOperand(2), Op.getOperand(1));
   16755 
    16756   // ptest and testp intrinsics. The intrinsics these come from are designed to
    16757   // return an integer value, not just an instruction, so lower them to the
    16758   // ptest or testp pattern and a setcc for the result.
   16759   case Intrinsic::x86_sse41_ptestz:
   16760   case Intrinsic::x86_sse41_ptestc:
   16761   case Intrinsic::x86_sse41_ptestnzc:
   16762   case Intrinsic::x86_avx_ptestz_256:
   16763   case Intrinsic::x86_avx_ptestc_256:
   16764   case Intrinsic::x86_avx_ptestnzc_256:
   16765   case Intrinsic::x86_avx_vtestz_ps:
   16766   case Intrinsic::x86_avx_vtestc_ps:
   16767   case Intrinsic::x86_avx_vtestnzc_ps:
   16768   case Intrinsic::x86_avx_vtestz_pd:
   16769   case Intrinsic::x86_avx_vtestc_pd:
   16770   case Intrinsic::x86_avx_vtestnzc_pd:
   16771   case Intrinsic::x86_avx_vtestz_ps_256:
   16772   case Intrinsic::x86_avx_vtestc_ps_256:
   16773   case Intrinsic::x86_avx_vtestnzc_ps_256:
   16774   case Intrinsic::x86_avx_vtestz_pd_256:
   16775   case Intrinsic::x86_avx_vtestc_pd_256:
   16776   case Intrinsic::x86_avx_vtestnzc_pd_256: {
   16777     bool IsTestPacked = false;
   16778     unsigned X86CC;
   16779     switch (IntNo) {
   16780     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
   16781     case Intrinsic::x86_avx_vtestz_ps:
   16782     case Intrinsic::x86_avx_vtestz_pd:
   16783     case Intrinsic::x86_avx_vtestz_ps_256:
   16784     case Intrinsic::x86_avx_vtestz_pd_256:
   16785       IsTestPacked = true; // Fallthrough
   16786     case Intrinsic::x86_sse41_ptestz:
   16787     case Intrinsic::x86_avx_ptestz_256:
   16788       // ZF = 1
   16789       X86CC = X86::COND_E;
   16790       break;
   16791     case Intrinsic::x86_avx_vtestc_ps:
   16792     case Intrinsic::x86_avx_vtestc_pd:
   16793     case Intrinsic::x86_avx_vtestc_ps_256:
   16794     case Intrinsic::x86_avx_vtestc_pd_256:
   16795       IsTestPacked = true; // Fallthrough
   16796     case Intrinsic::x86_sse41_ptestc:
   16797     case Intrinsic::x86_avx_ptestc_256:
   16798       // CF = 1
   16799       X86CC = X86::COND_B;
   16800       break;
   16801     case Intrinsic::x86_avx_vtestnzc_ps:
   16802     case Intrinsic::x86_avx_vtestnzc_pd:
   16803     case Intrinsic::x86_avx_vtestnzc_ps_256:
   16804     case Intrinsic::x86_avx_vtestnzc_pd_256:
   16805       IsTestPacked = true; // Fallthrough
   16806     case Intrinsic::x86_sse41_ptestnzc:
   16807     case Intrinsic::x86_avx_ptestnzc_256:
   16808       // ZF and CF = 0
   16809       X86CC = X86::COND_A;
   16810       break;
   16811     }
   16812 
   16813     SDValue LHS = Op.getOperand(1);
   16814     SDValue RHS = Op.getOperand(2);
   16815     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
   16816     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
   16817     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
   16818     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
   16819     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   16820   }
   16821   case Intrinsic::x86_avx512_kortestz_w:
   16822   case Intrinsic::x86_avx512_kortestc_w: {
   16823     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
   16824     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
   16825     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
   16826     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
   16827     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
   16828     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
   16829     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   16830   }
   16831 
   16832   case Intrinsic::x86_sse42_pcmpistria128:
   16833   case Intrinsic::x86_sse42_pcmpestria128:
   16834   case Intrinsic::x86_sse42_pcmpistric128:
   16835   case Intrinsic::x86_sse42_pcmpestric128:
   16836   case Intrinsic::x86_sse42_pcmpistrio128:
   16837   case Intrinsic::x86_sse42_pcmpestrio128:
   16838   case Intrinsic::x86_sse42_pcmpistris128:
   16839   case Intrinsic::x86_sse42_pcmpestris128:
   16840   case Intrinsic::x86_sse42_pcmpistriz128:
   16841   case Intrinsic::x86_sse42_pcmpestriz128: {
   16842     unsigned Opcode;
   16843     unsigned X86CC;
   16844     switch (IntNo) {
   16845     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   16846     case Intrinsic::x86_sse42_pcmpistria128:
   16847       Opcode = X86ISD::PCMPISTRI;
   16848       X86CC = X86::COND_A;
   16849       break;
   16850     case Intrinsic::x86_sse42_pcmpestria128:
   16851       Opcode = X86ISD::PCMPESTRI;
   16852       X86CC = X86::COND_A;
   16853       break;
   16854     case Intrinsic::x86_sse42_pcmpistric128:
   16855       Opcode = X86ISD::PCMPISTRI;
   16856       X86CC = X86::COND_B;
   16857       break;
   16858     case Intrinsic::x86_sse42_pcmpestric128:
   16859       Opcode = X86ISD::PCMPESTRI;
   16860       X86CC = X86::COND_B;
   16861       break;
   16862     case Intrinsic::x86_sse42_pcmpistrio128:
   16863       Opcode = X86ISD::PCMPISTRI;
   16864       X86CC = X86::COND_O;
   16865       break;
   16866     case Intrinsic::x86_sse42_pcmpestrio128:
   16867       Opcode = X86ISD::PCMPESTRI;
   16868       X86CC = X86::COND_O;
   16869       break;
   16870     case Intrinsic::x86_sse42_pcmpistris128:
   16871       Opcode = X86ISD::PCMPISTRI;
   16872       X86CC = X86::COND_S;
   16873       break;
   16874     case Intrinsic::x86_sse42_pcmpestris128:
   16875       Opcode = X86ISD::PCMPESTRI;
   16876       X86CC = X86::COND_S;
   16877       break;
   16878     case Intrinsic::x86_sse42_pcmpistriz128:
   16879       Opcode = X86ISD::PCMPISTRI;
   16880       X86CC = X86::COND_E;
   16881       break;
   16882     case Intrinsic::x86_sse42_pcmpestriz128:
   16883       Opcode = X86ISD::PCMPESTRI;
   16884       X86CC = X86::COND_E;
   16885       break;
   16886     }
   16887     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   16888     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   16889     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
   16890     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   16891                                 DAG.getConstant(X86CC, dl, MVT::i8),
   16892                                 SDValue(PCMP.getNode(), 1));
   16893     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   16894   }
   16895 
   16896   case Intrinsic::x86_sse42_pcmpistri128:
   16897   case Intrinsic::x86_sse42_pcmpestri128: {
   16898     unsigned Opcode;
   16899     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
   16900       Opcode = X86ISD::PCMPISTRI;
   16901     else
   16902       Opcode = X86ISD::PCMPESTRI;
   16903 
   16904     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   16905     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   16906     return DAG.getNode(Opcode, dl, VTs, NewOps);
   16907   }
   16908 
   16909   case Intrinsic::x86_seh_lsda: {
   16910     // Compute the symbol for the LSDA. We know it'll get emitted later.
   16911     MachineFunction &MF = DAG.getMachineFunction();
   16912     SDValue Op1 = Op.getOperand(1);
   16913     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
   16914     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
   16915         GlobalValue::getRealLinkageName(Fn->getName()));
   16916 
   16917     // Generate a simple absolute symbol reference. This intrinsic is only
   16918     // supported on 32-bit Windows, which isn't PIC.
   16919     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
   16920     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
   16921   }
   16922 
   16923   case Intrinsic::x86_seh_recoverfp: {
   16924     SDValue FnOp = Op.getOperand(1);
   16925     SDValue IncomingFPOp = Op.getOperand(2);
   16926     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
   16927     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
   16928     if (!Fn)
   16929       report_fatal_error(
   16930           "llvm.x86.seh.recoverfp must take a function as the first argument");
   16931     return recoverFramePointer(DAG, Fn, IncomingFPOp);
   16932   }
   16933 
   16934   case Intrinsic::localaddress: {
   16935     // Returns one of the stack, base, or frame pointer registers, depending on
   16936     // which is used to reference local variables.
   16937     MachineFunction &MF = DAG.getMachineFunction();
   16938     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   16939     unsigned Reg;
   16940     if (RegInfo->hasBasePointer(MF))
   16941       Reg = RegInfo->getBaseRegister();
   16942     else // This function handles the SP or FP case.
   16943       Reg = RegInfo->getPtrSizedFrameRegister(MF);
   16944     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
   16945   }
   16946   }
   16947 }
   16948 
   16949 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   16950                               SDValue Src, SDValue Mask, SDValue Base,
   16951                               SDValue Index, SDValue ScaleOp, SDValue Chain,
   16952                               const X86Subtarget * Subtarget) {
   16953   SDLoc dl(Op);
   16954   auto *C = cast<ConstantSDNode>(ScaleOp);
   16955   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   16956   MVT MaskVT = MVT::getVectorVT(MVT::i1,
   16957                              Index.getSimpleValueType().getVectorNumElements());
   16958   SDValue MaskInReg;
   16959   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   16960   if (MaskC)
   16961     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
   16962   else {
   16963     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   16964                                      Mask.getSimpleValueType().getSizeInBits());
   16965 
    16966     // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
    16967     // are extracted with EXTRACT_SUBVECTOR.
   16968     MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   16969                             DAG.getBitcast(BitcastVT, Mask),
   16970                             DAG.getIntPtrConstant(0, dl));
   16971   }
   16972   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   16973   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   16974   SDValue Segment = DAG.getRegister(0, MVT::i32);
   16975   if (Src.getOpcode() == ISD::UNDEF)
   16976     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
   16977   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
   16978   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   16979   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
   16980   return DAG.getMergeValues(RetOps, dl);
   16981 }
   16982 
   16983 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   16984                                SDValue Src, SDValue Mask, SDValue Base,
   16985                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
   16986   SDLoc dl(Op);
   16987   auto *C = cast<ConstantSDNode>(ScaleOp);
   16988   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   16989   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   16990   SDValue Segment = DAG.getRegister(0, MVT::i32);
   16991   MVT MaskVT = MVT::getVectorVT(MVT::i1,
   16992                              Index.getSimpleValueType().getVectorNumElements());
   16993   SDValue MaskInReg;
   16994   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   16995   if (MaskC)
   16996     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
   16997   else {
   16998     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   16999                                      Mask.getSimpleValueType().getSizeInBits());
   17000 
    17001     // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
    17002     // are extracted with EXTRACT_SUBVECTOR.
   17003     MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   17004                             DAG.getBitcast(BitcastVT, Mask),
   17005                             DAG.getIntPtrConstant(0, dl));
   17006   }
   17007   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   17008   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
   17009   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   17010   return SDValue(Res, 1);
   17011 }
   17012 
   17013 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   17014                                SDValue Mask, SDValue Base, SDValue Index,
   17015                                SDValue ScaleOp, SDValue Chain) {
   17016   SDLoc dl(Op);
   17017   auto *C = cast<ConstantSDNode>(ScaleOp);
   17018   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   17019   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   17020   SDValue Segment = DAG.getRegister(0, MVT::i32);
   17021   MVT MaskVT =
   17022     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
   17023   SDValue MaskInReg;
   17024   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   17025   if (MaskC)
   17026     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
   17027   else
   17028     MaskInReg = DAG.getBitcast(MaskVT, Mask);
   17029   //SDVTList VTs = DAG.getVTList(MVT::Other);
   17030   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
   17031   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
   17032   return SDValue(Res, 0);
   17033 }
   17034 
   17035 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
   17036 // read performance monitor counters (x86_rdpmc).
   17037 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
   17038                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
   17039                               SmallVectorImpl<SDValue> &Results) {
   17040   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   17041   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   17042   SDValue LO, HI;
   17043 
   17044   // The ECX register is used to select the index of the performance counter
   17045   // to read.
   17046   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
   17047                                    N->getOperand(2));
   17048   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
   17049 
   17050   // Reads the content of a 64-bit performance counter and returns it in the
   17051   // registers EDX:EAX.
   17052   if (Subtarget->is64Bit()) {
   17053     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   17054     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   17055                             LO.getValue(2));
   17056   } else {
   17057     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   17058     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   17059                             LO.getValue(2));
   17060   }
   17061   Chain = HI.getValue(1);
   17062 
   17063   if (Subtarget->is64Bit()) {
   17064     // The EAX register is loaded with the low-order 32 bits. The EDX register
   17065     // is loaded with the supported high-order bits of the counter.
   17066     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   17067                               DAG.getConstant(32, DL, MVT::i8));
   17068     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   17069     Results.push_back(Chain);
   17070     return;
   17071   }
   17072 
   17073   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   17074   SDValue Ops[] = { LO, HI };
   17075   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   17076   Results.push_back(Pair);
   17077   Results.push_back(Chain);
   17078 }
   17079 
   17080 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
   17081 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
   17082 // also used to custom lower READCYCLECOUNTER nodes.
   17083 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
   17084                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
   17085                               SmallVectorImpl<SDValue> &Results) {
   17086   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   17087   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
   17088   SDValue LO, HI;
   17089 
   17090   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
   17091   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
   17092   // and the EAX register is loaded with the low-order 32 bits.
   17093   if (Subtarget->is64Bit()) {
   17094     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   17095     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   17096                             LO.getValue(2));
   17097   } else {
   17098     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   17099     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   17100                             LO.getValue(2));
   17101   }
   17102   SDValue Chain = HI.getValue(1);
   17103 
   17104   if (Opcode == X86ISD::RDTSCP_DAG) {
   17105     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   17106 
   17107     // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
   17108     // the ECX register. Add 'ecx' explicitly to the chain.
   17109     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
   17110                                      HI.getValue(2));
   17111     // Explicitly store the content of ECX at the location passed as input
   17112     // to the 'rdtscp' intrinsic.
   17113     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
   17114                          MachinePointerInfo(), false, false, 0);
   17115   }
   17116 
   17117   if (Subtarget->is64Bit()) {
   17118     // The EDX register is loaded with the high-order 32 bits of the MSR, and
   17119     // the EAX register is loaded with the low-order 32 bits.
   17120     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   17121                               DAG.getConstant(32, DL, MVT::i8));
   17122     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   17123     Results.push_back(Chain);
   17124     return;
   17125   }
   17126 
   17127   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   17128   SDValue Ops[] = { LO, HI };
   17129   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   17130   Results.push_back(Pair);
   17131   Results.push_back(Chain);
   17132 }
   17133 
   17134 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
   17135                                      SelectionDAG &DAG) {
   17136   SmallVector<SDValue, 2> Results;
   17137   SDLoc DL(Op);
   17138   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
   17139                           Results);
   17140   return DAG.getMergeValues(Results, DL);
   17141 }
   17142 
   17143 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
   17144   MachineFunction &MF = DAG.getMachineFunction();
   17145   SDValue Chain = Op.getOperand(0);
   17146   SDValue RegNode = Op.getOperand(2);
   17147   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
   17148   if (!EHInfo)
   17149     report_fatal_error("EH registrations only live in functions using WinEH");
   17150 
   17151   // Cast the operand to an alloca, and remember the frame index.
   17152   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
   17153   if (!FINode)
   17154     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
   17155   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
   17156 
   17157   // Return the chain operand without making any DAG nodes.
   17158   return Chain;
   17159 }
   17160 
   17161 /// \brief Lower intrinsics for the TRUNCATE_TO_MEM case.
   17162 /// Returns a truncating Store/MaskedStore node.
   17163 static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op,
   17164                                                SelectionDAG &DAG,
   17165                                                MVT ElementType) {
   17166   SDLoc dl(Op);
   17167   SDValue Mask = Op.getOperand(4);
   17168   SDValue DataToTruncate = Op.getOperand(3);
   17169   SDValue Addr = Op.getOperand(2);
   17170   SDValue Chain = Op.getOperand(0);
   17171 
   17172   MVT VT  = DataToTruncate.getSimpleValueType();
   17173   MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements());
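           // SVT is the in-memory type: the same number of elements as VT but with the
           // narrower ElementType, so each lane is truncated as it is stored.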
   17174 
   17175   if (isAllOnesConstant(Mask)) // return just a truncate store
   17176     return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr,
   17177                              MachinePointerInfo(), SVT, false, false,
   17178                              SVT.getScalarSizeInBits()/8);
   17179 
   17180   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   17181   MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   17182                                    Mask.getSimpleValueType().getSizeInBits());
   17183   // In the case where MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
   17184   // are extracted by EXTRACT_SUBVECTOR.
   17185   SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   17186                               DAG.getBitcast(BitcastVT, Mask),
   17187                               DAG.getIntPtrConstant(0, dl));
   17188 
   17189   MachineMemOperand *MMO = DAG.getMachineFunction().
   17190     getMachineMemOperand(MachinePointerInfo(),
   17191                          MachineMemOperand::MOStore, SVT.getStoreSize(),
   17192                          SVT.getScalarSizeInBits()/8);
   17193 
   17194   return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr,
   17195                             VMask, SVT, MMO, true);
   17196 }
   17197 
   17198 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
   17199                                       SelectionDAG &DAG) {
   17200   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   17201 
   17202   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
   17203   if (!IntrData) {
   17204     if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
   17205       return MarkEHRegistrationNode(Op, DAG);
   17206     return SDValue();
   17207   }
   17208 
   17209   SDLoc dl(Op);
   17210   switch(IntrData->Type) {
   17211   default: llvm_unreachable("Unknown Intrinsic Type");
   17212   case RDSEED:
   17213   case RDRAND: {
   17214     // Emit the node with the right value type.
   17215     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
   17216     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
   17217 
   17218     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
   17219     // Otherwise return the value from Rand, which is always 0, cast to i32.
   17220     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
   17221                       DAG.getConstant(1, dl, Op->getValueType(1)),
   17222                       DAG.getConstant(X86::COND_B, dl, MVT::i32),
   17223                       SDValue(Result.getNode(), 1) };
   17224     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
   17225                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
   17226                                   Ops);
   17227 
   17228     // Return { result, isValid, chain }.
   17229     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
   17230                        SDValue(Result.getNode(), 2));
   17231   }
   17232   case GATHER: {
   17233   //gather(v1, mask, index, base, scale);
   17234     SDValue Chain = Op.getOperand(0);
   17235     SDValue Src   = Op.getOperand(2);
   17236     SDValue Base  = Op.getOperand(3);
   17237     SDValue Index = Op.getOperand(4);
   17238     SDValue Mask  = Op.getOperand(5);
   17239     SDValue Scale = Op.getOperand(6);
   17240     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
   17241                          Chain, Subtarget);
   17242   }
   17243   case SCATTER: {
   17244   //scatter(base, mask, index, v1, scale);
   17245     SDValue Chain = Op.getOperand(0);
   17246     SDValue Base  = Op.getOperand(2);
   17247     SDValue Mask  = Op.getOperand(3);
   17248     SDValue Index = Op.getOperand(4);
   17249     SDValue Src   = Op.getOperand(5);
   17250     SDValue Scale = Op.getOperand(6);
   17251     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
   17252                           Scale, Chain);
   17253   }
   17254   case PREFETCH: {
   17255     SDValue Hint = Op.getOperand(6);
   17256     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
   17257     assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
   17258     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
   17259     SDValue Chain = Op.getOperand(0);
   17260     SDValue Mask  = Op.getOperand(2);
   17261     SDValue Index = Op.getOperand(3);
   17262     SDValue Base  = Op.getOperand(4);
   17263     SDValue Scale = Op.getOperand(5);
   17264     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
   17265   }
   17266   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
   17267   case RDTSC: {
   17268     SmallVector<SDValue, 2> Results;
   17269     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
   17270                             Results);
   17271     return DAG.getMergeValues(Results, dl);
   17272   }
   17273   // Read Performance Monitoring Counters.
   17274   case RDPMC: {
   17275     SmallVector<SDValue, 2> Results;
   17276     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
   17277     return DAG.getMergeValues(Results, dl);
   17278   }
   17279   // XTEST intrinsics.
   17280   case XTEST: {
   17281     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
   17282     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
   17283     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17284                                 DAG.getConstant(X86::COND_NE, dl, MVT::i8),
   17285                                 InTrans);
   17286     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
   17287     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
   17288                        Ret, SDValue(InTrans.getNode(), 1));
   17289   }
   17290   // ADC/ADCX/SBB
   17291   case ADX: {
   17292     SmallVector<SDValue, 2> Results;
   17293     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
   17294     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
   17295     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
   17296                                 DAG.getConstant(-1, dl, MVT::i8));
   17297     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
   17298                               Op.getOperand(4), GenCF.getValue(1));
   17299     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
   17300                                  Op.getOperand(5), MachinePointerInfo(),
   17301                                  false, false, 0);
   17302     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17303                                 DAG.getConstant(X86::COND_B, dl, MVT::i8),
   17304                                 Res.getValue(1));
   17305     Results.push_back(SetCC);
   17306     Results.push_back(Store);
   17307     return DAG.getMergeValues(Results, dl);
   17308   }
   17309   case COMPRESS_TO_MEM: {
   17310     SDLoc dl(Op);
   17311     SDValue Mask = Op.getOperand(4);
   17312     SDValue DataToCompress = Op.getOperand(3);
   17313     SDValue Addr = Op.getOperand(2);
   17314     SDValue Chain = Op.getOperand(0);
   17315 
   17316     MVT VT = DataToCompress.getSimpleValueType();
   17317     if (isAllOnesConstant(Mask)) // return just a store
   17318       return DAG.getStore(Chain, dl, DataToCompress, Addr,
   17319                           MachinePointerInfo(), false, false,
   17320                           VT.getScalarSizeInBits()/8);
   17321 
   17322     SDValue Compressed =
   17323       getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
   17324                            Mask, DAG.getUNDEF(VT), Subtarget, DAG);
   17325     return DAG.getStore(Chain, dl, Compressed, Addr,
   17326                         MachinePointerInfo(), false, false,
   17327                         VT.getScalarSizeInBits()/8);
   17328   }
   17329   case TRUNCATE_TO_MEM_VI8:
   17330     return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8);
   17331   case TRUNCATE_TO_MEM_VI16:
   17332     return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16);
   17333   case TRUNCATE_TO_MEM_VI32:
   17334     return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32);
   17335   case EXPAND_FROM_MEM: {
   17336     SDLoc dl(Op);
   17337     SDValue Mask = Op.getOperand(4);
   17338     SDValue PassThru = Op.getOperand(3);
   17339     SDValue Addr = Op.getOperand(2);
   17340     SDValue Chain = Op.getOperand(0);
   17341     MVT VT = Op.getSimpleValueType();
   17342 
   17343     if (isAllOnesConstant(Mask)) // return just a load
   17344       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
   17345                          false, VT.getScalarSizeInBits()/8);
   17346 
   17347     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
   17348                                        false, false, false,
   17349                                        VT.getScalarSizeInBits()/8);
   17350 
   17351     SDValue Results[] = {
   17352       getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
   17353                            Mask, PassThru, Subtarget, DAG), Chain};
   17354     return DAG.getMergeValues(Results, dl);
   17355   }
   17356   }
   17357 }
   17358 
   17359 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
   17360                                            SelectionDAG &DAG) const {
   17361   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   17362   MFI->setReturnAddressIsTaken(true);
   17363 
   17364   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
   17365     return SDValue();
   17366 
   17367   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   17368   SDLoc dl(Op);
   17369   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   17370 
   17371   if (Depth > 0) {
   17372     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
   17373     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   17374     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
   17375     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   17376                        DAG.getNode(ISD::ADD, dl, PtrVT,
   17377                                    FrameAddr, Offset),
   17378                        MachinePointerInfo(), false, false, false, 0);
   17379   }
   17380 
   17381   // Just load the return address.
   17382   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
   17383   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   17384                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
   17385 }
   17386 
   17387 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   17388   MachineFunction &MF = DAG.getMachineFunction();
   17389   MachineFrameInfo *MFI = MF.getFrameInfo();
   17390   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   17391   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   17392   EVT VT = Op.getValueType();
   17393 
   17394   MFI->setFrameAddressIsTaken(true);
   17395 
   17396   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
   17397     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
   17398     // is not possible to crawl up the stack without looking at the unwind codes
   17399     // simultaneously.
   17400     int FrameAddrIndex = FuncInfo->getFAIndex();
   17401     if (!FrameAddrIndex) {
   17402       // Set up a frame object for the return address.
   17403       unsigned SlotSize = RegInfo->getSlotSize();
   17404       FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
   17405           SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
   17406       FuncInfo->setFAIndex(FrameAddrIndex);
   17407     }
   17408     return DAG.getFrameIndex(FrameAddrIndex, VT);
   17409   }
   17410 
   17411   unsigned FrameReg =
   17412       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
   17413   SDLoc dl(Op);  // FIXME probably not meaningful
   17414   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   17415   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
   17416           (FrameReg == X86::EBP && VT == MVT::i32)) &&
   17417          "Invalid Frame Register!");
   17418   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
   17419   while (Depth--)
   17420     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
   17421                             MachinePointerInfo(),
   17422                             false, false, false, 0);
   17423   return FrameAddr;
   17424 }
   17425 
   17426 // FIXME? Maybe this could be a TableGen attribute on some registers and
   17427 // this table could be generated automatically from RegInfo.
   17428 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
   17429                                               SelectionDAG &DAG) const {
   17430   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   17431   const MachineFunction &MF = DAG.getMachineFunction();
   17432 
   17433   unsigned Reg = StringSwitch<unsigned>(RegName)
   17434                        .Case("esp", X86::ESP)
   17435                        .Case("rsp", X86::RSP)
   17436                        .Case("ebp", X86::EBP)
   17437                        .Case("rbp", X86::RBP)
   17438                        .Default(0);
   17439 
   17440   if (Reg == X86::EBP || Reg == X86::RBP) {
   17441     if (!TFI.hasFP(MF))
   17442       report_fatal_error("register " + StringRef(RegName) +
   17443                          " is allocatable: function has no frame pointer");
   17444 #ifndef NDEBUG
   17445     else {
   17446       const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   17447       unsigned FrameReg =
   17448           RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
   17449       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
   17450              "Invalid Frame Register!");
   17451     }
   17452 #endif
   17453   }
   17454 
   17455   if (Reg)
   17456     return Reg;
   17457 
   17458   report_fatal_error("Invalid register name global variable");
   17459 }
   17460 
   17461 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
   17462                                                      SelectionDAG &DAG) const {
   17463   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   17464   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
   17465 }
   17466 
   17467 unsigned X86TargetLowering::getExceptionPointerRegister(
   17468     const Constant *PersonalityFn) const {
   17469   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
   17470     return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
   17471 
   17472   return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
   17473 }
   17474 
   17475 unsigned X86TargetLowering::getExceptionSelectorRegister(
   17476     const Constant *PersonalityFn) const {
   17477   // Funclet personalities don't use selectors (the runtime does the selection).
   17478   assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
   17479   return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
   17480 }
   17481 
   17482 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   17483   SDValue Chain     = Op.getOperand(0);
   17484   SDValue Offset    = Op.getOperand(1);
   17485   SDValue Handler   = Op.getOperand(2);
   17486   SDLoc dl      (Op);
   17487 
   17488   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   17489   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   17490   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   17491   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
   17492           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
   17493          "Invalid Frame Register!");
   17494   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
   17495   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
   17496 
   17497   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
   17498                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
   17499                                                        dl));
   17500   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
   17501   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
   17502                        false, false, 0);
   17503   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
   17504 
   17505   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
   17506                      DAG.getRegister(StoreAddrReg, PtrVT));
   17507 }
   17508 
   17509 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
   17510                                                SelectionDAG &DAG) const {
   17511   SDLoc DL(Op);
   17512   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
   17513                      DAG.getVTList(MVT::i32, MVT::Other),
   17514                      Op.getOperand(0), Op.getOperand(1));
   17515 }
   17516 
   17517 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
   17518                                                 SelectionDAG &DAG) const {
   17519   SDLoc DL(Op);
   17520   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
   17521                      Op.getOperand(0), Op.getOperand(1));
   17522 }
   17523 
   17524 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
   17525   return Op.getOperand(0);
   17526 }
   17527 
   17528 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   17529                                                 SelectionDAG &DAG) const {
   17530   SDValue Root = Op.getOperand(0);
   17531   SDValue Trmp = Op.getOperand(1); // trampoline
   17532   SDValue FPtr = Op.getOperand(2); // nested function
   17533   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
   17534   SDLoc dl (Op);
   17535 
   17536   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   17537   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   17538 
   17539   if (Subtarget->is64Bit()) {
   17540     SDValue OutChains[6];
   17541 
   17542     // Large code-model.
   17543     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
   17544     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
   17545 
   17546     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
   17547     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
   17548 
   17549     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
   17550 
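             // Taken together, the stores below assemble the following trampoline
             // (byte layout as implied by the opcode constants above):
             //   offset  0: 49 BB <FPtr:imm64>   movabsq $FPtr, %r11
             //   offset 10: 49 BA <Nest:imm64>   movabsq $Nest, %r10
             //   offset 20: 49 FF E3             jmpq   *%r11
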
   17551     // Load the pointer to the nested function into R11.
   17552     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
   17553     SDValue Addr = Trmp;
   17554     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
   17555                                 Addr, MachinePointerInfo(TrmpAddr),
   17556                                 false, false, 0);
   17557 
   17558     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   17559                        DAG.getConstant(2, dl, MVT::i64));
   17560     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
   17561                                 MachinePointerInfo(TrmpAddr, 2),
   17562                                 false, false, 2);
   17563 
   17564     // Load the 'nest' parameter value into R10.
   17565     // R10 is specified in X86CallingConv.td
   17566     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
   17567     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   17568                        DAG.getConstant(10, dl, MVT::i64));
   17569     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
   17570                                 Addr, MachinePointerInfo(TrmpAddr, 10),
   17571                                 false, false, 0);
   17572 
   17573     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   17574                        DAG.getConstant(12, dl, MVT::i64));
   17575     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
   17576                                 MachinePointerInfo(TrmpAddr, 12),
   17577                                 false, false, 2);
   17578 
   17579     // Jump to the nested function.
   17580     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
   17581     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   17582                        DAG.getConstant(20, dl, MVT::i64));
   17583     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
   17584                                 Addr, MachinePointerInfo(TrmpAddr, 20),
   17585                                 false, false, 0);
   17586 
   17587     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
   17588     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   17589                        DAG.getConstant(22, dl, MVT::i64));
   17590     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
   17591                                 Addr, MachinePointerInfo(TrmpAddr, 22),
   17592                                 false, false, 0);
   17593 
   17594     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   17595   } else {
   17596     const Function *Func =
   17597       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
   17598     CallingConv::ID CC = Func->getCallingConv();
   17599     unsigned NestReg;
   17600 
   17601     switch (CC) {
   17602     default:
   17603       llvm_unreachable("Unsupported calling convention");
   17604     case CallingConv::C:
   17605     case CallingConv::X86_StdCall: {
   17606       // Pass 'nest' parameter in ECX.
   17607       // Must be kept in sync with X86CallingConv.td
   17608       NestReg = X86::ECX;
   17609 
   17610       // Check that ECX wasn't needed by an 'inreg' parameter.
   17611       FunctionType *FTy = Func->getFunctionType();
   17612       const AttributeSet &Attrs = Func->getAttributes();
   17613 
   17614       if (!Attrs.isEmpty() && !Func->isVarArg()) {
   17615         unsigned InRegCount = 0;
   17616         unsigned Idx = 1;
   17617 
   17618         for (FunctionType::param_iterator I = FTy->param_begin(),
   17619              E = FTy->param_end(); I != E; ++I, ++Idx)
   17620           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
   17621             auto &DL = DAG.getDataLayout();
   17622             // FIXME: should only count parameters that are lowered to integers.
   17623             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
   17624           }
   17625 
   17626         if (InRegCount > 2) {
   17627           report_fatal_error("Nest register in use - reduce number of inreg"
   17628                              " parameters!");
   17629         }
   17630       }
   17631       break;
   17632     }
   17633     case CallingConv::X86_FastCall:
   17634     case CallingConv::X86_ThisCall:
   17635     case CallingConv::Fast:
   17636       // Pass 'nest' parameter in EAX.
   17637       // Must be kept in sync with X86CallingConv.td
   17638       NestReg = X86::EAX;
   17639       break;
   17640     }
   17641 
   17642     SDValue OutChains[4];
   17643     SDValue Addr, Disp;
   17644 
   17645     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   17646                        DAG.getConstant(10, dl, MVT::i32));
   17647     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
   17648 
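             // The four stores below assemble a 10-byte trampoline (byte layout as
             // implied by the constants used below):
             //   offset 0: B8+reg <Nest:imm32>   movl $Nest, %NestReg
             //   offset 5: E9 <Disp:rel32>       jmp  FPtr
             // where Disp is relative to the end of the jmp, i.e. Trmp + 10.
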
   17649     // This is storing the opcode for MOV32ri.
   17650     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
   17651     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
   17652     OutChains[0] = DAG.getStore(Root, dl,
   17653                                 DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
   17654                                 Trmp, MachinePointerInfo(TrmpAddr),
   17655                                 false, false, 0);
   17656 
   17657     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   17658                        DAG.getConstant(1, dl, MVT::i32));
   17659     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
   17660                                 MachinePointerInfo(TrmpAddr, 1),
   17661                                 false, false, 1);
   17662 
   17663     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
   17664     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   17665                        DAG.getConstant(5, dl, MVT::i32));
   17666     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
   17667                                 Addr, MachinePointerInfo(TrmpAddr, 5),
   17668                                 false, false, 1);
   17669 
   17670     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   17671                        DAG.getConstant(6, dl, MVT::i32));
   17672     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
   17673                                 MachinePointerInfo(TrmpAddr, 6),
   17674                                 false, false, 1);
   17675 
   17676     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   17677   }
   17678 }
   17679 
   17680 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   17681                                             SelectionDAG &DAG) const {
   17682   /*
   17683    The rounding mode is in bits 11:10 of FPSR, and has the following
   17684    settings:
   17685      00 Round to nearest
   17686      01 Round to -inf
   17687      10 Round to +inf
   17688      11 Round to 0
   17689 
   17690   FLT_ROUNDS, on the other hand, expects the following:
   17691     -1 Undefined
   17692      0 Round to 0
   17693      1 Round to nearest
   17694      2 Round to +inf
   17695      3 Round to -inf
   17696 
   17697   To perform the conversion, we do:
   17698     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
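
            As a check: FPSR bits 11:10 = 00 give ((0|0)+1)&3 = 1 (nearest),
            01 gives ((0|2)+1)&3 = 3 (-inf), 10 gives ((1|0)+1)&3 = 2 (+inf),
            and 11 gives ((1|2)+1)&3 = 0 (toward zero), matching the two tables above.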
   17699   */
   17700 
   17701   MachineFunction &MF = DAG.getMachineFunction();
   17702   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   17703   unsigned StackAlignment = TFI.getStackAlignment();
   17704   MVT VT = Op.getSimpleValueType();
   17705   SDLoc DL(Op);
   17706 
   17707   // Save FP Control Word to stack slot
   17708   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
   17709   SDValue StackSlot =
   17710       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
   17711 
   17712   MachineMemOperand *MMO =
   17713       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
   17714                               MachineMemOperand::MOStore, 2, 2);
   17715 
   17716   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
   17717   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
   17718                                           DAG.getVTList(MVT::Other),
   17719                                           Ops, MVT::i16, MMO);
   17720 
   17721   // Load FP Control Word from stack slot
   17722   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
   17723                             MachinePointerInfo(), false, false, false, 0);
   17724 
   17725   // Transform as necessary
   17726   SDValue CWD1 =
   17727     DAG.getNode(ISD::SRL, DL, MVT::i16,
   17728                 DAG.getNode(ISD::AND, DL, MVT::i16,
   17729                             CWD, DAG.getConstant(0x800, DL, MVT::i16)),
   17730                 DAG.getConstant(11, DL, MVT::i8));
   17731   SDValue CWD2 =
   17732     DAG.getNode(ISD::SRL, DL, MVT::i16,
   17733                 DAG.getNode(ISD::AND, DL, MVT::i16,
   17734                             CWD, DAG.getConstant(0x400, DL, MVT::i16)),
   17735                 DAG.getConstant(9, DL, MVT::i8));
   17736 
   17737   SDValue RetVal =
   17738     DAG.getNode(ISD::AND, DL, MVT::i16,
   17739                 DAG.getNode(ISD::ADD, DL, MVT::i16,
   17740                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
   17741                             DAG.getConstant(1, DL, MVT::i16)),
   17742                 DAG.getConstant(3, DL, MVT::i16));
   17743 
   17744   return DAG.getNode((VT.getSizeInBits() < 16 ?
   17745                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
   17746 }
   17747 
   17748 /// \brief Lower a vector CTLZ using the natively supported CTLZ instruction.
   17749 //
   17750 // 1. i32/i64 128/256-bit vectors (native support requires VLX) are extended
   17751 //    to a 512-bit vector.
   17752 // 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
   17753 //    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
   17754 //    split the vector, perform the operation on its Lo and Hi parts and
   17755 //    concatenate the results.
   17756 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
   17757   SDLoc dl(Op);
   17758   MVT VT = Op.getSimpleValueType();
   17759   MVT EltVT = VT.getVectorElementType();
   17760   unsigned NumElems = VT.getVectorNumElements();
   17761 
   17762   if (EltVT == MVT::i64 || EltVT == MVT::i32) {
   17763     // Extend to 512 bit vector.
   17764     assert((VT.is256BitVector() || VT.is128BitVector()) &&
   17765               "Unsupported value type for operation");
   17766 
   17767     MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
   17768     SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
   17769                                  DAG.getUNDEF(NewVT),
   17770                                  Op.getOperand(0),
   17771                                  DAG.getIntPtrConstant(0, dl));
   17772     SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
   17773 
   17774     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
   17775                        DAG.getIntPtrConstant(0, dl));
   17776   }
   17777 
   17778   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
   17779           "Unsupported element type");
   17780 
   17781   if (16 < NumElems) {
   17782     // Split the vector; its Lo and Hi parts will be handled in the next iteration.
   17783     SDValue Lo, Hi;
   17784     std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
   17785     MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
   17786 
   17787     Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo);
   17788     Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi);
   17789 
   17790     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
   17791   }
   17792 
   17793   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
   17794 
   17795   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
   17796           "Unsupported value type for operation");
   17797 
   17798   // Use native supported vector instruction vplzcntd.
   17799   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
   17800   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
   17801   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
   17802   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
   17803 
   17804   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
   17805 }
   17806 
   17807 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget,
   17808                          SelectionDAG &DAG) {
   17809   MVT VT = Op.getSimpleValueType();
   17810   MVT OpVT = VT;
   17811   unsigned NumBits = VT.getSizeInBits();
   17812   SDLoc dl(Op);
   17813 
   17814   if (VT.isVector() && Subtarget->hasAVX512())
   17815     return LowerVectorCTLZ_AVX512(Op, DAG);
   17816 
   17817   Op = Op.getOperand(0);
   17818   if (VT == MVT::i8) {
   17819     // Zero extend to i32 since there is not an i8 bsr.
   17820     OpVT = MVT::i32;
   17821     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   17822   }
   17823 
   17824   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
   17825   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   17826   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   17827 
   17828   // If src is zero (i.e. bsr sets ZF), returns NumBits.
   17829   SDValue Ops[] = {
   17830     Op,
   17831     DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
   17832     DAG.getConstant(X86::COND_E, dl, MVT::i8),
   17833     Op.getValue(1)
   17834   };
   17835   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
   17836 
   17837   // Finally xor with NumBits-1.
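           // NumBits is a power of two, so NumBits-1 covers every possible BSR result;
           // the XOR below therefore computes (NumBits-1) - BSR(x), i.e. CTLZ(x), and it
           // maps the zero-input CMOV value (2*NumBits-1) to NumBits.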
   17838   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
   17839                    DAG.getConstant(NumBits - 1, dl, OpVT));
   17840 
   17841   if (VT == MVT::i8)
   17842     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   17843   return Op;
   17844 }
   17845 
   17846 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget,
   17847                                     SelectionDAG &DAG) {
   17848   MVT VT = Op.getSimpleValueType();
   17849   EVT OpVT = VT;
   17850   unsigned NumBits = VT.getSizeInBits();
   17851   SDLoc dl(Op);
   17852 
   17853   if (VT.isVector() && Subtarget->hasAVX512())
   17854     return LowerVectorCTLZ_AVX512(Op, DAG);
   17855 
   17856   Op = Op.getOperand(0);
   17857   if (VT == MVT::i8) {
   17858     // Zero extend to i32 since there is not an i8 bsr.
   17859     OpVT = MVT::i32;
   17860     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   17861   }
   17862 
   17863   // Issue a bsr (scan bits in reverse).
   17864   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   17865   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   17866 
   17867   // And xor with NumBits-1.
   17868   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
   17869                    DAG.getConstant(NumBits - 1, dl, OpVT));
   17870 
   17871   if (VT == MVT::i8)
   17872     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   17873   return Op;
   17874 }
   17875 
   17876 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
   17877   MVT VT = Op.getSimpleValueType();
   17878   unsigned NumBits = VT.getScalarSizeInBits();
   17879   SDLoc dl(Op);
   17880 
   17881   if (VT.isVector()) {
   17882     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   17883 
   17884     SDValue N0 = Op.getOperand(0);
   17885     SDValue Zero = DAG.getConstant(0, dl, VT);
   17886 
   17887     // lsb(x) = (x & -x)
   17888     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
   17889                               DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
   17890 
   17891     // cttz_undef(x) = (width - 1) - ctlz(lsb)
   17892     if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
   17893         TLI.isOperationLegal(ISD::CTLZ, VT)) {
   17894       SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
   17895       return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
   17896                          DAG.getNode(ISD::CTLZ, dl, VT, LSB));
   17897     }
   17898 
   17899     // cttz(x) = ctpop(lsb - 1)
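             // For example, x = 0b01100 gives lsb = 0b00100, lsb - 1 = 0b00011, and
             // ctpop(0b00011) = 2 = cttz(x).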
   17900     SDValue One = DAG.getConstant(1, dl, VT);
   17901     return DAG.getNode(ISD::CTPOP, dl, VT,
   17902                        DAG.getNode(ISD::SUB, dl, VT, LSB, One));
   17903   }
   17904 
   17905   assert(Op.getOpcode() == ISD::CTTZ &&
   17906          "Only scalar CTTZ requires custom lowering");
   17907 
   17908   // Issue a bsf (scan bits forward) which also sets EFLAGS.
   17909   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   17910   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
   17911 
   17912   // If src is zero (i.e. bsf sets ZF), returns NumBits.
   17913   SDValue Ops[] = {
   17914     Op,
   17915     DAG.getConstant(NumBits, dl, VT),
   17916     DAG.getConstant(X86::COND_E, dl, MVT::i8),
   17917     Op.getValue(1)
   17918   };
   17919   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
   17920 }
   17921 
   17922 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
   17923 // ones, and then concatenate the result back.
   17924 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
   17925   MVT VT = Op.getSimpleValueType();
   17926 
   17927   assert(VT.is256BitVector() && VT.isInteger() &&
   17928          "Unsupported value type for operation");
   17929 
   17930   unsigned NumElems = VT.getVectorNumElements();
   17931   SDLoc dl(Op);
   17932 
   17933   // Extract the LHS vectors
   17934   SDValue LHS = Op.getOperand(0);
   17935   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
   17936   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
   17937 
   17938   // Extract the RHS vectors
   17939   SDValue RHS = Op.getOperand(1);
   17940   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
   17941   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
   17942 
   17943   MVT EltVT = VT.getVectorElementType();
   17944   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   17945 
   17946   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   17947                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
   17948                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
   17949 }
   17950 
   17951 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
   17952   if (Op.getValueType() == MVT::i1)
   17953     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
   17954                        Op.getOperand(0), Op.getOperand(1));
   17955   assert(Op.getSimpleValueType().is256BitVector() &&
   17956          Op.getSimpleValueType().isInteger() &&
   17957          "Only handle AVX 256-bit vector integer operation");
   17958   return Lower256IntArith(Op, DAG);
   17959 }
   17960 
   17961 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
   17962   if (Op.getValueType() == MVT::i1)
   17963     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
   17964                        Op.getOperand(0), Op.getOperand(1));
   17965   assert(Op.getSimpleValueType().is256BitVector() &&
   17966          Op.getSimpleValueType().isInteger() &&
   17967          "Only handle AVX 256-bit vector integer operation");
   17968   return Lower256IntArith(Op, DAG);
   17969 }
   17970 
   17971 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
   17972   assert(Op.getSimpleValueType().is256BitVector() &&
   17973          Op.getSimpleValueType().isInteger() &&
   17974          "Only handle AVX 256-bit vector integer operation");
   17975   return Lower256IntArith(Op, DAG);
   17976 }
   17977 
   17978 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   17979                         SelectionDAG &DAG) {
   17980   SDLoc dl(Op);
   17981   MVT VT = Op.getSimpleValueType();
   17982 
   17983   if (VT == MVT::i1)
   17984     return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
   17985 
   17986   // Decompose 256-bit ops into smaller 128-bit ops.
   17987   if (VT.is256BitVector() && !Subtarget->hasInt256())
   17988     return Lower256IntArith(Op, DAG);
   17989 
   17990   SDValue A = Op.getOperand(0);
   17991   SDValue B = Op.getOperand(1);
   17992 
   17993   // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector
   17994   // pairs, multiply and truncate.
   17995   if (VT == MVT::v16i8 || VT == MVT::v32i8) {
   17996     if (Subtarget->hasInt256()) {
   17997       if (VT == MVT::v32i8) {
   17998         MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2);
   17999         SDValue Lo = DAG.getIntPtrConstant(0, dl);
   18000         SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
   18001         SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo);
   18002         SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo);
   18003         SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi);
   18004         SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi);
   18005         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   18006                            DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo),
   18007                            DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi));
   18008       }
   18009 
   18010       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
   18011       return DAG.getNode(
   18012           ISD::TRUNCATE, dl, VT,
   18013           DAG.getNode(ISD::MUL, dl, ExVT,
   18014                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
   18015                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
   18016     }
   18017 
   18018     assert(VT == MVT::v16i8 &&
   18019            "Pre-AVX2 support only supports v16i8 multiplication");
   18020     MVT ExVT = MVT::v8i16;
   18021 
   18022     // Extract the lo parts and sign extend to i16
   18023     SDValue ALo, BLo;
   18024     if (Subtarget->hasSSE41()) {
   18025       ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
   18026       BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
   18027     } else {
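               // Without SSE4.1, sign extension is done by hand: the shuffle places each
               // byte into the high byte of a 16-bit lane (low byte undef), and the
               // arithmetic shift right by 8 then sign-extends it to i16.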
   18028       const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
   18029                               -1, 4, -1, 5, -1, 6, -1, 7};
   18030       ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   18031       BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   18032       ALo = DAG.getBitcast(ExVT, ALo);
   18033       BLo = DAG.getBitcast(ExVT, BLo);
   18034       ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
   18035       BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
   18036     }
   18037 
   18038     // Extract the hi parts and sign extend to i16
   18039     SDValue AHi, BHi;
   18040     if (Subtarget->hasSSE41()) {
   18041       const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
   18042                               -1, -1, -1, -1, -1, -1, -1, -1};
   18043       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   18044       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   18045       AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
   18046       BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
   18047     } else {
   18048       const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
   18049                               -1, 12, -1, 13, -1, 14, -1, 15};
   18050       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   18051       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   18052       AHi = DAG.getBitcast(ExVT, AHi);
   18053       BHi = DAG.getBitcast(ExVT, BHi);
   18054       AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
   18055       BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
   18056     }
   18057 
   18058     // Multiply, mask the lower 8bits of the lo/hi results and pack
   18059     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
   18060     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
   18061     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
   18062     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
   18063     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
   18064   }
   18065 
   18066   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
   18067   if (VT == MVT::v4i32) {
   18068     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
   18069            "Should not custom lower when pmuldq is available!");
   18070 
   18071     // Extract the odd parts.
   18072     static const int UnpackMask[] = { 1, -1, 3, -1 };
   18073     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
   18074     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
   18075 
   18076     // Multiply the even parts.
   18077     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
   18078     // Now multiply odd parts.
   18079     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
   18080 
   18081     Evens = DAG.getBitcast(VT, Evens);
   18082     Odds = DAG.getBitcast(VT, Odds);
   18083 
   18084     // Merge the two vectors back together with a shuffle. This expands into 2
   18085     // shuffles.
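             // After the bitcasts, Evens = <ae_lo, ae_hi, cg_lo, cg_hi> and
             // Odds = <bf_lo, bf_hi, dh_lo, dh_hi>, so picking elements {0, 4, 2, 6}
             // yields the low 32 bits of <ae, bf, cg, dh>, which is the v4i32 product.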
   18086     static const int ShufMask[] = { 0, 4, 2, 6 };
   18087     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
   18088   }
   18089 
   18090   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
   18091          "Only know how to lower V2I64/V4I64/V8I64 multiply");
   18092 
   18093   //  Ahi = psrlqi(a, 32);
   18094   //  Bhi = psrlqi(b, 32);
   18095   //
   18096   //  AloBlo = pmuludq(a, b);
   18097   //  AloBhi = pmuludq(a, Bhi);
   18098   //  AhiBlo = pmuludq(Ahi, b);
   18099 
   18100   //  AloBhi = psllqi(AloBhi, 32);
   18101   //  AhiBlo = psllqi(AhiBlo, 32);
   18102   //  return AloBlo + AloBhi + AhiBlo;
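           //
           //  This works because, writing a = 2^32*Ahi + Alo and b = 2^32*Bhi + Blo,
           //  a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*Ahi*Bhi, and the last
           //  term vanishes modulo 2^64.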
   18103 
   18104   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
   18105   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
   18106 
   18107   SDValue AhiBlo = Ahi;
   18108   SDValue AloBhi = Bhi;
   18109   // Bit cast to 32-bit vectors for MULUDQ
   18110   MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
   18111                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
   18112   A = DAG.getBitcast(MulVT, A);
   18113   B = DAG.getBitcast(MulVT, B);
   18114   Ahi = DAG.getBitcast(MulVT, Ahi);
   18115   Bhi = DAG.getBitcast(MulVT, Bhi);
   18116 
   18117   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
   18118   // After shifting constant values right, the result may be all zeros.
   18119   if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
   18120     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
   18121     AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
   18122   }
   18123   if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
   18124     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
   18125     AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
   18126   }
   18127 
   18128   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
   18129   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
   18130 }
   18131 
   18132 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
   18133   assert(Subtarget->isTargetWin64() && "Unexpected target");
   18134   EVT VT = Op.getValueType();
   18135   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
   18136          "Unexpected return type for lowering");
   18137 
   18138   RTLIB::Libcall LC;
   18139   bool isSigned;
   18140   switch (Op->getOpcode()) {
   18141   default: llvm_unreachable("Unexpected request for libcall!");
   18142   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
   18143   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
   18144   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
   18145   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
   18146   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
   18147   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
   18148   }
   18149 
   18150   SDLoc dl(Op);
   18151   SDValue InChain = DAG.getEntryNode();
   18152 
   18153   TargetLowering::ArgListTy Args;
   18154   TargetLowering::ArgListEntry Entry;
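           // Each i128 operand is spilled to a 16-byte-aligned stack slot and passed
           // to the libcall by pointer; the call's v2i64 result is bitcast back to the
           // original i128 type below.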
   18155   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
   18156     EVT ArgVT = Op->getOperand(i).getValueType();
   18157     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
   18158            "Unexpected argument type for lowering");
   18159     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
   18160     Entry.Node = StackPtr;
   18161     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
   18162                            false, false, 16);
   18163     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   18164     Entry.Ty = PointerType::get(ArgTy,0);
   18165     Entry.isSExt = false;
   18166     Entry.isZExt = false;
   18167     Args.push_back(Entry);
   18168   }
   18169 
   18170   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
   18171                                          getPointerTy(DAG.getDataLayout()));
   18172 
   18173   TargetLowering::CallLoweringInfo CLI(DAG);
   18174   CLI.setDebugLoc(dl).setChain(InChain)
   18175     .setCallee(getLibcallCallingConv(LC),
   18176                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
   18177                Callee, std::move(Args), 0)
   18178     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
   18179 
   18180   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
   18181   return DAG.getBitcast(VT, CallInfo.first);
   18182 }
   18183 
   18184 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
   18185                              SelectionDAG &DAG) {
   18186   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
   18187   MVT VT = Op0.getSimpleValueType();
   18188   SDLoc dl(Op);
   18189 
   18190   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
   18191          (VT == MVT::v8i32 && Subtarget->hasInt256()));
   18192 
   18193   // PMULxD operations multiply each even value (starting at 0) of LHS with
   18194   // the corresponding value of RHS and produce a widened result.
   18195   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
   18196   // => <2 x i64> <ae|cg>
   18197   //
   18198   // In other words, to have all the results, we need to perform two PMULxD:
   18199   // 1. one with the even values.
   18200   // 2. one with the odd values.
   18201   // To achieve #2, we need to place the odd values at an even position.
   18202   //
   18203   // Place the odd value at an even position (basically, shift all values 1
   18204   // step to the left):
   18205   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
   18206   // <a|b|c|d> => <b|undef|d|undef>
   18207   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
   18208   // <e|f|g|h> => <f|undef|h|undef>
   18209   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
   18210 
   18211   // Emit two multiplies, one for the even elements and one for the odd
   18212   // elements (which were moved into even positions above).
   18213   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
   18214   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
   18215   unsigned Opcode =
   18216       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
   18217   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
   18218   // => <2 x i64> <ae|cg>
   18219   SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
   18220   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
   18221   // => <2 x i64> <bf|dh>
   18222   SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
   18223 
   18224   // Shuffle it back into the right order.
   18225   SDValue Highs, Lows;
   18226   if (VT == MVT::v8i32) {
   18227     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
   18228     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
   18229     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
   18230     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   18231   } else {
   18232     const int HighMask[] = {1, 5, 3, 7};
   18233     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
   18234     const int LowMask[] = {0, 4, 2, 6};
   18235     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   18236   }
   18237 
   18238   // If we have a signed multiply but no PMULDQ, fix up the high parts of an
   18239   // unsigned multiply.
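           // hi_s(a,b) == hi_u(a,b) - (a < 0 ? b : 0) - (b < 0 ? a : 0), so build the
           // two correction terms with arithmetic shifts and ANDs and subtract them.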
   18240   if (IsSigned && !Subtarget->hasSSE41()) {
   18241     SDValue ShAmt = DAG.getConstant(
   18242         31, dl,
   18243         DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
   18244     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
   18245                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
   18246     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
   18247                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
   18248 
   18249     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
   18250     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
   18251   }
   18252 
   18253   // The first result of MUL_LOHI is actually the low value, followed by the
   18254   // high value.
   18255   SDValue Ops[] = {Lows, Highs};
   18256   return DAG.getMergeValues(Ops, dl);
   18257 }
   18258 
   18259 // Return true if the required (according to Opcode) shift-imm form is natively
   18260 // supported by the Subtarget
   18261 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
   18262                                         unsigned Opcode) {
   18263   if (VT.getScalarSizeInBits() < 16)
   18264     return false;
   18265 
   18266   if (VT.is512BitVector() &&
   18267       (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI()))
   18268     return true;
   18269 
   18270   bool LShift = VT.is128BitVector() ||
   18271     (VT.is256BitVector() && Subtarget->hasInt256());
   18272 
   18273   bool AShift = LShift && (Subtarget->hasVLX() ||
   18274     (VT != MVT::v2i64 && VT != MVT::v4i64));
   18275   return (Opcode == ISD::SRA) ? AShift : LShift;
   18276 }
   18277 
   18278 // The shift amount is a variable, but it is the same for all vector lanes.
   18279 // These instructions are defined together with shift-immediate.
   18280 static
   18281 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget,
   18282                                       unsigned Opcode) {
   18283   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
   18284 }
   18285 
   18286 // Return true if the required (according to Opcode) variable-shift form is
   18287 // natively supported by the Subtarget
   18288 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
   18289                                     unsigned Opcode) {
   18290 
   18291   if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16)
   18292     return false;
   18293 
   18294   // vXi16 is supported only on AVX-512 with BWI.
   18295   if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI())
   18296     return false;
   18297 
   18298   if (VT.is512BitVector() || Subtarget->hasVLX())
   18299     return true;
   18300 
   18301   bool LShift = VT.is128BitVector() || VT.is256BitVector();
   18302   bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
   18303   return (Opcode == ISD::SRA) ? AShift : LShift;
   18304 }
   18305 
   18306 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   18307                                          const X86Subtarget *Subtarget) {
   18308   MVT VT = Op.getSimpleValueType();
   18309   SDLoc dl(Op);
   18310   SDValue R = Op.getOperand(0);
   18311   SDValue Amt = Op.getOperand(1);
   18312 
   18313   unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
   18314     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
   18315 
   18316   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
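           // Emulate a 64-bit arithmetic shift right by splitting each i64 into i32
           // halves: SRA/SRL the halves separately and shuffle the pieces back
           // together (there is no 64-bit vector SRA instruction available here).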
   18317     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
   18318     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
   18319     SDValue Ex = DAG.getBitcast(ExVT, R);
   18320 
   18321     if (ShiftAmt >= 32) {
   18322       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
   18323       SDValue Upper =
   18324           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
   18325       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
   18326                                                  ShiftAmt - 32, DAG);
   18327       if (VT == MVT::v2i64)
   18328         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
   18329       if (VT == MVT::v4i64)
   18330         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
   18331                                   {9, 1, 11, 3, 13, 5, 15, 7});
   18332     } else {
   18333       // SRA the upper i32, SRL the whole i64 and select the lower i32.
   18334       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
   18335                                                  ShiftAmt, DAG);
   18336       SDValue Lower =
   18337           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
   18338       Lower = DAG.getBitcast(ExVT, Lower);
   18339       if (VT == MVT::v2i64)
   18340         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
   18341       if (VT == MVT::v4i64)
   18342         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
   18343                                   {8, 1, 10, 3, 12, 5, 14, 7});
   18344     }
   18345     return DAG.getBitcast(VT, Ex);
   18346   };
   18347 
   18348   // Optimize shl/srl/sra with constant shift amount.
   18349   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
   18350     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
   18351       uint64_t ShiftAmt = ShiftConst->getZExtValue();
   18352 
   18353       if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
   18354         return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
   18355 
   18356       // i64 SRA needs to be performed as partial shifts.
   18357       if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
   18358           Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP())
   18359         return ArithmeticShiftRight64(ShiftAmt);
   18360 
   18361       if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
   18362         unsigned NumElts = VT.getVectorNumElements();
   18363         MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
   18364 
   18365         // Simple i8 add case
   18366         if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
   18367           return DAG.getNode(ISD::ADD, dl, VT, R, R);
   18368 
   18369         // ashr(R, 7)  === cmp_slt(R, 0)
   18370         if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
   18371           SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   18372           return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
   18373         }
   18374 
   18375         // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
   18376         if (VT == MVT::v16i8 && Subtarget->hasXOP())
   18377           return SDValue();
   18378 
   18379         if (Op.getOpcode() == ISD::SHL) {
   18380           // Make a large shift.
   18381           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
   18382                                                    R, ShiftAmt, DAG);
   18383           SHL = DAG.getBitcast(VT, SHL);
   18384           // Zero out the rightmost bits.
   18385           SmallVector<SDValue, 32> V(
   18386               NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8));
   18387           return DAG.getNode(ISD::AND, dl, VT, SHL,
   18388                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
   18389         }
   18390         if (Op.getOpcode() == ISD::SRL) {
   18391           // Make a large shift.
   18392           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
   18393                                                    R, ShiftAmt, DAG);
   18394           SRL = DAG.getBitcast(VT, SRL);
   18395           // Zero out the leftmost bits.
   18396           SmallVector<SDValue, 32> V(
   18397               NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8));
   18398           return DAG.getNode(ISD::AND, dl, VT, SRL,
   18399                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
   18400         }
   18401         if (Op.getOpcode() == ISD::SRA) {
   18402           // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
   18403           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   18404           SmallVector<SDValue, 32> V(NumElts,
   18405                                      DAG.getConstant(128 >> ShiftAmt, dl,
   18406                                                      MVT::i8));
   18407           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
   18408           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
   18409           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
   18410           return Res;
   18411         }
   18412         llvm_unreachable("Unknown shift opcode.");
   18413       }
   18414     }
   18415   }
   18416 
   18417   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   18418   if (!Subtarget->is64Bit() && !Subtarget->hasXOP() &&
   18419       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) {
   18420 
   18421     // Peek through any splat that was introduced for i64 shift vectorization.
   18422     int SplatIndex = -1;
   18423     if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
   18424       if (SVN->isSplat()) {
   18425         SplatIndex = SVN->getSplatIndex();
   18426         Amt = Amt.getOperand(0);
   18427         assert(SplatIndex < (int)VT.getVectorNumElements() &&
   18428                "Splat shuffle referencing second operand");
   18429       }
   18430 
   18431     if (Amt.getOpcode() != ISD::BITCAST ||
   18432         Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
   18433       return SDValue();
   18434 
   18435     Amt = Amt.getOperand(0);
   18436     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
   18437                      VT.getVectorNumElements();
   18438     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
   18439     uint64_t ShiftAmt = 0;
   18440     unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
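             // Reassemble the 64-bit shift amount from Ratio consecutive build_vector
             // elements; element i contributes its bits at position i * (64 / Ratio).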
   18441     for (unsigned i = 0; i != Ratio; ++i) {
   18442       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
   18443       if (!C)
   18444         return SDValue();
   18445       // 6 == Log2(64)
   18446       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
   18447     }
   18448 
   18449     // Check remaining shift amounts (if not a splat).
   18450     if (SplatIndex < 0) {
   18451       for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
   18452         uint64_t ShAmt = 0;
   18453         for (unsigned j = 0; j != Ratio; ++j) {
   18454           ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
   18455           if (!C)
   18456             return SDValue();
   18457           // 6 == Log2(64)
   18458           ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
   18459         }
   18460         if (ShAmt != ShiftAmt)
   18461           return SDValue();
   18462       }
   18463     }
   18464 
   18465     if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
   18466       return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
   18467 
   18468     if (Op.getOpcode() == ISD::SRA)
   18469       return ArithmeticShiftRight64(ShiftAmt);
   18470   }
   18471 
   18472   return SDValue();
   18473 }
   18474 
   18475 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   18476                                         const X86Subtarget* Subtarget) {
   18477   MVT VT = Op.getSimpleValueType();
   18478   SDLoc dl(Op);
   18479   SDValue R = Op.getOperand(0);
   18480   SDValue Amt = Op.getOperand(1);
   18481 
   18482   unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
   18483     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
   18484 
   18485   unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
   18486     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
   18487 
   18488   if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
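             // Look for a single scalar shift amount shared by all lanes (a splat); if
             // one is found, the whole vector can be shifted by that single count.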
   18489     SDValue BaseShAmt;
   18490     MVT EltVT = VT.getVectorElementType();
   18491 
   18492     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
   18493       // Check if this build_vector node is doing a splat.
   18494       // If so, then set BaseShAmt equal to the splat value.
   18495       BaseShAmt = BV->getSplatValue();
   18496       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
   18497         BaseShAmt = SDValue();
   18498     } else {
   18499       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
   18500         Amt = Amt.getOperand(0);
   18501 
   18502       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
   18503       if (SVN && SVN->isSplat()) {
   18504         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
   18505         SDValue InVec = Amt.getOperand(0);
   18506         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   18507           assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
   18508                  "Unexpected shuffle index found!");
   18509           BaseShAmt = InVec.getOperand(SplatIdx);
   18510         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
   18511            if (ConstantSDNode *C =
   18512                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
   18513              if (C->getZExtValue() == SplatIdx)
   18514                BaseShAmt = InVec.getOperand(1);
   18515            }
   18516         }
   18517 
   18518         if (!BaseShAmt)
   18519           // Avoid introducing an extract element from a shuffle.
   18520           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
   18521                                   DAG.getIntPtrConstant(SplatIdx, dl));
   18522       }
   18523     }
   18524 
   18525     if (BaseShAmt.getNode()) {
   18526       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
   18527       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
   18528         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
   18529       else if (EltVT.bitsLT(MVT::i32))
   18530         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
   18531 
   18532       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
   18533     }
   18534   }
   18535 
   18536   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   18537   if (!Subtarget->is64Bit() && VT == MVT::v2i64  &&
   18538       Amt.getOpcode() == ISD::BITCAST &&
   18539       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   18540     Amt = Amt.getOperand(0);
   18541     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
   18542                      VT.getVectorNumElements();
   18543     std::vector<SDValue> Vals(Ratio);
   18544     for (unsigned i = 0; i != Ratio; ++i)
   18545       Vals[i] = Amt.getOperand(i);
   18546     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
   18547       for (unsigned j = 0; j != Ratio; ++j)
   18548         if (Vals[j] != Amt.getOperand(i + j))
   18549           return SDValue();
   18550     }
   18551 
   18552     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
   18553       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
   18554   }
   18555   return SDValue();
   18556 }
   18557 
   18558 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
   18559                           SelectionDAG &DAG) {
   18560   MVT VT = Op.getSimpleValueType();
   18561   SDLoc dl(Op);
   18562   SDValue R = Op.getOperand(0);
   18563   SDValue Amt = Op.getOperand(1);
   18564 
   18565   assert(VT.isVector() && "Custom lowering only for vector shifts!");
   18566   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
   18567 
   18568   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
   18569     return V;
   18570 
   18571   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
   18572     return V;
   18573 
   18574   if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
   18575     return Op;
   18576 
   18577   // XOP has 128-bit variable logical/arithmetic shifts.
   18578   // +ve/-ve Amt = shift left/right.
   18579   if (Subtarget->hasXOP() &&
   18580       (VT == MVT::v2i64 || VT == MVT::v4i32 ||
   18581        VT == MVT::v8i16 || VT == MVT::v16i8)) {
   18582     if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
   18583       SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
   18584       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
   18585     }
   18586     if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
   18587       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
   18588     if (Op.getOpcode() == ISD::SRA)
   18589       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
   18590   }
   18591 
   18592   // v2i64 vector logical shifts can efficiently avoid scalarization - do the
   18593   // shifts per-lane and then shuffle the partial results back together.
   18594   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
   18595     // Splat the shift amounts so the scalar shifts above will catch it.
   18596     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
   18597     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
   18598     SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
   18599     SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
   18600     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
   18601   }
   18602 
   18603   // i64 vector arithmetic shift can be emulated with the transform:
   18604   // M = lshr(SIGN_BIT, Amt)
   18605   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
   18606   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) &&
   18607       Op.getOpcode() == ISD::SRA) {
   18608     SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
   18609     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
   18610     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   18611     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
   18612     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
   18613     return R;
   18614   }
   18615 
   18616   // If possible, lower this packed shift into a vector multiply instead of
   18617   // expanding it into a sequence of scalar shifts.
   18618   // Do this only if the vector shift count is a constant build_vector.
   18619   if (Op.getOpcode() == ISD::SHL &&
   18620       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
   18621        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
   18622       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
   18623     SmallVector<SDValue, 8> Elts;
   18624     MVT SVT = VT.getVectorElementType();
   18625     unsigned SVTBits = SVT.getSizeInBits();
   18626     APInt One(SVTBits, 1);
   18627     unsigned NumElems = VT.getVectorNumElements();
   18628 
   18629     for (unsigned i=0; i !=NumElems; ++i) {
   18630       SDValue Op = Amt->getOperand(i);
   18631       if (Op->getOpcode() == ISD::UNDEF) {
   18632         Elts.push_back(Op);
   18633         continue;
   18634       }
   18635 
   18636       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
   18637       APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
   18638       uint64_t ShAmt = C.getZExtValue();
   18639       if (ShAmt >= SVTBits) {
   18640         Elts.push_back(DAG.getUNDEF(SVT));
   18641         continue;
   18642       }
   18643       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
   18644     }
   18645     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
   18646     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
   18647   }
   18648 
   18649   // Lower SHL with variable shift amount.
   18650   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
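             // Shift each amount into the IEEE-754 exponent field (bit 23), add the bit
             // pattern of 1.0f (0x3f800000) to bias it, and convert back to integer:
             // each lane then holds 2^amt, so the variable shift becomes a multiply.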
   18651     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
   18652 
   18653     Op = DAG.getNode(ISD::ADD, dl, VT, Op,
   18654                      DAG.getConstant(0x3f800000U, dl, VT));
   18655     Op = DAG.getBitcast(MVT::v4f32, Op);
   18656     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
   18657     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   18658   }
   18659 
   18660   // If possible, lower this shift as a sequence of two shifts by
   18661   // constant plus a MOVSS/MOVSD instead of scalarizing it.
   18662   // Example:
   18663   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
   18664   //
   18665   // Could be rewritten as:
   18666   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
   18667   //
   18668   // The advantage is that the two shifts from the example would be
   18669   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
   18670   // the vector shift into four scalar shifts plus four pairs of vector
   18671   // insert/extract.
   18672   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
   18673       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
   18674     unsigned TargetOpcode = X86ISD::MOVSS;
   18675     bool CanBeSimplified;
   18676     // The splat value for the first packed shift (the 'X' from the example).
   18677     SDValue Amt1 = Amt->getOperand(0);
   18678     // The splat value for the second packed shift (the 'Y' from the example).
   18679     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
   18680                                         Amt->getOperand(2);
   18681 
   18682     // See if it is possible to replace this node with a sequence of
   18683     // two shifts followed by a MOVSS/MOVSD
   18684     if (VT == MVT::v4i32) {
   18685       // Check if it is legal to use a MOVSS.
   18686       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
   18687                         Amt2 == Amt->getOperand(3);
   18688       if (!CanBeSimplified) {
   18689         // Otherwise, check if we can still simplify this node using a MOVSD.
   18690         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
   18691                           Amt->getOperand(2) == Amt->getOperand(3);
   18692         TargetOpcode = X86ISD::MOVSD;
   18693         Amt2 = Amt->getOperand(2);
   18694       }
   18695     } else {
   18696       // Do similar checks for the case where the machine value type
   18697       // is MVT::v8i16.
   18698       CanBeSimplified = Amt1 == Amt->getOperand(1);
   18699       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
   18700         CanBeSimplified = Amt2 == Amt->getOperand(i);
   18701 
   18702       if (!CanBeSimplified) {
   18703         TargetOpcode = X86ISD::MOVSD;
   18704         CanBeSimplified = true;
   18705         Amt2 = Amt->getOperand(4);
   18706         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
   18707           CanBeSimplified = Amt1 == Amt->getOperand(i);
   18708         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
   18709           CanBeSimplified = Amt2 == Amt->getOperand(j);
   18710       }
   18711     }
   18712 
   18713     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
   18714         isa<ConstantSDNode>(Amt2)) {
   18715       // Replace this node with two shifts followed by a MOVSS/MOVSD.
   18716       MVT CastVT = MVT::v4i32;
   18717       SDValue Splat1 =
   18718         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
   18719       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
   18720       SDValue Splat2 =
   18721         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
   18722       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
   18723       if (TargetOpcode == X86ISD::MOVSD)
   18724         CastVT = MVT::v2i64;
   18725       SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
   18726       SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
   18727       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
   18728                                             BitCast1, DAG);
   18729       return DAG.getBitcast(VT, Result);
   18730     }
   18731   }
   18732 
   18733   // v4i32 non-uniform shifts.
   18734   // If the shift amount is constant we can shift each lane using the SSE2
   18735   // immediate shifts; otherwise we need to zero-extend each lane to the lower
   18736   // i64 and shift using the SSE2 variable shifts.
   18737   // The separate results can then be blended together.
   18738   if (VT == MVT::v4i32) {
   18739     unsigned Opc = Op.getOpcode();
   18740     SDValue Amt0, Amt1, Amt2, Amt3;
   18741     if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
   18742       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
   18743       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
   18744       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
   18745       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
   18746     } else {
   18747       // ISD::SHL is handled above but we include it here for completeness.
   18748       switch (Opc) {
   18749       default:
   18750         llvm_unreachable("Unknown target vector shift node");
   18751       case ISD::SHL:
   18752         Opc = X86ISD::VSHL;
   18753         break;
   18754       case ISD::SRL:
   18755         Opc = X86ISD::VSRL;
   18756         break;
   18757       case ISD::SRA:
   18758         Opc = X86ISD::VSRA;
   18759         break;
   18760       }
   18761       // The SSE2 shifts use the lower i64 as the shift amount for all lanes;
   18762       // the upper i64 is ignored. These shuffle masks optimally zero-extend
   18763       // each lane on SSE2/SSE41/AVX targets.
   18764       SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
   18765       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
   18766       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
   18767       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
   18768       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
   18769     }
   18770 
   18771     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
   18772     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
   18773     SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
   18774     SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
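             // Blend the per-lane results back together: lane 0 from R0, lane 1 from R1,
             // lane 2 from R2 and lane 3 from R3.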
   18775     SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
   18776     SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
   18777     return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
   18778   }
   18779 
   18780   if (VT == MVT::v16i8 ||
   18781       (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) {
   18782     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
   18783     unsigned ShiftOpcode = Op->getOpcode();
   18784 
   18785     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
   18786       // On SSE41 targets we make use of the fact that VSELECT lowers
   18787       // to PBLENDVB which selects bytes based just on the sign bit.
   18788       if (Subtarget->hasSSE41()) {
   18789         V0 = DAG.getBitcast(VT, V0);
   18790         V1 = DAG.getBitcast(VT, V1);
   18791         Sel = DAG.getBitcast(VT, Sel);
   18792         return DAG.getBitcast(SelVT,
   18793                               DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
   18794       }
   18795       // On pre-SSE41 targets we test for the sign bit by comparing to
   18796       // zero - a negative value will set all bits of the lanes to true
   18797       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
   18798       SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
   18799       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
   18800       return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
   18801     };
   18802 
   18803     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
   18804     // We can safely do this using i16 shifts as we're only interested in
   18805     // the 3 lower bits of each byte.
   18806     Amt = DAG.getBitcast(ExtVT, Amt);
   18807     Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
   18808     Amt = DAG.getBitcast(VT, Amt);
   18809 
   18810     if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
   18811       // r = VSELECT(r, shift(r, 4), a);
   18812       SDValue M =
   18813           DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
   18814       R = SignBitSelect(VT, Amt, M, R);
   18815 
   18816       // a += a
   18817       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   18818 
   18819       // r = VSELECT(r, shift(r, 2), a);
   18820       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
   18821       R = SignBitSelect(VT, Amt, M, R);
   18822 
   18823       // a += a
   18824       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   18825 
   18826       // return VSELECT(r, shift(r, 1), a);
   18827       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
   18828       R = SignBitSelect(VT, Amt, M, R);
   18829       return R;
   18830     }
   18831 
   18832     if (Op->getOpcode() == ISD::SRA) {
   18833       // For SRA we need to unpack each byte into the high byte of an i16 vector
   18834       // so we can correctly sign-extend. We don't care what happens to the
   18835       // lower byte.
   18836       SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
   18837       SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
   18838       SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
   18839       SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
   18840       ALo = DAG.getBitcast(ExtVT, ALo);
   18841       AHi = DAG.getBitcast(ExtVT, AHi);
   18842       RLo = DAG.getBitcast(ExtVT, RLo);
   18843       RHi = DAG.getBitcast(ExtVT, RHi);
   18844 
   18845       // r = VSELECT(r, shift(r, 4), a);
   18846       SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
   18847                                 DAG.getConstant(4, dl, ExtVT));
   18848       SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
   18849                                 DAG.getConstant(4, dl, ExtVT));
   18850       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
   18851       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
   18852 
   18853       // a += a
   18854       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
   18855       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
   18856 
   18857       // r = VSELECT(r, shift(r, 2), a);
   18858       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
   18859                         DAG.getConstant(2, dl, ExtVT));
   18860       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
   18861                         DAG.getConstant(2, dl, ExtVT));
   18862       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
   18863       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
   18864 
   18865       // a += a
   18866       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
   18867       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
   18868 
   18869       // r = VSELECT(r, shift(r, 1), a);
   18870       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
   18871                         DAG.getConstant(1, dl, ExtVT));
   18872       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
   18873                         DAG.getConstant(1, dl, ExtVT));
   18874       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
   18875       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
   18876 
   18877       // Logical shift the result back to the lower byte, leaving a zero upper
   18878       // byte, meaning that we can safely pack the two halves together with
   18879       // PACKUSWB.
   18880       RLo =
   18881           DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
   18882       RHi =
   18883           DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
   18884       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
   18885     }
   18886   }
   18887 
   18888   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
   18889   // the extra overhead of getting from v16i8 to v8i32 makes the existing SSE
   18890   // solution better.
   18891   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
   18892     MVT ExtVT = MVT::v8i32;
   18893     unsigned ExtOpc =
   18894         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
   18895     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
   18896     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
   18897     return DAG.getNode(ISD::TRUNCATE, dl, VT,
   18898                        DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
   18899   }
   18900 
   18901   if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) {
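             // Widen to v8i32: interleave the shift amounts with zero (zero-extending
             // them) and each source lane with itself, shift at i32 granularity, then
             // shift the i32 results right by 16 and pack the halves back to v16i16.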
   18902     MVT ExtVT = MVT::v8i32;
   18903     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
   18904     SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
   18905     SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
   18906     SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
   18907     SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
   18908     ALo = DAG.getBitcast(ExtVT, ALo);
   18909     AHi = DAG.getBitcast(ExtVT, AHi);
   18910     RLo = DAG.getBitcast(ExtVT, RLo);
   18911     RHi = DAG.getBitcast(ExtVT, RHi);
   18912     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
   18913     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
   18914     Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
   18915     Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
   18916     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
   18917   }
   18918 
   18919   if (VT == MVT::v8i16) {
   18920     unsigned ShiftOpcode = Op->getOpcode();
   18921 
   18922     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
   18923       // On SSE41 targets we make use of the fact that VSELECT lowers
   18924       // to PBLENDVB which selects bytes based just on the sign bit.
   18925       if (Subtarget->hasSSE41()) {
   18926         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
   18927         V0 = DAG.getBitcast(ExtVT, V0);
   18928         V1 = DAG.getBitcast(ExtVT, V1);
   18929         Sel = DAG.getBitcast(ExtVT, Sel);
   18930         return DAG.getBitcast(
   18931             VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
   18932       }
   18933       // On pre-SSE41 targets we splat the sign bit - a negative value will
   18934       // set all bits of the lanes to true and VSELECT uses that in
   18935       // its OR(AND(V0,C),AND(V1,~C)) lowering.
   18936       SDValue C =
   18937           DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
   18938       return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
   18939     };
   18940 
   18941     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
   18942     if (Subtarget->hasSSE41()) {
   18943       // On SSE41 targets we need to replicate the shift mask in both
   18944       // bytes for PBLENDVB.
   18945       Amt = DAG.getNode(
   18946           ISD::OR, dl, VT,
   18947           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
   18948           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
   18949     } else {
   18950       Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
   18951     }
   18952 
   18953     // r = VSELECT(r, shift(r, 8), a);
   18954     SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
   18955     R = SignBitSelect(Amt, M, R);
   18956 
   18957     // a += a
   18958     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   18959 
   18960     // r = VSELECT(r, shift(r, 4), a);
   18961     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
   18962     R = SignBitSelect(Amt, M, R);
   18963 
   18964     // a += a
   18965     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   18966 
   18967     // r = VSELECT(r, shift(r, 2), a);
   18968     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
   18969     R = SignBitSelect(Amt, M, R);
   18970 
   18971     // a += a
   18972     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   18973 
   18974     // return VSELECT(r, shift(r, 1), a);
   18975     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
   18976     R = SignBitSelect(Amt, M, R);
   18977     return R;
   18978   }
   18979 
   18980   // Decompose 256-bit shifts into smaller 128-bit shifts.
   18981   if (VT.is256BitVector()) {
   18982     unsigned NumElems = VT.getVectorNumElements();
   18983     MVT EltVT = VT.getVectorElementType();
   18984     MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   18985 
   18986     // Extract the two vectors
   18987     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
   18988     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
   18989 
   18990     // Recreate the shift amount vectors
   18991     SDValue Amt1, Amt2;
   18992     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
   18993       // Constant shift amount
   18994       SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
   18995       ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
   18996       ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
   18997 
   18998       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
   18999       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
   19000     } else {
   19001       // Variable shift amount
   19002       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
   19003       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
   19004     }
   19005 
   19006     // Issue new vector shifts for the smaller types
   19007     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
   19008     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
   19009 
   19010     // Concatenate the result back
   19011     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
   19012   }
   19013 
   19014   return SDValue();
   19015 }
   19016 
   19017 static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
   19018                            SelectionDAG &DAG) {
   19019   MVT VT = Op.getSimpleValueType();
   19020   SDLoc DL(Op);
   19021   SDValue R = Op.getOperand(0);
   19022   SDValue Amt = Op.getOperand(1);
   19023 
   19024   assert(VT.isVector() && "Custom lowering only for vector rotates!");
   19025   assert(Subtarget->hasXOP() && "XOP support required for vector rotates!");
   19026   assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
   19027 
   19028   // XOP has 128-bit vector variable + immediate rotates.
   19029   // +ve/-ve Amt = rotate left/right.
   19030 
   19031   // Split 256-bit integers.
   19032   if (VT.is256BitVector())
   19033     return Lower256IntArith(Op, DAG);
   19034 
   19035   assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
   19036 
   19037   // Attempt to rotate by immediate.
   19038   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
   19039     if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
   19040       uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
   19041       assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
   19042       return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
   19043                          DAG.getConstant(RotateAmt, DL, MVT::i8));
   19044     }
   19045   }
   19046 
   19047   // Use general rotate by variable (per-element).
   19048   return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
   19049 }
   19050 
   19051 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
   19052   // Lower the "add/sub/mul with overflow" instruction into a regular
   19053   // instruction plus a "setcc" instruction that checks the overflow flag. The
   19054   // "brcond" lowering looks for this combo and may remove the "setcc"
   19055   // instruction if the "setcc" has only one use.
   19056   SDNode *N = Op.getNode();
   19057   SDValue LHS = N->getOperand(0);
   19058   SDValue RHS = N->getOperand(1);
   19059   unsigned BaseOp = 0;
   19060   unsigned Cond = 0;
   19061   SDLoc DL(Op);
   19062   switch (Op.getOpcode()) {
   19063   default: llvm_unreachable("Unknown ovf instruction!");
   19064   case ISD::SADDO:
   19065     // An add of one will be selected as an INC. Note that INC doesn't
   19066     // set CF, so we can't do this for UADDO.
   19067     if (isOneConstant(RHS)) {
   19068       BaseOp = X86ISD::INC;
   19069       Cond = X86::COND_O;
   19070       break;
   19071     }
   19072     BaseOp = X86ISD::ADD;
   19073     Cond = X86::COND_O;
   19074     break;
   19075   case ISD::UADDO:
   19076     BaseOp = X86ISD::ADD;
   19077     Cond = X86::COND_B;
   19078     break;
   19079   case ISD::SSUBO:
   19080     // A subtract of one will be selected as a DEC. Note that DEC doesn't
   19081     // set CF, so we can't do this for USUBO.
   19082     if (isOneConstant(RHS)) {
   19083       BaseOp = X86ISD::DEC;
   19084       Cond = X86::COND_O;
   19085       break;
   19086     }
   19087     BaseOp = X86ISD::SUB;
   19088     Cond = X86::COND_O;
   19089     break;
   19090   case ISD::USUBO:
   19091     BaseOp = X86ISD::SUB;
   19092     Cond = X86::COND_B;
   19093     break;
   19094   case ISD::SMULO:
   19095     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
   19096     Cond = X86::COND_O;
   19097     break;
   19098   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
   19099     if (N->getValueType(0) == MVT::i8) {
   19100       BaseOp = X86ISD::UMUL8;
   19101       Cond = X86::COND_O;
   19102       break;
   19103     }
   19104     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
   19105                                  MVT::i32);
   19106     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
   19107 
   19108     SDValue SetCC =
   19109       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   19110                   DAG.getConstant(X86::COND_O, DL, MVT::i32),
   19111                   SDValue(Sum.getNode(), 2));
   19112 
   19113     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   19114   }
   19115   }
   19116 
   19117   // Also sets EFLAGS.
   19118   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
   19119   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
   19120 
   19121   SDValue SetCC =
   19122     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
   19123                 DAG.getConstant(Cond, DL, MVT::i32),
   19124                 SDValue(Sum.getNode(), 1));
   19125 
   19126   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   19127 }
   19128 
   19129 /// Returns true if the operand type is exactly twice the native width, and
   19130 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
   19131 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
   19132 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
   19133 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
   19134   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
   19135 
   19136   if (OpWidth == 64)
   19137     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
   19138   else if (OpWidth == 128)
   19139     return Subtarget->hasCmpxchg16b();
   19140   else
   19141     return false;
   19142 }
   19143 
   19144 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   19145   return needsCmpXchgNb(SI->getValueOperand()->getType());
   19146 }
   19147 
   19148 // Note: this turns large loads into lock cmpxchg8b/16b.
   19149 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
   19150 TargetLowering::AtomicExpansionKind
   19151 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   19152   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
   19153   return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
   19154                                                : AtomicExpansionKind::None;
   19155 }
   19156 
   19157 TargetLowering::AtomicExpansionKind
   19158 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   19159   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
   19160   Type *MemType = AI->getType();
   19161 
   19162   // If the operand is too big, we must see if cmpxchg8/16b is available
   19163   // and default to library calls otherwise.
   19164   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
   19165     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
   19166                                    : AtomicExpansionKind::None;
   19167   }
   19168 
   19169   AtomicRMWInst::BinOp Op = AI->getOperation();
   19170   switch (Op) {
   19171   default:
   19172     llvm_unreachable("Unknown atomic operation");
   19173   case AtomicRMWInst::Xchg:
   19174   case AtomicRMWInst::Add:
   19175   case AtomicRMWInst::Sub:
   19176     // It's better to use xadd, xchg or a locked sub for these in all cases.
   19177     return AtomicExpansionKind::None;
   19178   case AtomicRMWInst::Or:
   19179   case AtomicRMWInst::And:
   19180   case AtomicRMWInst::Xor:
   19181     // If the atomicrmw's result isn't actually used, we can just add a "lock"
   19182     // prefix to a normal instruction for these operations.
   19183     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
   19184                             : AtomicExpansionKind::None;
   19185   case AtomicRMWInst::Nand:
   19186   case AtomicRMWInst::Max:
   19187   case AtomicRMWInst::Min:
   19188   case AtomicRMWInst::UMax:
   19189   case AtomicRMWInst::UMin:
   19190     // These always require a non-trivial set of data operations on x86. We must
   19191     // use a cmpxchg loop.
   19192     return AtomicExpansionKind::CmpXChg;
   19193   }
   19194 }
   19195 
   19196 static bool hasMFENCE(const X86Subtarget& Subtarget) {
   19197   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
   19198   // no-sse2). There isn't any reason to disable it if the target processor
   19199   // supports it.
   19200   return Subtarget.hasSSE2() || Subtarget.is64Bit();
   19201 }
   19202 
   19203 LoadInst *
   19204 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   19205   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
   19206   Type *MemType = AI->getType();
   19207   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
   19208   // there is no benefit in turning such RMWs into loads, and it is actually
   19209   // harmful as it introduces an mfence.
   19210   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
   19211     return nullptr;
   19212 
   19213   auto Builder = IRBuilder<>(AI);
   19214   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   19215   auto SynchScope = AI->getSynchScope();
   19216   // We must restrict the ordering to avoid generating loads with Release or
   19217   // ReleaseAcquire orderings.
   19218   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
   19219   auto Ptr = AI->getPointerOperand();
   19220 
   19221   // Before the load we need a fence. Here is an example lifted from
   19222   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
   19223   // is required:
   19224   // Thread 0:
   19225   //   x.store(1, relaxed);
   19226   //   r1 = y.fetch_add(0, release);
   19227   // Thread 1:
   19228   //   y.fetch_add(42, acquire);
   19229   //   r2 = x.load(relaxed);
   19230   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
   19231   // lowered to just a load without a fence. An mfence flushes the store buffer,
   19232   // making the optimization clearly correct.
   19233   // FIXME: it is required if isAtLeastRelease(Order), but it is not clear
   19234   // whether it is needed otherwise; we might be able to be more aggressive on
   19235   // relaxed idempotent rmw. In practice, they do not look useful, so we don't
   19236   // try to be especially clever.
   19237   if (SynchScope == SingleThread)
   19238     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
   19239     // the IR level, so we must wrap it in an intrinsic.
   19240     return nullptr;
   19241 
   19242   if (!hasMFENCE(*Subtarget))
   19243     // FIXME: it might make sense to use a locked operation here but on a
   19244     // different cache-line to prevent cache-line bouncing. In practice it
   19245     // is probably a small win, and x86 processors without mfence are rare
   19246     // enough that we do not bother.
   19247     return nullptr;
   19248 
   19249   Function *MFence =
   19250       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
   19251   Builder.CreateCall(MFence, {});
   19252 
   19253   // Finally we can emit the atomic load.
   19254   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
   19255           AI->getType()->getPrimitiveSizeInBits());
   19256   Loaded->setAtomic(Order, SynchScope);
   19257   AI->replaceAllUsesWith(Loaded);
   19258   AI->eraseFromParent();
   19259   return Loaded;
   19260 }
   19261 
   19262 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
   19263                                  SelectionDAG &DAG) {
   19264   SDLoc dl(Op);
   19265   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
   19266     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
   19267   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
   19268     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
   19269 
   19270   // The only fence that needs an instruction is a sequentially-consistent
   19271   // cross-thread fence.
   19272   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
   19273     if (hasMFENCE(*Subtarget))
   19274       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
   19275 
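             // No MFENCE available: fall back to a locked OR of zero to the top of the
             // stack, which acts as a full memory barrier on x86.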
   19276     SDValue Chain = Op.getOperand(0);
   19277     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
   19278     SDValue Ops[] = {
   19279       DAG.getRegister(X86::ESP, MVT::i32),     // Base
   19280       DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
   19281       DAG.getRegister(0, MVT::i32),            // Index
   19282       DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
   19283       DAG.getRegister(0, MVT::i32),            // Segment.
   19284       Zero,
   19285       Chain
   19286     };
   19287     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
   19288     return SDValue(Res, 0);
   19289   }
   19290 
   19291   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
   19292   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
   19293 }
   19294 
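         // Lower ISD::ATOMIC_CMP_SWAP: pin the expected value in the width-appropriate
         // accumulator register (AL/AX/EAX/RAX), emit LCMPXCHG, then recover the old
         // value from that register and the success flag from EFLAGS.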
   19295 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
   19296                              SelectionDAG &DAG) {
   19297   MVT T = Op.getSimpleValueType();
   19298   SDLoc DL(Op);
   19299   unsigned Reg = 0;
   19300   unsigned size = 0;
   19301   switch(T.SimpleTy) {
   19302   default: llvm_unreachable("Invalid value type!");
   19303   case MVT::i8:  Reg = X86::AL;  size = 1; break;
   19304   case MVT::i16: Reg = X86::AX;  size = 2; break;
   19305   case MVT::i32: Reg = X86::EAX; size = 4; break;
   19306   case MVT::i64:
   19307     assert(Subtarget->is64Bit() && "Node not type legal!");
   19308     Reg = X86::RAX; size = 8;
   19309     break;
   19310   }
   19311   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
   19312                                   Op.getOperand(2), SDValue());
   19313   SDValue Ops[] = { cpIn.getValue(0),
   19314                     Op.getOperand(1),
   19315                     Op.getOperand(3),
   19316                     DAG.getTargetConstant(size, DL, MVT::i8),
   19317                     cpIn.getValue(1) };
   19318   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   19319   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
   19320   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
   19321                                            Ops, T, MMO);
   19322 
   19323   SDValue cpOut =
   19324     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
   19325   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
   19326                                       MVT::i32, cpOut.getValue(2));
   19327   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
   19328                                 DAG.getConstant(X86::COND_E, DL, MVT::i8),
   19329                                 EFLAGS);
   19330 
   19331   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
   19332   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
   19333   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
   19334   return SDValue();
   19335 }
   19336 
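         // Custom bitcast lowering: widen 64-bit integer vectors (v2i32/v4i16/v8i8) so
         // they can be bitcast to f64 via v2f64; i64 <-> MMX and MMX <-> MMX casts are
         // already legal and are returned unchanged.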
   19337 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
   19338                             SelectionDAG &DAG) {
   19339   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
   19340   MVT DstVT = Op.getSimpleValueType();
   19341 
   19342   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
   19343     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   19344     if (DstVT != MVT::f64)
   19345       // This conversion needs to be expanded.
   19346       return SDValue();
   19347 
   19348     SDValue InVec = Op->getOperand(0);
   19349     SDLoc dl(Op);
   19350     unsigned NumElts = SrcVT.getVectorNumElements();
   19351     MVT SVT = SrcVT.getVectorElementType();
   19352 
   19353     // Widen the input vector in the case of MVT::v2i32.
   19354     // Example: from MVT::v2i32 to MVT::v4i32.
   19355     SmallVector<SDValue, 16> Elts;
   19356     for (unsigned i = 0, e = NumElts; i != e; ++i)
   19357       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
   19358                                  DAG.getIntPtrConstant(i, dl)));
   19359 
   19360     // Explicitly mark the extra elements as Undef.
   19361     Elts.append(NumElts, DAG.getUNDEF(SVT));
   19362 
   19363     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   19364     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
   19365     SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
   19366     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
   19367                        DAG.getIntPtrConstant(0, dl));
   19368   }
   19369 
   19370   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
   19371          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   19372   assert((DstVT == MVT::i64 ||
   19373           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
   19374          "Unexpected custom BITCAST");
   19375   // i64 <=> MMX conversions are Legal.
   19376   if (SrcVT==MVT::i64 && DstVT.isVector())
   19377     return Op;
   19378   if (DstVT==MVT::i64 && SrcVT.isVector())
   19379     return Op;
   19380   // MMX <=> MMX conversions are Legal.
   19381   if (SrcVT.isVector() && DstVT.isVector())
   19382     return Op;
   19383   // All other conversions need to be expanded.
   19384   return SDValue();
   19385 }
   19386 
   19387 /// Compute the horizontal sum of bytes in V for the elements of VT.
   19388 ///
   19389 /// Requires V to be a byte vector and VT to be an integer vector type with
   19390 /// wider elements than V's type. The width of the elements of VT determines
   19391 /// how many bytes of V are summed horizontally to produce each element of the
   19392 /// result.
   19393 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
   19394                                       const X86Subtarget *Subtarget,
   19395                                       SelectionDAG &DAG) {
   19396   SDLoc DL(V);
   19397   MVT ByteVecVT = V.getSimpleValueType();
   19398   MVT EltVT = VT.getVectorElementType();
   19399   int NumElts = VT.getVectorNumElements();
   19400   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
   19401          "Expected value to have byte element type.");
   19402   assert(EltVT != MVT::i8 &&
   19403          "Horizontal byte sum only makes sense for wider elements!");
   19404   unsigned VecSize = VT.getSizeInBits();
   19405   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
   19406 
    19407   // The PSADBW instruction horizontally adds all bytes and leaves the result
    19408   // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
   19409   if (EltVT == MVT::i64) {
   19410     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
   19411     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
   19412     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
   19413     return DAG.getBitcast(VT, V);
   19414   }
   19415 
   19416   if (EltVT == MVT::i32) {
    19417     // We unpack the low half and high half into i32s interleaved with zeros so
    19418     // that we can use PSADBW to horizontally sum them. The most useful part of
    19419     // this is that it lines up the results of the two PSADBW instructions as
    19420     // two v2i64 vectors whose concatenation holds the four population counts.
    19421     // We can then use PACKUSWB to shrink and concatenate them into a v4i32 again.
   19422     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
   19423     SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
   19424     SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
   19425 
   19426     // Do the horizontal sums into two v2i64s.
   19427     Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
   19428     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
   19429     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
   19430                       DAG.getBitcast(ByteVecVT, Low), Zeros);
   19431     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
   19432                        DAG.getBitcast(ByteVecVT, High), Zeros);
   19433 
   19434     // Merge them together.
   19435     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
   19436     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
   19437                     DAG.getBitcast(ShortVecVT, Low),
   19438                     DAG.getBitcast(ShortVecVT, High));
   19439 
   19440     return DAG.getBitcast(VT, V);
   19441   }
   19442 
   19443   // The only element type left is i16.
   19444   assert(EltVT == MVT::i16 && "Unknown how to handle type");
   19445 
    19446   // To obtain the pop count for each i16 element starting from the pop counts
    19447   // of the i8 elements, shift the i16s left by 8, sum as i8s, and then shift
    19448   // the i16s right by 8. It is important to shift as i16s because an i8 vector
    19449   // shift isn't directly supported.
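            // For example, if an i16 lane reads 0x0203 (byte counts 3 in the low byte
            // and 2 in the high byte), the i16 shift left by 8 gives 0x0300, the
            // byte-wise add gives 0x0503, and the i16 shift right by 8 yields 0x0005,
            // i.e. the combined count 5. The byte adds cannot overflow since each
            // per-byte count is at most 8.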
   19450   SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
   19451   SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
   19452   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
   19453   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
   19454                   DAG.getBitcast(ByteVecVT, V));
   19455   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
   19456 }
   19457 
   19458 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
   19459                                         const X86Subtarget *Subtarget,
   19460                                         SelectionDAG &DAG) {
   19461   MVT VT = Op.getSimpleValueType();
   19462   MVT EltVT = VT.getVectorElementType();
   19463   unsigned VecSize = VT.getSizeInBits();
   19464 
   19465   // Implement a lookup table in register by using an algorithm based on:
   19466   // http://wm.ite.pl/articles/sse-popcount.html
   19467   //
    19468   // The general idea is that each nibble of every byte in the input vector is
    19469   // an index into an in-register, pre-computed pop count table. We then split
    19470   // the input vector into two new ones: (1) a vector with only the
    19471   // shifted-right higher nibbles of each byte and (2) a vector with the lower
    19472   // nibbles (and the higher ones masked out) of each byte. PSHUFB is used
    19473   // separately with both to index the in-register table. Next, both are added
    19474   // and the result is an i8 vector where each element contains the pop count
   19475   //
   19476   // To obtain the pop count for elements != i8, we follow up with the same
   19477   // approach and use additional tricks as described below.
   19478   //
   19479   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
   19480                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
   19481                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
   19482                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
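            // For example, the byte 0xd3 (0b11010011) has high nibble 0xd and low
            // nibble 0x3; LUT[0xd] + LUT[0x3] = 3 + 2 = 5, its pop count.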
   19483 
   19484   int NumByteElts = VecSize / 8;
   19485   MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
   19486   SDValue In = DAG.getBitcast(ByteVecVT, Op);
   19487   SmallVector<SDValue, 16> LUTVec;
   19488   for (int i = 0; i < NumByteElts; ++i)
   19489     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
   19490   SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
   19491   SmallVector<SDValue, 16> Mask0F(NumByteElts,
   19492                                   DAG.getConstant(0x0F, DL, MVT::i8));
   19493   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
   19494 
   19495   // High nibbles
   19496   SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
   19497   SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
   19498   SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
   19499 
   19500   // Low nibbles
   19501   SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
   19502 
    19503   // The input vector is used as the shuffle mask that indexes elements into the
    19504   // LUT. After counting the low and high nibbles, add the two results to obtain
    19505   // the final pop count per i8 element.
   19506   SDValue HighPopCnt =
   19507       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
   19508   SDValue LowPopCnt =
   19509       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
   19510   SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
   19511 
   19512   if (EltVT == MVT::i8)
   19513     return PopCnt;
   19514 
   19515   return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
   19516 }
   19517 
   19518 static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
   19519                                        const X86Subtarget *Subtarget,
   19520                                        SelectionDAG &DAG) {
   19521   MVT VT = Op.getSimpleValueType();
   19522   assert(VT.is128BitVector() &&
   19523          "Only 128-bit vector bitmath lowering supported.");
   19524 
   19525   int VecSize = VT.getSizeInBits();
   19526   MVT EltVT = VT.getVectorElementType();
   19527   int Len = EltVT.getSizeInBits();
   19528 
   19529   // This is the vectorized version of the "best" algorithm from
   19530   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   19531   // with a minor tweak to use a series of adds + shifts instead of vector
   19532   // multiplications. Implemented for all integer vector types. We only use
   19533   // this when we don't have SSSE3 which allows a LUT-based lowering that is
   19534   // much faster, even faster than using native popcnt instructions.
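            // As an illustration of the three steps below on a single byte, take
            // v = 0xff: 0xff - ((0xff >> 1) & 0x55) = 0xaa (each 2-bit field holds 2),
            // (0xaa & 0x33) + ((0xaa >> 2) & 0x33) = 0x44 (each nibble holds 4), and
            // (0x44 + (0x44 >> 4)) & 0x0f = 8, the pop count of 0xff.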
   19535 
   19536   auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
   19537     MVT VT = V.getSimpleValueType();
   19538     SmallVector<SDValue, 32> Shifters(
   19539         VT.getVectorNumElements(),
   19540         DAG.getConstant(Shifter, DL, VT.getVectorElementType()));
   19541     return DAG.getNode(OpCode, DL, VT, V,
   19542                        DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters));
   19543   };
   19544   auto GetMask = [&](SDValue V, APInt Mask) {
   19545     MVT VT = V.getSimpleValueType();
   19546     SmallVector<SDValue, 32> Masks(
   19547         VT.getVectorNumElements(),
   19548         DAG.getConstant(Mask, DL, VT.getVectorElementType()));
   19549     return DAG.getNode(ISD::AND, DL, VT, V,
   19550                        DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks));
   19551   };
   19552 
   19553   // We don't want to incur the implicit masks required to SRL vNi8 vectors on
   19554   // x86, so set the SRL type to have elements at least i16 wide. This is
    19555   // correct because all of our SRLs are followed immediately by a mask anyway
   19556   // that handles any bits that sneak into the high bits of the byte elements.
   19557   MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
   19558 
   19559   SDValue V = Op;
   19560 
   19561   // v = v - ((v >> 1) & 0x55555555...)
   19562   SDValue Srl =
   19563       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
   19564   SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
   19565   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
   19566 
   19567   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
   19568   SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
   19569   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
   19570   SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
   19571   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
   19572 
   19573   // v = (v + (v >> 4)) & 0x0F0F0F0F...
   19574   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
   19575   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
   19576   V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
   19577 
   19578   // At this point, V contains the byte-wise population count, and we are
   19579   // merely doing a horizontal sum if necessary to get the wider element
   19580   // counts.
   19581   if (EltVT == MVT::i8)
   19582     return V;
   19583 
   19584   return LowerHorizontalByteSum(
   19585       DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
   19586       DAG);
   19587 }
   19588 
   19589 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
   19590                                 SelectionDAG &DAG) {
   19591   MVT VT = Op.getSimpleValueType();
   19592   // FIXME: Need to add AVX-512 support here!
   19593   assert((VT.is256BitVector() || VT.is128BitVector()) &&
   19594          "Unknown CTPOP type to handle");
   19595   SDLoc DL(Op.getNode());
   19596   SDValue Op0 = Op.getOperand(0);
   19597 
   19598   if (!Subtarget->hasSSSE3()) {
   19599     // We can't use the fast LUT approach, so fall back on vectorized bitmath.
   19600     assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
   19601     return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
   19602   }
   19603 
   19604   if (VT.is256BitVector() && !Subtarget->hasInt256()) {
   19605     unsigned NumElems = VT.getVectorNumElements();
   19606 
    19607     // Extract each 128-bit subvector, compute its pop count and concatenate the results.
   19608     SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL);
   19609     SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL);
   19610 
   19611     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
   19612                        LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
   19613                        LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
   19614   }
   19615 
   19616   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
   19617 }
   19618 
   19619 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
   19620                           SelectionDAG &DAG) {
   19621   assert(Op.getSimpleValueType().isVector() &&
   19622          "We only do custom lowering for vector population count.");
   19623   return LowerVectorCTPOP(Op, Subtarget, DAG);
   19624 }
   19625 
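          /// Lower ATOMIC_LOAD_SUB to an ATOMIC_LOAD_ADD of the negated operand
          /// (0 - RHS) so that the existing atomic-add lowering can be reused.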
   19626 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
   19627   SDNode *Node = Op.getNode();
   19628   SDLoc dl(Node);
   19629   EVT T = Node->getValueType(0);
   19630   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
   19631                               DAG.getConstant(0, dl, T), Node->getOperand(2));
   19632   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
   19633                        cast<AtomicSDNode>(Node)->getMemoryVT(),
   19634                        Node->getOperand(0),
   19635                        Node->getOperand(1), negOp,
   19636                        cast<AtomicSDNode>(Node)->getMemOperand(),
   19637                        cast<AtomicSDNode>(Node)->getOrdering(),
   19638                        cast<AtomicSDNode>(Node)->getSynchScope());
   19639 }
   19640 
   19641 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
   19642   SDNode *Node = Op.getNode();
   19643   SDLoc dl(Node);
   19644   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
   19645 
   19646   // Convert seq_cst store -> xchg
   19647   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
   19648   // FIXME: On 32-bit, store -> fist or movq would be more efficient
   19649   //        (The only way to get a 16-byte store is cmpxchg16b)
   19650   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
   19651   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
   19652       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
   19653     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
   19654                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
   19655                                  Node->getOperand(0),
   19656                                  Node->getOperand(1), Node->getOperand(2),
   19657                                  cast<AtomicSDNode>(Node)->getMemOperand(),
   19658                                  cast<AtomicSDNode>(Node)->getOrdering(),
   19659                                  cast<AtomicSDNode>(Node)->getSynchScope());
   19660     return Swap.getValue(1);
   19661   }
   19662   // Other atomic stores have a simple pattern.
   19663   return Op;
   19664 }
   19665 
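          /// Lower carry-producing/consuming adds and subtracts (ADDC/ADDE/SUBC/SUBE)
          /// to the flag-producing X86 nodes ADD/ADC/SUB/SBB; the ADDE/SUBE forms take
          /// the incoming carry as an extra operand.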
   19666 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   19667   MVT VT = Op.getNode()->getSimpleValueType(0);
   19668 
   19669   // Let legalize expand this if it isn't a legal type yet.
   19670   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   19671     return SDValue();
   19672 
   19673   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   19674 
   19675   unsigned Opc;
   19676   bool ExtraOp = false;
   19677   switch (Op.getOpcode()) {
   19678   default: llvm_unreachable("Invalid code");
   19679   case ISD::ADDC: Opc = X86ISD::ADD; break;
   19680   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
   19681   case ISD::SUBC: Opc = X86ISD::SUB; break;
   19682   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
   19683   }
   19684 
   19685   if (!ExtraOp)
   19686     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
   19687                        Op.getOperand(1));
   19688   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
   19689                      Op.getOperand(1), Op.getOperand(2));
   19690 }
   19691 
   19692 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
   19693                             SelectionDAG &DAG) {
   19694   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
   19695 
   19696   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
   19697   // which returns the values as { float, float } (in XMM0) or
   19698   // { double, double } (which is returned in XMM0, XMM1).
   19699   SDLoc dl(Op);
   19700   SDValue Arg = Op.getOperand(0);
   19701   EVT ArgVT = Arg.getValueType();
   19702   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   19703 
   19704   TargetLowering::ArgListTy Args;
   19705   TargetLowering::ArgListEntry Entry;
   19706 
   19707   Entry.Node = Arg;
   19708   Entry.Ty = ArgTy;
   19709   Entry.isSExt = false;
   19710   Entry.isZExt = false;
   19711   Args.push_back(Entry);
   19712 
   19713   bool isF64 = ArgVT == MVT::f64;
   19714   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
   19715   // the small struct {f32, f32} is returned in (eax, edx). For f64,
   19716   // the results are returned via SRet in memory.
   19717   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
   19718   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   19719   SDValue Callee =
   19720       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
   19721 
   19722   Type *RetTy = isF64
   19723     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
   19724     : (Type*)VectorType::get(ArgTy, 4);
   19725 
   19726   TargetLowering::CallLoweringInfo CLI(DAG);
   19727   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
   19728     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
   19729 
   19730   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
   19731 
   19732   if (isF64)
   19733     // Returned in xmm0 and xmm1.
   19734     return CallResult.first;
   19735 
    19736   // Returned in bits 0:31 and 32:63 of xmm0.
   19737   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   19738                                CallResult.first, DAG.getIntPtrConstant(0, dl));
   19739   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   19740                                CallResult.first, DAG.getIntPtrConstant(1, dl));
   19741   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
   19742   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
   19743 }
   19744 
   19745 /// Widen a vector input to a vector of NVT.  The
   19746 /// input vector must have the same element type as NVT.
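          /// For example, widening a constant v2i32 <1, 2> to v4i32 produces
          /// <1, 2, 0, 0> when FillWithZeroes is set and <1, 2, undef, undef> otherwise.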
   19747 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
   19748                             bool FillWithZeroes = false) {
   19749   // Check if InOp already has the right width.
   19750   MVT InVT = InOp.getSimpleValueType();
   19751   if (InVT == NVT)
   19752     return InOp;
   19753 
   19754   if (InOp.isUndef())
   19755     return DAG.getUNDEF(NVT);
   19756 
   19757   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
   19758          "input and widen element type must match");
   19759 
   19760   unsigned InNumElts = InVT.getVectorNumElements();
   19761   unsigned WidenNumElts = NVT.getVectorNumElements();
   19762   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
   19763          "Unexpected request for vector widening");
   19764 
   19765   EVT EltVT = NVT.getVectorElementType();
   19766 
   19767   SDLoc dl(InOp);
   19768   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
   19769       InOp.getNumOperands() == 2) {
   19770     SDValue N1 = InOp.getOperand(1);
   19771     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
   19772         N1.isUndef()) {
   19773       InOp = InOp.getOperand(0);
   19774       InVT = InOp.getSimpleValueType();
   19775       InNumElts = InVT.getVectorNumElements();
   19776     }
   19777   }
   19778   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
   19779       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
   19780     SmallVector<SDValue, 16> Ops;
   19781     for (unsigned i = 0; i < InNumElts; ++i)
   19782       Ops.push_back(InOp.getOperand(i));
   19783 
   19784     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
   19785       DAG.getUNDEF(EltVT);
   19786     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
   19787       Ops.push_back(FillVal);
   19788     return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
   19789   }
   19790   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
   19791     DAG.getUNDEF(NVT);
   19792   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
   19793                      InOp, DAG.getIntPtrConstant(0, dl));
   19794 }
   19795 
   19796 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
   19797                              SelectionDAG &DAG) {
   19798   assert(Subtarget->hasAVX512() &&
   19799          "MGATHER/MSCATTER are supported on AVX-512 arch only");
   19800 
    19801   // X86 scatter kills the mask register, so its type should be added to
   19802   // the list of return values.
   19803   // If the "scatter" has 2 return values, it is already handled.
   19804   if (Op.getNode()->getNumValues() == 2)
   19805     return Op;
   19806 
   19807   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
   19808   SDValue Src = N->getValue();
   19809   MVT VT = Src.getSimpleValueType();
   19810   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
   19811   SDLoc dl(Op);
   19812 
   19813   SDValue NewScatter;
   19814   SDValue Index = N->getIndex();
   19815   SDValue Mask = N->getMask();
   19816   SDValue Chain = N->getChain();
   19817   SDValue BasePtr = N->getBasePtr();
   19818   MVT MemVT = N->getMemoryVT().getSimpleVT();
   19819   MVT IndexVT = Index.getSimpleValueType();
   19820   MVT MaskVT = Mask.getSimpleValueType();
   19821 
   19822   if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
   19823     // The v2i32 value was promoted to v2i64.
   19824     // Now we "redo" the type legalizer's work and widen the original
   19825     // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
   19826     // with a shuffle.
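              // Viewed as v4i32, the promoted v2i64 source is <lo0, hi0, lo1, hi1>, so
              // the shuffle mask {0, 2, -1, -1} below recovers the original two i32
              // values in the first two lanes.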
   19827     assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
   19828            "Unexpected memory type");
   19829     int ShuffleMask[] = {0, 2, -1, -1};
   19830     Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
   19831                                DAG.getUNDEF(MVT::v4i32), ShuffleMask);
   19832     // Now we have 4 elements instead of 2.
   19833     // Expand the index.
   19834     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
   19835     Index = ExtendToType(Index, NewIndexVT, DAG);
   19836 
    19837     // Expand the mask with zeroes.
    19838     // The mask may be <2 x i64> or <2 x i1> at this point.
   19839     assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
   19840            "Unexpected mask type");
   19841     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
   19842     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
   19843     VT = MVT::v4i32;
   19844   }
   19845 
   19846   unsigned NumElts = VT.getVectorNumElements();
   19847   if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
   19848       !Index.getSimpleValueType().is512BitVector()) {
    19849     // AVX512F supports only 512-bit vectors; either the data or the index
    19850     // must be 512 bits wide. If both the index and the data are 256-bit
    19851     // but the vector contains 8 elements, we just sign-extend the index.
   19852     if (IndexVT == MVT::v8i32)
   19853       // Just extend index
   19854       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
   19855     else {
    19856       // The minimum number of elements in a scatter is 8.
   19857       NumElts = 8;
   19858       // Index
   19859       MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
   19860       // Use original index here, do not modify the index twice
   19861       Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
   19862       if (IndexVT.getScalarType() == MVT::i32)
   19863         Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
   19864 
   19865       // Mask
    19866       // At this point we have a promoted mask operand.
   19867       assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
   19868       MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
   19869       // Use the original mask here, do not modify the mask twice
   19870       Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
   19871 
   19872       // The value that should be stored
   19873       MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
   19874       Src = ExtendToType(Src, NewVT, DAG);
   19875     }
   19876   }
    19877   // If the mask is "wide" at this point, truncate it to an i1 vector.
   19878   MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
   19879   Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
   19880 
    19881   // The mask is killed by the scatter, so add it to the returned values.
   19882   SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
   19883   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
   19884   NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
   19885                                     N->getMemOperand());
   19886   DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
   19887   return SDValue(NewScatter.getNode(), 0);
   19888 }
   19889 
   19890 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
   19891                           SelectionDAG &DAG) {
   19892 
   19893   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
   19894   MVT VT = Op.getSimpleValueType();
   19895   SDValue Mask = N->getMask();
   19896   SDLoc dl(Op);
   19897 
   19898   if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
   19899       !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
   19900     // This operation is legal for targets with VLX, but without
    19901     // VLX the vector should be widened to 512 bits.
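              // e.g. a masked load of v8i32 with a v8i1 mask becomes a v16i32 load with
              // a v16i1 mask whose upper 8 lanes are zero, and the low v8i32 subvector
              // of the result is extracted afterwards.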
   19902     unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
   19903     MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
   19904     MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
   19905     SDValue Src0 = N->getSrc0();
   19906     Src0 = ExtendToType(Src0, WideDataVT, DAG);
   19907     Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
   19908     SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
   19909                                         N->getBasePtr(), Mask, Src0,
   19910                                         N->getMemoryVT(), N->getMemOperand(),
   19911                                         N->getExtensionType());
   19912 
    19913     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
   19914                                  NewLoad.getValue(0),
   19915                                  DAG.getIntPtrConstant(0, dl));
    19916     SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
   19917     return DAG.getMergeValues(RetOps, dl);
   19918   }
   19919   return Op;
   19920 }
   19921 
   19922 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget,
   19923                            SelectionDAG &DAG) {
   19924   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
   19925   SDValue DataToStore = N->getValue();
   19926   MVT VT = DataToStore.getSimpleValueType();
   19927   SDValue Mask = N->getMask();
   19928   SDLoc dl(Op);
   19929 
   19930   if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
   19931       !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
   19932     // This operation is legal for targets with VLX, but without
    19933     // VLX the vector should be widened to 512 bits.
   19934     unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
   19935     MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
   19936     MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
   19937     DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
   19938     Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
   19939     return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
   19940                               Mask, N->getMemoryVT(), N->getMemOperand(),
   19941                               N->isTruncatingStore());
   19942   }
   19943   return Op;
   19944 }
   19945 
   19946 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
   19947                             SelectionDAG &DAG) {
   19948   assert(Subtarget->hasAVX512() &&
   19949          "MGATHER/MSCATTER are supported on AVX-512 arch only");
   19950 
   19951   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
   19952   SDLoc dl(Op);
   19953   MVT VT = Op.getSimpleValueType();
   19954   SDValue Index = N->getIndex();
   19955   SDValue Mask = N->getMask();
   19956   SDValue Src0 = N->getValue();
   19957   MVT IndexVT = Index.getSimpleValueType();
   19958   MVT MaskVT = Mask.getSimpleValueType();
   19959 
   19960   unsigned NumElts = VT.getVectorNumElements();
   19961   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
   19962 
   19963   if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
   19964       !Index.getSimpleValueType().is512BitVector()) {
    19965     // AVX512F supports only 512-bit vectors; either the data or the index
    19966     // must be 512 bits wide. If both the index and the data are 256-bit
    19967     // but the vector contains 8 elements, we just sign-extend the index.
   19968     if (NumElts == 8) {
   19969       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
   19970       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
   19971                         N->getOperand(3), Index };
   19972       DAG.UpdateNodeOperands(N, Ops);
   19973       return Op;
   19974     }
   19975 
    19976     // The minimum number of elements in a gather is 8.
   19977     NumElts = 8;
   19978     // Index
   19979     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
   19980     Index = ExtendToType(Index, NewIndexVT, DAG);
   19981     if (IndexVT.getScalarType() == MVT::i32)
   19982       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
   19983 
   19984     // Mask
   19985     MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
    19986     // At this point we have a promoted mask operand.
   19987     assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
   19988     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
   19989     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
   19990     Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
   19991 
   19992     // The pass-thru value
   19993     MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
   19994     Src0 = ExtendToType(Src0, NewVT, DAG);
   19995 
   19996     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
   19997     SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
   19998                                             N->getMemoryVT(), dl, Ops,
   19999                                             N->getMemOperand());
    20000     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
   20001                                  NewGather.getValue(0),
   20002                                  DAG.getIntPtrConstant(0, dl));
    20003     SDValue RetOps[] = {Extract, NewGather.getValue(1)};
   20004     return DAG.getMergeValues(RetOps, dl);
   20005   }
   20006   return Op;
   20007 }
   20008 
   20009 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
   20010                                                     SelectionDAG &DAG) const {
   20011   // TODO: Eventually, the lowering of these nodes should be informed by or
   20012   // deferred to the GC strategy for the function in which they appear. For
   20013   // now, however, they must be lowered to something. Since they are logically
   20014   // no-ops in the case of a null GC strategy (or a GC strategy which does not
   20015   // require special handling for these nodes), lower them as literal NOOPs for
   20016   // the time being.
   20017   SmallVector<SDValue, 2> Ops;
   20018 
   20019   Ops.push_back(Op.getOperand(0));
   20020   if (Op->getGluedNode())
   20021     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
   20022 
   20023   SDLoc OpDL(Op);
   20024   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
   20025   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
   20026 
   20027   return NOOP;
   20028 }
   20029 
   20030 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
   20031                                                   SelectionDAG &DAG) const {
   20032   // TODO: Eventually, the lowering of these nodes should be informed by or
   20033   // deferred to the GC strategy for the function in which they appear. For
   20034   // now, however, they must be lowered to something. Since they are logically
   20035   // no-ops in the case of a null GC strategy (or a GC strategy which does not
   20036   // require special handling for these nodes), lower them as literal NOOPs for
   20037   // the time being.
   20038   SmallVector<SDValue, 2> Ops;
   20039 
   20040   Ops.push_back(Op.getOperand(0));
   20041   if (Op->getGluedNode())
   20042     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
   20043 
   20044   SDLoc OpDL(Op);
   20045   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
   20046   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
   20047 
   20048   return NOOP;
   20049 }
   20050 
   20051 /// LowerOperation - Provide custom lowering hooks for some operations.
   20052 ///
   20053 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   20054   switch (Op.getOpcode()) {
   20055   default: llvm_unreachable("Should not custom lower this!");
   20056   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
   20057   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
   20058     return LowerCMP_SWAP(Op, Subtarget, DAG);
   20059   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
   20060   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
   20061   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
   20062   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   20063   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
   20064   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
   20065   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   20066   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   20067   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   20068   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
   20069   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
   20070   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
   20071   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
   20072   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
   20073   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   20074   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
   20075   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   20076   case ISD::SHL_PARTS:
   20077   case ISD::SRA_PARTS:
   20078   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   20079   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   20080   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   20081   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
   20082   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
   20083   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
   20084   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
   20085   case ISD::SIGN_EXTEND_VECTOR_INREG:
   20086     return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
   20087   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   20088   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
   20089   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
   20090   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
   20091   case ISD::FABS:
   20092   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
   20093   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   20094   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   20095   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   20096   case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
   20097   case ISD::SELECT:             return LowerSELECT(Op, DAG);
   20098   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
   20099   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
   20100   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   20101   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   20102   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   20103   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
   20104   case ISD::INTRINSIC_VOID:
   20105   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   20106   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   20107   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   20108   case ISD::FRAME_TO_ARGS_OFFSET:
   20109                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
   20110   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   20111   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
   20112   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
   20113   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
   20114   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   20115   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   20116   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
   20117   case ISD::CTLZ:               return LowerCTLZ(Op, Subtarget, DAG);
   20118   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG);
   20119   case ISD::CTTZ:
   20120   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
   20121   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   20122   case ISD::UMUL_LOHI:
   20123   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
   20124   case ISD::ROTL:               return LowerRotate(Op, Subtarget, DAG);
   20125   case ISD::SRA:
   20126   case ISD::SRL:
   20127   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
   20128   case ISD::SADDO:
   20129   case ISD::UADDO:
   20130   case ISD::SSUBO:
   20131   case ISD::USUBO:
   20132   case ISD::SMULO:
   20133   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   20134   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
   20135   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
   20136   case ISD::ADDC:
   20137   case ISD::ADDE:
   20138   case ISD::SUBC:
   20139   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   20140   case ISD::ADD:                return LowerADD(Op, DAG);
   20141   case ISD::SUB:                return LowerSUB(Op, DAG);
   20142   case ISD::SMAX:
   20143   case ISD::SMIN:
   20144   case ISD::UMAX:
   20145   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
   20146   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   20147   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
   20148   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
   20149   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
   20150   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
   20151   case ISD::GC_TRANSITION_START:
   20152                                 return LowerGC_TRANSITION_START(Op, DAG);
   20153   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
   20154   }
   20155 }
   20156 
   20157 /// ReplaceNodeResults - Replace a node with an illegal result type
   20158 /// with a new node built out of custom code.
   20159 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   20160                                            SmallVectorImpl<SDValue>&Results,
   20161                                            SelectionDAG &DAG) const {
   20162   SDLoc dl(N);
   20163   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   20164   switch (N->getOpcode()) {
   20165   default:
   20166     llvm_unreachable("Do not know how to custom type legalize this operation!");
   20167   case X86ISD::AVG: {
   20168     // Legalize types for X86ISD::AVG by expanding vectors.
   20169     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   20170 
   20171     auto InVT = N->getValueType(0);
   20172     auto InVTSize = InVT.getSizeInBits();
   20173     const unsigned RegSize =
   20174         (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
    20175     assert((Subtarget->hasAVX512() || RegSize < 512) &&
    20176            "512-bit vector requires AVX512");
    20177     assert((Subtarget->hasAVX2() || RegSize < 256) &&
    20178            "256-bit vector requires AVX2");
   20179 
   20180     auto ElemVT = InVT.getVectorElementType();
   20181     auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
   20182                                   RegSize / ElemVT.getSizeInBits());
   20183     assert(RegSize % InVT.getSizeInBits() == 0);
   20184     unsigned NumConcat = RegSize / InVT.getSizeInBits();
   20185 
   20186     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
   20187     Ops[0] = N->getOperand(0);
   20188     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
   20189     Ops[0] = N->getOperand(1);
   20190     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
   20191 
   20192     SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
   20193     Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
   20194                                   DAG.getIntPtrConstant(0, dl)));
   20195     return;
   20196   }
   20197   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   20198   case X86ISD::FMINC:
   20199   case X86ISD::FMIN:
   20200   case X86ISD::FMAXC:
   20201   case X86ISD::FMAX: {
   20202     EVT VT = N->getValueType(0);
   20203     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
   20204     SDValue UNDEF = DAG.getUNDEF(VT);
   20205     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   20206                               N->getOperand(0), UNDEF);
   20207     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   20208                               N->getOperand(1), UNDEF);
   20209     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
   20210     return;
   20211   }
   20212   case ISD::SIGN_EXTEND_INREG:
   20213   case ISD::ADDC:
   20214   case ISD::ADDE:
   20215   case ISD::SUBC:
   20216   case ISD::SUBE:
   20217     // We don't want to expand or promote these.
   20218     return;
   20219   case ISD::SDIV:
   20220   case ISD::UDIV:
   20221   case ISD::SREM:
   20222   case ISD::UREM:
   20223   case ISD::SDIVREM:
   20224   case ISD::UDIVREM: {
   20225     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
   20226     Results.push_back(V);
   20227     return;
   20228   }
   20229   case ISD::FP_TO_SINT:
   20230   case ISD::FP_TO_UINT: {
   20231     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
   20232 
   20233     std::pair<SDValue,SDValue> Vals =
   20234         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
   20235     SDValue FIST = Vals.first, StackSlot = Vals.second;
   20236     if (FIST.getNode()) {
   20237       EVT VT = N->getValueType(0);
   20238       // Return a load from the stack slot.
   20239       if (StackSlot.getNode())
   20240         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
   20241                                       MachinePointerInfo(),
   20242                                       false, false, false, 0));
   20243       else
   20244         Results.push_back(FIST);
   20245     }
   20246     return;
   20247   }
   20248   case ISD::UINT_TO_FP: {
   20249     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   20250     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
   20251         N->getValueType(0) != MVT::v2f32)
   20252       return;
   20253     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
   20254                                  N->getOperand(0));
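              // 0x4330000000000000 is the IEEE-754 encoding of 2^52. OR-ing a
              // zero-extended 32-bit value into the mantissa of 2^52 produces the
              // double 2^52 + n exactly, so subtracting the bias below leaves n
              // converted to double.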
   20255     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
   20256                                      MVT::f64);
   20257     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
   20258     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
   20259                              DAG.getBitcast(MVT::v2i64, VBias));
   20260     Or = DAG.getBitcast(MVT::v2f64, Or);
   20261     // TODO: Are there any fast-math-flags to propagate here?
   20262     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
   20263     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
   20264     return;
   20265   }
   20266   case ISD::FP_ROUND: {
   20267     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
   20268         return;
   20269     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
   20270     Results.push_back(V);
   20271     return;
   20272   }
   20273   case ISD::FP_EXTEND: {
   20274     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
   20275     // No other ValueType for FP_EXTEND should reach this point.
   20276     assert(N->getValueType(0) == MVT::v2f32 &&
   20277            "Do not know how to legalize this Node");
   20278     return;
   20279   }
   20280   case ISD::INTRINSIC_W_CHAIN: {
   20281     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   20282     switch (IntNo) {
   20283     default : llvm_unreachable("Do not know how to custom type "
   20284                                "legalize this intrinsic operation!");
   20285     case Intrinsic::x86_rdtsc:
   20286       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   20287                                      Results);
   20288     case Intrinsic::x86_rdtscp:
   20289       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
   20290                                      Results);
   20291     case Intrinsic::x86_rdpmc:
   20292       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
   20293     }
   20294   }
   20295   case ISD::INTRINSIC_WO_CHAIN: {
   20296     if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
   20297       Results.push_back(V);
   20298     return;
   20299   }
   20300   case ISD::READCYCLECOUNTER: {
   20301     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   20302                                    Results);
   20303   }
   20304   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
   20305     EVT T = N->getValueType(0);
   20306     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
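              // CMPXCHG8B/CMPXCHG16B expect the current (expected) value in EDX:EAX or
              // RDX:RAX, the replacement in ECX:EBX or RCX:RBX, and report success in
              // ZF, which is why the halves are wired into those registers below.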
   20307     bool Regs64bit = T == MVT::i128;
   20308     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
   20309     SDValue cpInL, cpInH;
   20310     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   20311                         DAG.getConstant(0, dl, HalfT));
   20312     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   20313                         DAG.getConstant(1, dl, HalfT));
   20314     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
   20315                              Regs64bit ? X86::RAX : X86::EAX,
   20316                              cpInL, SDValue());
   20317     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
   20318                              Regs64bit ? X86::RDX : X86::EDX,
   20319                              cpInH, cpInL.getValue(1));
   20320     SDValue swapInL, swapInH;
   20321     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   20322                           DAG.getConstant(0, dl, HalfT));
   20323     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   20324                           DAG.getConstant(1, dl, HalfT));
   20325     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
   20326                                Regs64bit ? X86::RBX : X86::EBX,
   20327                                swapInL, cpInH.getValue(1));
   20328     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
   20329                                Regs64bit ? X86::RCX : X86::ECX,
   20330                                swapInH, swapInL.getValue(1));
   20331     SDValue Ops[] = { swapInH.getValue(0),
   20332                       N->getOperand(1),
   20333                       swapInH.getValue(1) };
   20334     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   20335     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
   20336     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
   20337                                   X86ISD::LCMPXCHG8_DAG;
   20338     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
   20339     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
   20340                                         Regs64bit ? X86::RAX : X86::EAX,
   20341                                         HalfT, Result.getValue(1));
   20342     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
   20343                                         Regs64bit ? X86::RDX : X86::EDX,
   20344                                         HalfT, cpOutL.getValue(2));
   20345     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
   20346 
   20347     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
   20348                                         MVT::i32, cpOutH.getValue(2));
   20349     SDValue Success =
   20350         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   20351                     DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
   20352     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
   20353 
   20354     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
   20355     Results.push_back(Success);
   20356     Results.push_back(EFLAGS.getValue(1));
   20357     return;
   20358   }
   20359   case ISD::ATOMIC_SWAP:
   20360   case ISD::ATOMIC_LOAD_ADD:
   20361   case ISD::ATOMIC_LOAD_SUB:
   20362   case ISD::ATOMIC_LOAD_AND:
   20363   case ISD::ATOMIC_LOAD_OR:
   20364   case ISD::ATOMIC_LOAD_XOR:
   20365   case ISD::ATOMIC_LOAD_NAND:
   20366   case ISD::ATOMIC_LOAD_MIN:
   20367   case ISD::ATOMIC_LOAD_MAX:
   20368   case ISD::ATOMIC_LOAD_UMIN:
   20369   case ISD::ATOMIC_LOAD_UMAX:
   20370   case ISD::ATOMIC_LOAD: {
   20371     // Delegate to generic TypeLegalization. Situations we can really handle
   20372     // should have already been dealt with by AtomicExpandPass.cpp.
   20373     break;
   20374   }
   20375   case ISD::BITCAST: {
   20376     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   20377     EVT DstVT = N->getValueType(0);
   20378     EVT SrcVT = N->getOperand(0)->getValueType(0);
   20379 
   20380     if (SrcVT != MVT::f64 ||
   20381         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
   20382       return;
   20383 
   20384     unsigned NumElts = DstVT.getVectorNumElements();
   20385     EVT SVT = DstVT.getVectorElementType();
   20386     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   20387     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   20388                                    MVT::v2f64, N->getOperand(0));
   20389     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
   20390 
   20391     if (ExperimentalVectorWideningLegalization) {
   20392       // If we are legalizing vectors by widening, we already have the desired
   20393       // legal vector type, just return it.
   20394       Results.push_back(ToVecInt);
   20395       return;
   20396     }
   20397 
   20398     SmallVector<SDValue, 8> Elts;
   20399     for (unsigned i = 0, e = NumElts; i != e; ++i)
   20400       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
   20401                                    ToVecInt, DAG.getIntPtrConstant(i, dl)));
   20402 
   20403     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
   20404   }
   20405   }
   20406 }
   20407 
   20408 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   20409   switch ((X86ISD::NodeType)Opcode) {
   20410   case X86ISD::FIRST_NUMBER:       break;
   20411   case X86ISD::BSF:                return "X86ISD::BSF";
   20412   case X86ISD::BSR:                return "X86ISD::BSR";
   20413   case X86ISD::SHLD:               return "X86ISD::SHLD";
   20414   case X86ISD::SHRD:               return "X86ISD::SHRD";
   20415   case X86ISD::FAND:               return "X86ISD::FAND";
   20416   case X86ISD::FANDN:              return "X86ISD::FANDN";
   20417   case X86ISD::FOR:                return "X86ISD::FOR";
   20418   case X86ISD::FXOR:               return "X86ISD::FXOR";
   20419   case X86ISD::FILD:               return "X86ISD::FILD";
   20420   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
   20421   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
   20422   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
   20423   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
   20424   case X86ISD::FLD:                return "X86ISD::FLD";
   20425   case X86ISD::FST:                return "X86ISD::FST";
   20426   case X86ISD::CALL:               return "X86ISD::CALL";
   20427   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
   20428   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
   20429   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
   20430   case X86ISD::BT:                 return "X86ISD::BT";
   20431   case X86ISD::CMP:                return "X86ISD::CMP";
   20432   case X86ISD::COMI:               return "X86ISD::COMI";
   20433   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   20434   case X86ISD::CMPM:               return "X86ISD::CMPM";
   20435   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
   20436   case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
   20437   case X86ISD::SETCC:              return "X86ISD::SETCC";
   20438   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   20439   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
   20440   case X86ISD::FGETSIGNx86:        return "X86ISD::FGETSIGNx86";
   20441   case X86ISD::CMOV:               return "X86ISD::CMOV";
   20442   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   20443   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
   20444   case X86ISD::IRET:               return "X86ISD::IRET";
   20445   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
   20446   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
   20447   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
   20448   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
   20449   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
   20450   case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
   20451   case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
   20452   case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
   20453   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
   20454   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
   20455   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
   20456   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   20457   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
   20458   case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
   20459   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   20460   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   20461   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
   20462   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
   20463   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
   20464   case X86ISD::ADDUS:              return "X86ISD::ADDUS";
   20465   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   20466   case X86ISD::HADD:               return "X86ISD::HADD";
   20467   case X86ISD::HSUB:               return "X86ISD::HSUB";
   20468   case X86ISD::FHADD:              return "X86ISD::FHADD";
   20469   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
   20470   case X86ISD::ABS:                return "X86ISD::ABS";
   20471   case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
   20472   case X86ISD::FMAX:               return "X86ISD::FMAX";
   20473   case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
   20474   case X86ISD::FMIN:               return "X86ISD::FMIN";
   20475   case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
   20476   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
   20477   case X86ISD::FMINC:              return "X86ISD::FMINC";
   20478   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
   20479   case X86ISD::FRCP:               return "X86ISD::FRCP";
   20480   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
   20481   case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
   20482   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   20483   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
   20484   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
   20485   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
   20486   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
   20487   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
   20488   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   20489   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
   20490   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
   20491   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
   20492   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
   20493   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
   20494   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   20495   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   20496   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
   20497   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
   20498   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
   20499   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
   20500   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
   20501   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   20502   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   20503   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   20504   case X86ISD::CVTDQ2PD:           return "X86ISD::CVTDQ2PD";
   20505   case X86ISD::CVTUDQ2PD:          return "X86ISD::CVTUDQ2PD";
   20506   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
   20507   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
   20508   case X86ISD::VSHL:               return "X86ISD::VSHL";
   20509   case X86ISD::VSRL:               return "X86ISD::VSRL";
   20510   case X86ISD::VSRA:               return "X86ISD::VSRA";
   20511   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
   20512   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
   20513   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
   20514   case X86ISD::CMPP:               return "X86ISD::CMPP";
   20515   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   20516   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
   20517   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
   20518   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
   20519   case X86ISD::ADD:                return "X86ISD::ADD";
   20520   case X86ISD::SUB:                return "X86ISD::SUB";
   20521   case X86ISD::ADC:                return "X86ISD::ADC";
   20522   case X86ISD::SBB:                return "X86ISD::SBB";
   20523   case X86ISD::SMUL:               return "X86ISD::SMUL";
   20524   case X86ISD::UMUL:               return "X86ISD::UMUL";
   20525   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
   20526   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
   20527   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
   20528   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
   20529   case X86ISD::INC:                return "X86ISD::INC";
   20530   case X86ISD::DEC:                return "X86ISD::DEC";
   20531   case X86ISD::OR:                 return "X86ISD::OR";
   20532   case X86ISD::XOR:                return "X86ISD::XOR";
   20533   case X86ISD::AND:                return "X86ISD::AND";
   20534   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   20535   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   20536   case X86ISD::PTEST:              return "X86ISD::PTEST";
   20537   case X86ISD::TESTP:              return "X86ISD::TESTP";
   20538   case X86ISD::TESTM:              return "X86ISD::TESTM";
   20539   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
   20540   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
   20541   case X86ISD::KTEST:              return "X86ISD::KTEST";
   20542   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
   20543   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
   20544   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
   20545   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
   20546   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   20547   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
   20548   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
   20549   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
   20550   case X86ISD::SHUF128:            return "X86ISD::SHUF128";
   20551   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
   20552   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
   20553   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
   20554   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
   20555   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
   20556   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
   20557   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
   20558   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
   20559   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
   20560   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
   20561   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
   20562   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   20563   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
   20564   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
   20565   case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
   20566   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
   20567   case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
   20568   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
   20569   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   20570   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
   20571   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
   20572   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
   20573   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   20574   case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
   20575   case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
   20576   case X86ISD::VRANGE:             return "X86ISD::VRANGE";
   20577   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   20578   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
   20579   case X86ISD::PSADBW:             return "X86ISD::PSADBW";
   20580   case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
   20581   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   20582   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   20583   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
   20584   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
   20585   case X86ISD::MFENCE:             return "X86ISD::MFENCE";
   20586   case X86ISD::SFENCE:             return "X86ISD::SFENCE";
   20587   case X86ISD::LFENCE:             return "X86ISD::LFENCE";
   20588   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
   20589   case X86ISD::SAHF:               return "X86ISD::SAHF";
   20590   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
   20591   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
   20592   case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
   20593   case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
   20594   case X86ISD::VPROT:              return "X86ISD::VPROT";
   20595   case X86ISD::VPROTI:             return "X86ISD::VPROTI";
   20596   case X86ISD::VPSHA:              return "X86ISD::VPSHA";
   20597   case X86ISD::VPSHL:              return "X86ISD::VPSHL";
   20598   case X86ISD::VPCOM:              return "X86ISD::VPCOM";
   20599   case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
   20600   case X86ISD::FMADD:              return "X86ISD::FMADD";
   20601   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
   20602   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
   20603   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
   20604   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
   20605   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
   20606   case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
   20607   case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
   20608   case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
   20609   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
   20610   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
   20611   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
   20612   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
   20613   case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
   20614   case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
   20615   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
   20616   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
   20617   case X86ISD::XTEST:              return "X86ISD::XTEST";
   20618   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
   20619   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
   20620   case X86ISD::SELECT:             return "X86ISD::SELECT";
   20621   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
   20622   case X86ISD::RCP28:              return "X86ISD::RCP28";
   20623   case X86ISD::EXP2:               return "X86ISD::EXP2";
   20624   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
   20625   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
   20626   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
   20627   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
   20628   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
   20629   case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
   20630   case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
   20631   case X86ISD::SCALEF:             return "X86ISD::SCALEF";
   20632   case X86ISD::ADDS:               return "X86ISD::ADDS";
   20633   case X86ISD::SUBS:               return "X86ISD::SUBS";
   20634   case X86ISD::AVG:                return "X86ISD::AVG";
   20635   case X86ISD::MULHRS:             return "X86ISD::MULHRS";
   20636   case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
   20637   case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
   20638   case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
   20639   case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
   20640   case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
   20641   case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
   20642   }
   20643   return nullptr;
   20644 }
   20645 
   20646 // isLegalAddressingMode - Return true if the addressing mode represented
   20647 // by AM is legal for this target, for a load/store of the specified type.
   20648 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
   20649                                               const AddrMode &AM, Type *Ty,
   20650                                               unsigned AS) const {
   20651   // X86 supports extremely general addressing modes.
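            // Illustratively, the general form is BaseReg + Scale*IndexReg + Disp
            // (optionally relative to a global), e.g. movl 16(%rdi,%rcx,4), %eax.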
   20652   CodeModel::Model M = getTargetMachine().getCodeModel();
   20653   Reloc::Model R = getTargetMachine().getRelocationModel();
   20654 
   20655   // X86 allows a sign-extended 32-bit immediate field as a displacement.
   20656   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
   20657     return false;
   20658 
   20659   if (AM.BaseGV) {
   20660     unsigned GVFlags =
   20661       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
   20662 
   20663     // If a reference to this global requires an extra load, we can't fold it.
   20664     if (isGlobalStubReference(GVFlags))
   20665       return false;
   20666 
   20667     // If BaseGV requires a register for the PIC base, we cannot also have a
   20668     // BaseReg specified.
   20669     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
   20670       return false;
   20671 
   20672     // If lower 4G is not available, then we must use rip-relative addressing.
   20673     if ((M != CodeModel::Small || R != Reloc::Static) &&
   20674         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
   20675       return false;
   20676   }
   20677 
   20678   switch (AM.Scale) {
   20679   case 0:
   20680   case 1:
   20681   case 2:
   20682   case 4:
   20683   case 8:
   20684     // These scales always work.
   20685     break;
   20686   case 3:
   20687   case 5:
   20688   case 9:
   20689     // These scales are formed with basereg+scalereg.  Only accept if there is
   20690     // no basereg yet.
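              // For example (illustrative): a scale of 3 can be encoded as
              //   lea (%reg,%reg,2), ...
              // i.e. reg + 2*reg, which consumes the base-register slot.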
   20691     if (AM.HasBaseReg)
   20692       return false;
   20693     break;
   20694   default:  // Other stuff never works.
   20695     return false;
   20696   }
   20697 
   20698   return true;
   20699 }
   20700 
   20701 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
   20702   unsigned Bits = Ty->getScalarSizeInBits();
   20703 
    20704   // 8-bit shifts are always expensive, and versions with a scalar amount aren't
    20705   // any cheaper than those without.
   20706   if (Bits == 8)
   20707     return false;
   20708 
    20709   // On AVX2 there are new vpsllv[dq] instructions (and other shifts) that make
   20710   // variable shifts just as cheap as scalar ones.
   20711   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
   20712     return false;
   20713 
   20714   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
   20715   // fully general vector.
   20716   return true;
   20717 }
   20718 
   20719 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   20720   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   20721     return false;
   20722   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
   20723   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
   20724   return NumBits1 > NumBits2;
   20725 }
   20726 
   20727 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
   20728   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   20729     return false;
   20730 
   20731   if (!isTypeLegal(EVT::getEVT(Ty1)))
   20732     return false;
   20733 
   20734   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
   20735 
   20736   // Assuming the caller doesn't have a zeroext or signext return parameter,
   20737   // truncation all the way down to i1 is valid.
   20738   return true;
   20739 }
   20740 
   20741 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   20742   return isInt<32>(Imm);
   20743 }
   20744 
   20745 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
   20746   // Can also use sub to handle negated immediates.
   20747   return isInt<32>(Imm);
   20748 }
   20749 
   20750 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   20751   if (!VT1.isInteger() || !VT2.isInteger())
   20752     return false;
   20753   unsigned NumBits1 = VT1.getSizeInBits();
   20754   unsigned NumBits2 = VT2.getSizeInBits();
   20755   return NumBits1 > NumBits2;
   20756 }
   20757 
   20758 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   20759   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   20760   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
   20761 }
   20762 
   20763 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
   20764   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   20765   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
   20766 }
   20767 
   20768 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   20769   EVT VT1 = Val.getValueType();
   20770   if (isZExtFree(VT1, VT2))
   20771     return true;
   20772 
   20773   if (Val.getOpcode() != ISD::LOAD)
   20774     return false;
   20775 
   20776   if (!VT1.isSimple() || !VT1.isInteger() ||
   20777       !VT2.isSimple() || !VT2.isInteger())
   20778     return false;
   20779 
   20780   switch (VT1.getSimpleVT().SimpleTy) {
   20781   default: break;
   20782   case MVT::i8:
   20783   case MVT::i16:
   20784   case MVT::i32:
   20785     // X86 has 8, 16, and 32-bit zero-extending loads.
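              // For example (illustrative): movzbl (%rdi), %eax performs the i8
              // load and the zero-extension in a single instruction.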
   20786     return true;
   20787   }
   20788 
   20789   return false;
   20790 }
   20791 
   20792 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
   20793 
   20794 bool
   20795 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   20796   if (!Subtarget->hasAnyFMA())
   20797     return false;
   20798 
   20799   VT = VT.getScalarType();
   20800 
   20801   if (!VT.isSimple())
   20802     return false;
   20803 
   20804   switch (VT.getSimpleVT().SimpleTy) {
   20805   case MVT::f32:
   20806   case MVT::f64:
   20807     return true;
   20808   default:
   20809     break;
   20810   }
   20811 
   20812   return false;
   20813 }
   20814 
   20815 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
   20816   // i16 instructions are longer (0x66 prefix) and potentially slower.
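            // For example (illustrative): addw $1, %ax requires a 0x66
            // operand-size prefix, whereas addl $1, %eax does not.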
   20817   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
   20818 }
   20819 
   20820 /// isShuffleMaskLegal - Targets can use this to indicate that they only
   20821 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
   20822 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
   20823 /// are assumed to be legal.
   20824 bool
   20825 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   20826                                       EVT VT) const {
   20827   if (!VT.isSimple())
   20828     return false;
   20829 
   20830   // Not for i1 vectors
   20831   if (VT.getSimpleVT().getScalarType() == MVT::i1)
   20832     return false;
   20833 
   20834   // Very little shuffling can be done for 64-bit vectors right now.
   20835   if (VT.getSimpleVT().getSizeInBits() == 64)
   20836     return false;
   20837 
   20838   // We only care that the types being shuffled are legal. The lowering can
   20839   // handle any possible shuffle mask that results.
   20840   return isTypeLegal(VT.getSimpleVT());
   20841 }
   20842 
   20843 bool
   20844 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
   20845                                           EVT VT) const {
   20846   // Just delegate to the generic legality, clear masks aren't special.
   20847   return isShuffleMaskLegal(Mask, VT);
   20848 }
   20849 
   20850 //===----------------------------------------------------------------------===//
   20851 //                           X86 Scheduler Hooks
   20852 //===----------------------------------------------------------------------===//
   20853 
   20854 /// Utility function to emit xbegin specifying the start of an RTM region.
   20855 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
   20856                                      const TargetInstrInfo *TII) {
   20857   DebugLoc DL = MI->getDebugLoc();
   20858 
   20859   const BasicBlock *BB = MBB->getBasicBlock();
   20860   MachineFunction::iterator I = ++MBB->getIterator();
   20861 
    20862   // For v = xbegin(), we generate the following:
   20863   //
   20864   // thisMBB:
   20865   //  xbegin sinkMBB
   20866   //
   20867   // mainMBB:
   20868   //  eax = -1
   20869   //
   20870   // sinkMBB:
   20871   //  v = eax
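            //
            // which illustratively corresponds to machine code like:
            //
            //   xbegin  .Lsink        # on abort, resume at .Lsink with EAX set
            //   movl    $-1, %eax     # fallthrough: successful start returns -1
            // .Lsink:
            //   movl    %eax, <v>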
   20872 
   20873   MachineBasicBlock *thisMBB = MBB;
   20874   MachineFunction *MF = MBB->getParent();
   20875   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   20876   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   20877   MF->insert(I, mainMBB);
   20878   MF->insert(I, sinkMBB);
   20879 
   20880   // Transfer the remainder of BB and its successor edges to sinkMBB.
   20881   sinkMBB->splice(sinkMBB->begin(), MBB,
   20882                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   20883   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   20884 
   20885   // thisMBB:
   20886   //  xbegin sinkMBB
   20887   //  # fallthrough to mainMBB
    20888   //  # on abort, jump to sinkMBB
   20889   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
   20890   thisMBB->addSuccessor(mainMBB);
   20891   thisMBB->addSuccessor(sinkMBB);
   20892 
   20893   // mainMBB:
   20894   //  EAX = -1
   20895   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
   20896   mainMBB->addSuccessor(sinkMBB);
   20897 
   20898   // sinkMBB:
   20899   // EAX is live into the sinkMBB
   20900   sinkMBB->addLiveIn(X86::EAX);
   20901   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   20902           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
   20903     .addReg(X86::EAX);
   20904 
   20905   MI->eraseFromParent();
   20906   return sinkMBB;
   20907 }
   20908 
    20909 // FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
    20910 // or XMM0_V32I8 in AVX, all of this code can be replaced with patterns
    20911 // in the .td file.
   20912 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
   20913                                        const TargetInstrInfo *TII) {
   20914   unsigned Opc;
   20915   switch (MI->getOpcode()) {
   20916   default: llvm_unreachable("illegal opcode!");
   20917   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
   20918   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
   20919   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
   20920   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
   20921   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
   20922   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
   20923   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
   20924   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
   20925   }
   20926 
   20927   DebugLoc dl = MI->getDebugLoc();
   20928   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
   20929 
   20930   unsigned NumArgs = MI->getNumOperands();
   20931   for (unsigned i = 1; i < NumArgs; ++i) {
   20932     MachineOperand &Op = MI->getOperand(i);
   20933     if (!(Op.isReg() && Op.isImplicit()))
   20934       MIB.addOperand(Op);
   20935   }
   20936   if (MI->hasOneMemOperand())
   20937     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
   20938 
   20939   BuildMI(*BB, MI, dl,
   20940     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
   20941     .addReg(X86::XMM0);
   20942 
   20943   MI->eraseFromParent();
   20944   return BB;
   20945 }
   20946 
   20947 // FIXME: Custom handling because TableGen doesn't support multiple implicit
   20948 // defs in an instruction pattern
   20949 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
   20950                                        const TargetInstrInfo *TII) {
   20951   unsigned Opc;
   20952   switch (MI->getOpcode()) {
   20953   default: llvm_unreachable("illegal opcode!");
   20954   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
   20955   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
   20956   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
   20957   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
   20958   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
   20959   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
   20960   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
   20961   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
   20962   }
   20963 
   20964   DebugLoc dl = MI->getDebugLoc();
   20965   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
   20966 
    20967   unsigned NumArgs = MI->getNumOperands(); // operand 0 (the result) is skipped below
   20968   for (unsigned i = 1; i < NumArgs; ++i) {
   20969     MachineOperand &Op = MI->getOperand(i);
   20970     if (!(Op.isReg() && Op.isImplicit()))
   20971       MIB.addOperand(Op);
   20972   }
   20973   if (MI->hasOneMemOperand())
   20974     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
   20975 
   20976   BuildMI(*BB, MI, dl,
   20977     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
   20978     .addReg(X86::ECX);
   20979 
   20980   MI->eraseFromParent();
   20981   return BB;
   20982 }
   20983 
   20984 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
   20985                                       const X86Subtarget *Subtarget) {
   20986   DebugLoc dl = MI->getDebugLoc();
   20987   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   20988   // Address into RAX/EAX, other two args into ECX, EDX.
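            // Illustratively (64-bit), the emitted sequence looks like:
            //   leaq  <addr>, %rax
            //   movl  <op1>, %ecx
            //   movl  <op2>, %edx
            //   monitor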
   20989   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
   20990   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
   20991   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
   20992   for (int i = 0; i < X86::AddrNumOperands; ++i)
   20993     MIB.addOperand(MI->getOperand(i));
   20994 
   20995   unsigned ValOps = X86::AddrNumOperands;
   20996   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
   20997     .addReg(MI->getOperand(ValOps).getReg());
   20998   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
   20999     .addReg(MI->getOperand(ValOps+1).getReg());
   21000 
   21001   // The instruction doesn't actually take any operands though.
   21002   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
   21003 
   21004   MI->eraseFromParent(); // The pseudo is gone now.
   21005   return BB;
   21006 }
   21007 
   21008 MachineBasicBlock *
   21009 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
   21010                                                  MachineBasicBlock *MBB) const {
   21011   // Emit va_arg instruction on X86-64.
   21012 
   21013   // Operands to this pseudo-instruction:
   21014   // 0  ) Output        : destination address (reg)
   21015   // 1-5) Input         : va_list address (addr, i64mem)
   21016   // 6  ) ArgSize       : Size (in bytes) of vararg type
   21017   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
   21018   // 8  ) Align         : Alignment of type
   21019   // 9  ) EFLAGS (implicit-def)
   21020 
   21021   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
   21022   static_assert(X86::AddrNumOperands == 5,
   21023                 "VAARG_64 assumes 5 address operands");
   21024 
   21025   unsigned DestReg = MI->getOperand(0).getReg();
   21026   MachineOperand &Base = MI->getOperand(1);
   21027   MachineOperand &Scale = MI->getOperand(2);
   21028   MachineOperand &Index = MI->getOperand(3);
   21029   MachineOperand &Disp = MI->getOperand(4);
   21030   MachineOperand &Segment = MI->getOperand(5);
   21031   unsigned ArgSize = MI->getOperand(6).getImm();
   21032   unsigned ArgMode = MI->getOperand(7).getImm();
   21033   unsigned Align = MI->getOperand(8).getImm();
   21034 
   21035   // Memory Reference
   21036   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
   21037   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   21038   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   21039 
   21040   // Machine Information
   21041   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   21042   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   21043   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
   21044   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
   21045   DebugLoc DL = MI->getDebugLoc();
   21046 
   21047   // struct va_list {
   21048   //   i32   gp_offset
   21049   //   i32   fp_offset
   21050   //   i64   overflow_area (address)
   21051   //   i64   reg_save_area (address)
   21052   // }
   21053   // sizeof(va_list) = 24
   21054   // alignment(va_list) = 8
   21055 
   21056   unsigned TotalNumIntRegs = 6;
   21057   unsigned TotalNumXMMRegs = 8;
   21058   bool UseGPOffset = (ArgMode == 1);
   21059   bool UseFPOffset = (ArgMode == 2);
   21060   unsigned MaxOffset = TotalNumIntRegs * 8 +
   21061                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
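            // For example (illustrative): for a gp_offset argument MaxOffset is
            // 6*8 = 48; for an fp_offset argument it is 6*8 + 8*16 = 176, i.e. the
            // full size of the register save area.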
   21062 
    21063   // Align ArgSize to a multiple of 8.
   21064   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
   21065   bool NeedsAlign = (Align > 8);
   21066 
   21067   MachineBasicBlock *thisMBB = MBB;
   21068   MachineBasicBlock *overflowMBB;
   21069   MachineBasicBlock *offsetMBB;
   21070   MachineBasicBlock *endMBB;
   21071 
   21072   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
   21073   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
   21074   unsigned OffsetReg = 0;
   21075 
   21076   if (!UseGPOffset && !UseFPOffset) {
   21077     // If we only pull from the overflow region, we don't create a branch.
   21078     // We don't need to alter control flow.
   21079     OffsetDestReg = 0; // unused
   21080     OverflowDestReg = DestReg;
   21081 
   21082     offsetMBB = nullptr;
   21083     overflowMBB = thisMBB;
   21084     endMBB = thisMBB;
   21085   } else {
   21086     // First emit code to check if gp_offset (or fp_offset) is below the bound.
   21087     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
   21088     // If not, pull from overflow_area. (branch to overflowMBB)
   21089     //
   21090     //       thisMBB
   21091     //         |     .
   21092     //         |        .
   21093     //     offsetMBB   overflowMBB
   21094     //         |        .
   21095     //         |     .
   21096     //        endMBB
   21097 
   21098     // Registers for the PHI in endMBB
   21099     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
   21100     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
   21101 
   21102     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   21103     MachineFunction *MF = MBB->getParent();
   21104     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   21105     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   21106     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   21107 
   21108     MachineFunction::iterator MBBIter = ++MBB->getIterator();
   21109 
   21110     // Insert the new basic blocks
   21111     MF->insert(MBBIter, offsetMBB);
   21112     MF->insert(MBBIter, overflowMBB);
   21113     MF->insert(MBBIter, endMBB);
   21114 
   21115     // Transfer the remainder of MBB and its successor edges to endMBB.
   21116     endMBB->splice(endMBB->begin(), thisMBB,
   21117                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
   21118     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
   21119 
   21120     // Make offsetMBB and overflowMBB successors of thisMBB
   21121     thisMBB->addSuccessor(offsetMBB);
   21122     thisMBB->addSuccessor(overflowMBB);
   21123 
   21124     // endMBB is a successor of both offsetMBB and overflowMBB
   21125     offsetMBB->addSuccessor(endMBB);
   21126     overflowMBB->addSuccessor(endMBB);
   21127 
   21128     // Load the offset value into a register
   21129     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   21130     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
   21131       .addOperand(Base)
   21132       .addOperand(Scale)
   21133       .addOperand(Index)
   21134       .addDisp(Disp, UseFPOffset ? 4 : 0)
   21135       .addOperand(Segment)
   21136       .setMemRefs(MMOBegin, MMOEnd);
   21137 
   21138     // Check if there is enough room left to pull this argument.
   21139     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
   21140       .addReg(OffsetReg)
   21141       .addImm(MaxOffset + 8 - ArgSizeA8);
   21142 
   21143     // Branch to "overflowMBB" if offset >= max
   21144     // Fall through to "offsetMBB" otherwise
   21145     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
   21146       .addMBB(overflowMBB);
   21147   }
   21148 
   21149   // In offsetMBB, emit code to use the reg_save_area.
   21150   if (offsetMBB) {
   21151     assert(OffsetReg != 0);
   21152 
   21153     // Read the reg_save_area address.
   21154     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
   21155     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
   21156       .addOperand(Base)
   21157       .addOperand(Scale)
   21158       .addOperand(Index)
   21159       .addDisp(Disp, 16)
   21160       .addOperand(Segment)
   21161       .setMemRefs(MMOBegin, MMOEnd);
   21162 
   21163     // Zero-extend the offset
   21164     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    21165     BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
    21166       .addImm(0)
    21167       .addReg(OffsetReg)
    21168       .addImm(X86::sub_32bit);
   21169 
   21170     // Add the offset to the reg_save_area to get the final address.
   21171     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
   21172       .addReg(OffsetReg64)
   21173       .addReg(RegSaveReg);
   21174 
   21175     // Compute the offset for the next argument
   21176     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   21177     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
   21178       .addReg(OffsetReg)
   21179       .addImm(UseFPOffset ? 16 : 8);
   21180 
   21181     // Store it back into the va_list.
   21182     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
   21183       .addOperand(Base)
   21184       .addOperand(Scale)
   21185       .addOperand(Index)
   21186       .addDisp(Disp, UseFPOffset ? 4 : 0)
   21187       .addOperand(Segment)
   21188       .addReg(NextOffsetReg)
   21189       .setMemRefs(MMOBegin, MMOEnd);
   21190 
   21191     // Jump to endMBB
   21192     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
   21193       .addMBB(endMBB);
   21194   }
   21195 
   21196   //
   21197   // Emit code to use overflow area
   21198   //
   21199 
   21200   // Load the overflow_area address into a register.
   21201   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
   21202   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
   21203     .addOperand(Base)
   21204     .addOperand(Scale)
   21205     .addOperand(Index)
   21206     .addDisp(Disp, 8)
   21207     .addOperand(Segment)
   21208     .setMemRefs(MMOBegin, MMOEnd);
   21209 
   21210   // If we need to align it, do so. Otherwise, just copy the address
   21211   // to OverflowDestReg.
   21212   if (NeedsAlign) {
   21213     // Align the overflow address
   21214     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
   21215     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
   21216 
   21217     // aligned_addr = (addr + (align-1)) & ~(align-1)
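              // For example (illustrative): with addr = 0x1004 and Align = 16,
              // (0x1004 + 15) & ~15 = 0x1010.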
   21218     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
   21219       .addReg(OverflowAddrReg)
   21220       .addImm(Align-1);
   21221 
   21222     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
   21223       .addReg(TmpReg)
   21224       .addImm(~(uint64_t)(Align-1));
   21225   } else {
   21226     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
   21227       .addReg(OverflowAddrReg);
   21228   }
   21229 
   21230   // Compute the next overflow address after this argument.
   21231   // (the overflow address should be kept 8-byte aligned)
   21232   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
   21233   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
   21234     .addReg(OverflowDestReg)
   21235     .addImm(ArgSizeA8);
   21236 
   21237   // Store the new overflow address.
   21238   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
   21239     .addOperand(Base)
   21240     .addOperand(Scale)
   21241     .addOperand(Index)
   21242     .addDisp(Disp, 8)
   21243     .addOperand(Segment)
   21244     .addReg(NextAddrReg)
   21245     .setMemRefs(MMOBegin, MMOEnd);
   21246 
   21247   // If we branched, emit the PHI to the front of endMBB.
   21248   if (offsetMBB) {
   21249     BuildMI(*endMBB, endMBB->begin(), DL,
   21250             TII->get(X86::PHI), DestReg)
   21251       .addReg(OffsetDestReg).addMBB(offsetMBB)
   21252       .addReg(OverflowDestReg).addMBB(overflowMBB);
   21253   }
   21254 
   21255   // Erase the pseudo instruction
   21256   MI->eraseFromParent();
   21257 
   21258   return endMBB;
   21259 }
   21260 
   21261 MachineBasicBlock *
   21262 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
   21263                                                  MachineInstr *MI,
   21264                                                  MachineBasicBlock *MBB) const {
   21265   // Emit code to save XMM registers to the stack. The ABI says that the
   21266   // number of registers to save is given in %al, so it's theoretically
    21267   // possible to do an indirect jump trick to avoid saving all of them;
    21268   // however, this code takes a simpler approach and just executes all
   21269   // of the stores if %al is non-zero. It's less code, and it's probably
   21270   // easier on the hardware branch predictor, and stores aren't all that
   21271   // expensive anyway.
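            //
            // Illustratively (non-Win64), the emitted code has the shape:
            //   testb %al, %al
            //   je    <EndMBB>
            //   movaps %xmm0, <slot 0>
            //   ...
            // <EndMBB>: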
   21272 
   21273   // Create the new basic blocks. One block contains all the XMM stores,
   21274   // and one block is the final destination regardless of whether any
   21275   // stores were performed.
   21276   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   21277   MachineFunction *F = MBB->getParent();
   21278   MachineFunction::iterator MBBIter = ++MBB->getIterator();
   21279   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
   21280   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
   21281   F->insert(MBBIter, XMMSaveMBB);
   21282   F->insert(MBBIter, EndMBB);
   21283 
   21284   // Transfer the remainder of MBB and its successor edges to EndMBB.
   21285   EndMBB->splice(EndMBB->begin(), MBB,
   21286                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   21287   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
   21288 
   21289   // The original block will now fall through to the XMM save block.
   21290   MBB->addSuccessor(XMMSaveMBB);
   21291   // The XMMSaveMBB will fall through to the end block.
   21292   XMMSaveMBB->addSuccessor(EndMBB);
   21293 
   21294   // Now add the instructions.
   21295   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   21296   DebugLoc DL = MI->getDebugLoc();
   21297 
   21298   unsigned CountReg = MI->getOperand(0).getReg();
   21299   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
   21300   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
   21301 
   21302   if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) {
   21303     // If %al is 0, branch around the XMM save block.
   21304     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
   21305     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
   21306     MBB->addSuccessor(EndMBB);
   21307   }
   21308 
   21309   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
   21310   // that was just emitted, but clearly shouldn't be "saved".
   21311   assert((MI->getNumOperands() <= 3 ||
   21312           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
   21313           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
   21314          && "Expected last argument to be EFLAGS");
   21315   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
   21316   // In the XMM save block, save all the XMM argument registers.
   21317   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
   21318     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
   21319     MachineMemOperand *MMO = F->getMachineMemOperand(
   21320         MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
   21321         MachineMemOperand::MOStore,
   21322         /*Size=*/16, /*Align=*/16);
   21323     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
   21324       .addFrameIndex(RegSaveFrameIndex)
   21325       .addImm(/*Scale=*/1)
   21326       .addReg(/*IndexReg=*/0)
   21327       .addImm(/*Disp=*/Offset)
   21328       .addReg(/*Segment=*/0)
   21329       .addReg(MI->getOperand(i).getReg())
   21330       .addMemOperand(MMO);
   21331   }
   21332 
   21333   MI->eraseFromParent();   // The pseudo instruction is gone now.
   21334 
   21335   return EndMBB;
   21336 }
   21337 
   21338 // The EFLAGS operand of SelectItr might be missing a kill marker
   21339 // because there were multiple uses of EFLAGS, and ISel didn't know
   21340 // which to mark. Figure out whether SelectItr should have had a
   21341 // kill marker, and set it if it should. Returns the correct kill
   21342 // marker value.
   21343 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
   21344                                      MachineBasicBlock* BB,
   21345                                      const TargetRegisterInfo* TRI) {
   21346   // Scan forward through BB for a use/def of EFLAGS.
   21347   MachineBasicBlock::iterator miI(std::next(SelectItr));
   21348   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
   21349     const MachineInstr& mi = *miI;
   21350     if (mi.readsRegister(X86::EFLAGS))
   21351       return false;
   21352     if (mi.definesRegister(X86::EFLAGS))
   21353       break; // Should have kill-flag - update below.
   21354   }
   21355 
   21356   // If we hit the end of the block, check whether EFLAGS is live into a
   21357   // successor.
   21358   if (miI == BB->end()) {
   21359     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
   21360                                           sEnd = BB->succ_end();
   21361          sItr != sEnd; ++sItr) {
   21362       MachineBasicBlock* succ = *sItr;
   21363       if (succ->isLiveIn(X86::EFLAGS))
   21364         return false;
   21365     }
   21366   }
   21367 
   21368   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
   21369   // out. SelectMI should have a kill flag on EFLAGS.
   21370   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
   21371   return true;
   21372 }
   21373 
   21374 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
   21375 // together with other CMOV pseudo-opcodes into a single basic-block with
   21376 // conditional jump around it.
   21377 static bool isCMOVPseudo(MachineInstr *MI) {
   21378   switch (MI->getOpcode()) {
   21379   case X86::CMOV_FR32:
   21380   case X86::CMOV_FR64:
   21381   case X86::CMOV_GR8:
   21382   case X86::CMOV_GR16:
   21383   case X86::CMOV_GR32:
   21384   case X86::CMOV_RFP32:
   21385   case X86::CMOV_RFP64:
   21386   case X86::CMOV_RFP80:
   21387   case X86::CMOV_V2F64:
   21388   case X86::CMOV_V2I64:
   21389   case X86::CMOV_V4F32:
   21390   case X86::CMOV_V4F64:
   21391   case X86::CMOV_V4I64:
   21392   case X86::CMOV_V16F32:
   21393   case X86::CMOV_V8F32:
   21394   case X86::CMOV_V8F64:
   21395   case X86::CMOV_V8I64:
   21396   case X86::CMOV_V8I1:
   21397   case X86::CMOV_V16I1:
   21398   case X86::CMOV_V32I1:
   21399   case X86::CMOV_V64I1:
   21400     return true;
   21401 
   21402   default:
   21403     return false;
   21404   }
   21405 }
   21406 
   21407 MachineBasicBlock *
   21408 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
   21409                                      MachineBasicBlock *BB) const {
   21410   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   21411   DebugLoc DL = MI->getDebugLoc();
   21412 
   21413   // To "insert" a SELECT_CC instruction, we actually have to insert the
   21414   // diamond control-flow pattern.  The incoming instruction knows the
   21415   // destination vreg to set, the condition code register to branch on, the
   21416   // true/false values to select between, and a branch opcode to use.
   21417   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   21418   MachineFunction::iterator It = ++BB->getIterator();
   21419 
   21420   //  thisMBB:
   21421   //  ...
   21422   //   TrueVal = ...
   21423   //   cmpTY ccX, r1, r2
   21424   //   bCC copy1MBB
   21425   //   fallthrough --> copy0MBB
   21426   MachineBasicBlock *thisMBB = BB;
   21427   MachineFunction *F = BB->getParent();
   21428 
   21429   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
   21430   // as described above, by inserting a BB, and then making a PHI at the join
   21431   // point to select the true and false operands of the CMOV in the PHI.
   21432   //
   21433   // The code also handles two different cases of multiple CMOV opcodes
   21434   // in a row.
   21435   //
   21436   // Case 1:
    21437   // In this case, there are multiple CMOVs in a row, all of which are based on
   21438   // the same condition setting (or the exact opposite condition setting).
   21439   // In this case we can lower all the CMOVs using a single inserted BB, and
   21440   // then make a number of PHIs at the join point to model the CMOVs. The only
    21441   // trickiness here is that in a case like:
   21442   //
   21443   // t2 = CMOV cond1 t1, f1
   21444   // t3 = CMOV cond1 t2, f2
   21445   //
   21446   // when rewriting this into PHIs, we have to perform some renaming on the
   21447   // temps since you cannot have a PHI operand refer to a PHI result earlier
   21448   // in the same block.  The "simple" but wrong lowering would be:
   21449   //
   21450   // t2 = PHI t1(BB1), f1(BB2)
   21451   // t3 = PHI t2(BB1), f2(BB2)
   21452   //
   21453   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
   21454   // renaming is to note that on the path through BB1, t2 is really just a
   21455   // copy of t1, and do that renaming, properly generating:
   21456   //
   21457   // t2 = PHI t1(BB1), f1(BB2)
   21458   // t3 = PHI t1(BB1), f2(BB2)
   21459   //
   21460   // Case 2, we lower cascaded CMOVs such as
   21461   //
   21462   //   (CMOV (CMOV F, T, cc1), T, cc2)
   21463   //
    21464   // to two successive branches.  For that, we look for another CMOV as the
   21465   // following instruction.
   21466   //
   21467   // Without this, we would add a PHI between the two jumps, which ends up
   21468   // creating a few copies all around. For instance, for
   21469   //
   21470   //    (sitofp (zext (fcmp une)))
   21471   //
   21472   // we would generate:
   21473   //
   21474   //         ucomiss %xmm1, %xmm0
   21475   //         movss  <1.0f>, %xmm0
   21476   //         movaps  %xmm0, %xmm1
   21477   //         jne     .LBB5_2
   21478   //         xorps   %xmm1, %xmm1
   21479   // .LBB5_2:
   21480   //         jp      .LBB5_4
   21481   //         movaps  %xmm1, %xmm0
   21482   // .LBB5_4:
   21483   //         retq
   21484   //
   21485   // because this custom-inserter would have generated:
   21486   //
   21487   //   A
   21488   //   | \
   21489   //   |  B
   21490   //   | /
   21491   //   C
   21492   //   | \
   21493   //   |  D
   21494   //   | /
   21495   //   E
   21496   //
   21497   // A: X = ...; Y = ...
   21498   // B: empty
   21499   // C: Z = PHI [X, A], [Y, B]
   21500   // D: empty
   21501   // E: PHI [X, C], [Z, D]
   21502   //
   21503   // If we lower both CMOVs in a single step, we can instead generate:
   21504   //
   21505   //   A
   21506   //   | \
   21507   //   |  C
   21508   //   | /|
   21509   //   |/ |
   21510   //   |  |
   21511   //   |  D
   21512   //   | /
   21513   //   E
   21514   //
   21515   // A: X = ...; Y = ...
   21516   // D: empty
   21517   // E: PHI [X, A], [X, C], [Y, D]
   21518   //
   21519   // Which, in our sitofp/fcmp example, gives us something like:
   21520   //
   21521   //         ucomiss %xmm1, %xmm0
   21522   //         movss  <1.0f>, %xmm0
   21523   //         jne     .LBB5_4
   21524   //         jp      .LBB5_4
   21525   //         xorps   %xmm0, %xmm0
   21526   // .LBB5_4:
   21527   //         retq
   21528   //
   21529   MachineInstr *CascadedCMOV = nullptr;
   21530   MachineInstr *LastCMOV = MI;
   21531   X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
   21532   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
   21533   MachineBasicBlock::iterator NextMIIt =
   21534       std::next(MachineBasicBlock::iterator(MI));
   21535 
   21536   // Check for case 1, where there are multiple CMOVs with the same condition
   21537   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
   21538   // number of jumps the most.
   21539 
   21540   if (isCMOVPseudo(MI)) {
   21541     // See if we have a string of CMOVS with the same condition.
   21542     while (NextMIIt != BB->end() &&
   21543            isCMOVPseudo(NextMIIt) &&
   21544            (NextMIIt->getOperand(3).getImm() == CC ||
   21545             NextMIIt->getOperand(3).getImm() == OppCC)) {
   21546       LastCMOV = &*NextMIIt;
   21547       ++NextMIIt;
   21548     }
   21549   }
   21550 
    21551   // Now check for case 2, but only if we didn't already find case 1,
    21552   // as indicated by LastCMOV == MI.
   21553   if (LastCMOV == MI &&
   21554       NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
   21555       NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
   21556       NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
   21557     CascadedCMOV = &*NextMIIt;
   21558   }
   21559 
   21560   MachineBasicBlock *jcc1MBB = nullptr;
   21561 
   21562   // If we have a cascaded CMOV, we lower it to two successive branches to
   21563   // the same block.  EFLAGS is used by both, so mark it as live in the second.
   21564   if (CascadedCMOV) {
   21565     jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
   21566     F->insert(It, jcc1MBB);
   21567     jcc1MBB->addLiveIn(X86::EFLAGS);
   21568   }
   21569 
   21570   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
   21571   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
   21572   F->insert(It, copy0MBB);
   21573   F->insert(It, sinkMBB);
   21574 
   21575   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   21576   // live into the sink and copy blocks.
   21577   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   21578 
   21579   MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
   21580   if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
   21581       !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
   21582     copy0MBB->addLiveIn(X86::EFLAGS);
   21583     sinkMBB->addLiveIn(X86::EFLAGS);
   21584   }
   21585 
   21586   // Transfer the remainder of BB and its successor edges to sinkMBB.
   21587   sinkMBB->splice(sinkMBB->begin(), BB,
   21588                   std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
   21589   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
   21590 
   21591   // Add the true and fallthrough blocks as its successors.
   21592   if (CascadedCMOV) {
   21593     // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
   21594     BB->addSuccessor(jcc1MBB);
   21595 
    21596     // In that case, jcc1MBB will itself either fall through to copy0MBB or
    21597     // jump to sinkMBB.
   21598     jcc1MBB->addSuccessor(copy0MBB);
   21599     jcc1MBB->addSuccessor(sinkMBB);
   21600   } else {
   21601     BB->addSuccessor(copy0MBB);
   21602   }
   21603 
   21604   // The true block target of the first (or only) branch is always sinkMBB.
   21605   BB->addSuccessor(sinkMBB);
   21606 
   21607   // Create the conditional branch instruction.
   21608   unsigned Opc = X86::GetCondBranchFromCond(CC);
   21609   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
   21610 
   21611   if (CascadedCMOV) {
   21612     unsigned Opc2 = X86::GetCondBranchFromCond(
   21613         (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
   21614     BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
   21615   }
   21616 
   21617   //  copy0MBB:
   21618   //   %FalseValue = ...
   21619   //   # fallthrough to sinkMBB
   21620   copy0MBB->addSuccessor(sinkMBB);
   21621 
   21622   //  sinkMBB:
   21623   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
   21624   //  ...
   21625   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
   21626   MachineBasicBlock::iterator MIItEnd =
   21627     std::next(MachineBasicBlock::iterator(LastCMOV));
   21628   MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
   21629   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
   21630   MachineInstrBuilder MIB;
   21631 
   21632   // As we are creating the PHIs, we have to be careful if there is more than
   21633   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
   21634   // PHIs have to reference the individual true/false inputs from earlier PHIs.
   21635   // That also means that PHI construction must work forward from earlier to
   21636   // later, and that the code must maintain a mapping from each earlier PHI's
   21637   // destination register to the registers that went into that PHI.
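           //
           // A rough sketch with made-up virtual registers: if an earlier CMOV defines
           // %a from %x (the copy0MBB value) and %y (the thisMBB value), and a later
           // CMOV in the group uses %a as its copy0MBB-side input together with %t,
           // the PHIs emitted in sinkMBB are
           //   %a = PHI [ %x, copy0MBB ], [ %y, thisMBB ]
           //   %b = PHI [ %x, copy0MBB ], [ %t, thisMBB ]
           // i.e. the use of %a is rewritten to %x via RegRewriteTable, since a PHI
           // operand must be a value available in the corresponding predecessor.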
   21638 
   21639   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
   21640     unsigned DestReg = MIIt->getOperand(0).getReg();
   21641     unsigned Op1Reg = MIIt->getOperand(1).getReg();
   21642     unsigned Op2Reg = MIIt->getOperand(2).getReg();
   21643 
   21644     // If the CMOV we are generating uses the opposite condition from the
   21645     // jump we generated, then we have to swap the operands for the PHI that
   21646     // is going to be generated.
   21647     if (MIIt->getOperand(3).getImm() == OppCC)
   21648         std::swap(Op1Reg, Op2Reg);
   21649 
   21650     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
   21651       Op1Reg = RegRewriteTable[Op1Reg].first;
   21652 
   21653     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
   21654       Op2Reg = RegRewriteTable[Op2Reg].second;
   21655 
   21656     MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
   21657                   TII->get(X86::PHI), DestReg)
   21658           .addReg(Op1Reg).addMBB(copy0MBB)
   21659           .addReg(Op2Reg).addMBB(thisMBB);
   21660 
   21661     // Add this PHI to the rewrite table.
   21662     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
   21663   }
   21664 
   21665   // If we have a cascaded CMOV, the second Jcc provides the same incoming
   21666   // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
   21667   if (CascadedCMOV) {
   21668     MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
   21669     // Copy the PHI result to the register defined by the second CMOV.
   21670     BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
   21671             DL, TII->get(TargetOpcode::COPY),
   21672             CascadedCMOV->getOperand(0).getReg())
   21673         .addReg(MI->getOperand(0).getReg());
   21674     CascadedCMOV->eraseFromParent();
   21675   }
   21676 
   21677   // Now remove the CMOV(s).
   21678   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
   21679     (MIIt++)->eraseFromParent();
   21680 
   21681   return sinkMBB;
   21682 }
   21683 
   21684 MachineBasicBlock *
   21685 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
   21686                                        MachineBasicBlock *BB) const {
   21687   // Lower the following atomic floating-point modification pattern:
   21688   //   a.store(reg OP a.load(acquire), release)
   21689   // into:
   21690   //   OPss (%gpr), %xmm
   21691   //   movss %xmm, (%gpr)
   21692   // or the sd equivalent for 64-bit operations.
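           //
           // For illustration only (the registers are arbitrary), RELEASE_FADD32mr with
           // the address in %rdi and the addend in %xmm0 becomes roughly:
           //   addss (%rdi), %xmm0
           //   movss %xmm0, (%rdi)
           // No explicit fence is required, since on x86 ordinary loads already provide
           // acquire ordering and ordinary stores already provide release ordering.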
   21693   unsigned MOp, FOp;
   21694   switch (MI->getOpcode()) {
   21695   default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
   21696   case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
   21697   case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
   21698   }
   21699   const X86InstrInfo *TII = Subtarget->getInstrInfo();
   21700   DebugLoc DL = MI->getDebugLoc();
   21701   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   21702   MachineOperand MSrc = MI->getOperand(0);
   21703   unsigned VSrc = MI->getOperand(5).getReg();
   21704   const MachineOperand &Disp = MI->getOperand(3);
   21705   MachineOperand ZeroDisp = MachineOperand::CreateImm(0);
   21706   bool hasDisp = Disp.isGlobal() || Disp.isImm();
   21707   if (hasDisp && MSrc.isReg())
   21708     MSrc.setIsKill(false);
   21709   MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
   21710                                 .addOperand(/*Base=*/MSrc)
   21711                                 .addImm(/*Scale=*/1)
   21712                                 .addReg(/*Index=*/0)
   21713                                 .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
   21714                                 .addReg(0);
   21715   MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
   21716                               MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
   21717                           .addReg(VSrc)
   21718                           .addOperand(/*Base=*/MSrc)
   21719                           .addImm(/*Scale=*/1)
   21720                           .addReg(/*Index=*/0)
   21721                           .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
   21722                           .addReg(/*Segment=*/0);
   21723   MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
   21724   MI->eraseFromParent(); // The pseudo instruction is gone now.
   21725   return BB;
   21726 }
   21727 
   21728 MachineBasicBlock *
   21729 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
   21730                                         MachineBasicBlock *BB) const {
   21731   MachineFunction *MF = BB->getParent();
   21732   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   21733   DebugLoc DL = MI->getDebugLoc();
   21734   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   21735 
   21736   assert(MF->shouldSplitStack());
   21737 
   21738   const bool Is64Bit = Subtarget->is64Bit();
   21739   const bool IsLP64 = Subtarget->isTarget64BitLP64();
   21740 
   21741   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
   21742   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
   21743 
   21744   // BB:
   21745   //  ... [Till the alloca]
   21746   // If the stacklet is not large enough, jump to mallocMBB
   21747   //
   21748   // bumpMBB:
   21749   //  Allocate by subtracting from RSP
   21750   //  Jump to continueMBB
   21751   //
   21752   // mallocMBB:
   21753   //  Allocate by call to runtime
   21754   //
   21755   // continueMBB:
   21756   //  ...
   21757   //  [rest of original BB]
   21758   //
   21759 
   21760   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   21761   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   21762   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   21763 
   21764   MachineRegisterInfo &MRI = MF->getRegInfo();
   21765   const TargetRegisterClass *AddrRegClass =
   21766       getRegClassFor(getPointerTy(MF->getDataLayout()));
   21767 
   21768   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   21769     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   21770     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
   21771     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
   21772     sizeVReg = MI->getOperand(1).getReg(),
   21773     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
   21774 
   21775   MachineFunction::iterator MBBIter = ++BB->getIterator();
   21776 
   21777   MF->insert(MBBIter, bumpMBB);
   21778   MF->insert(MBBIter, mallocMBB);
   21779   MF->insert(MBBIter, continueMBB);
   21780 
   21781   continueMBB->splice(continueMBB->begin(), BB,
   21782                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
   21783   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
   21784 
   21785   // Add code to the main basic block to check if the stack limit has been hit,
   21786   // and if so, jump to mallocMBB; otherwise, fall through to bumpMBB.
   21787   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
   21788   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
   21789     .addReg(tmpSPVReg).addReg(sizeVReg);
   21790   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
   21791     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
   21792     .addReg(SPLimitVReg);
   21793   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
   21794 
   21795   // bumpMBB simply decreases the stack pointer, since we know the current
   21796   // stacklet has enough space.
   21797   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
   21798     .addReg(SPLimitVReg);
   21799   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
   21800     .addReg(SPLimitVReg);
   21801   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
   21802 
   21803   // mallocMBB calls into a libgcc routine to allocate more space from the heap.
   21804   const uint32_t *RegMask =
   21805       Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
   21806   if (IsLP64) {
   21807     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
   21808       .addReg(sizeVReg);
   21809     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   21810       .addExternalSymbol("__morestack_allocate_stack_space")
   21811       .addRegMask(RegMask)
   21812       .addReg(X86::RDI, RegState::Implicit)
   21813       .addReg(X86::RAX, RegState::ImplicitDefine);
   21814   } else if (Is64Bit) {
   21815     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
   21816       .addReg(sizeVReg);
   21817     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   21818       .addExternalSymbol("__morestack_allocate_stack_space")
   21819       .addRegMask(RegMask)
   21820       .addReg(X86::EDI, RegState::Implicit)
   21821       .addReg(X86::EAX, RegState::ImplicitDefine);
   21822   } else {
   21823     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
   21824       .addImm(12);
   21825     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
   21826     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
   21827       .addExternalSymbol("__morestack_allocate_stack_space")
   21828       .addRegMask(RegMask)
   21829       .addReg(X86::EAX, RegState::ImplicitDefine);
   21830   }
   21831 
   21832   if (!Is64Bit)
   21833     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
   21834       .addImm(16);
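           // (In the 32-bit path above, the size argument is passed on the stack; the
           // SUB32ri of 12 plus the 4-byte PUSH before the call, and the ADD32ri of 16
           // afterwards, presumably keep the outgoing stack 16-byte aligned around the
           // cdecl call.)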
   21835 
   21836   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
   21837     .addReg(IsLP64 ? X86::RAX : X86::EAX);
   21838   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
   21839 
   21840   // Set up the CFG correctly.
   21841   BB->addSuccessor(bumpMBB);
   21842   BB->addSuccessor(mallocMBB);
   21843   mallocMBB->addSuccessor(continueMBB);
   21844   bumpMBB->addSuccessor(continueMBB);
   21845 
   21846   // Take care of the PHI nodes.
   21847   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
   21848           MI->getOperand(0).getReg())
   21849     .addReg(mallocPtrVReg).addMBB(mallocMBB)
   21850     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
   21851 
   21852   // Delete the original pseudo instruction.
   21853   MI->eraseFromParent();
   21854 
   21855   // And we're done.
   21856   return continueMBB;
   21857 }
   21858 
   21859 MachineBasicBlock *
   21860 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
   21861                                         MachineBasicBlock *BB) const {
   21862   assert(!Subtarget->isTargetMachO());
   21863   DebugLoc DL = MI->getDebugLoc();
   21864   MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe(
   21865       *BB->getParent(), *BB, MI, DL, false);
   21866   MachineBasicBlock *ResumeBB = ResumeMI->getParent();
   21867   MI->eraseFromParent(); // The pseudo instruction is gone now.
   21868   return ResumeBB;
   21869 }
   21870 
   21871 MachineBasicBlock *
   21872 X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
   21873                                        MachineBasicBlock *BB) const {
   21874   MachineFunction *MF = BB->getParent();
   21875   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   21876   MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB();
   21877   DebugLoc DL = MI->getDebugLoc();
   21878 
   21879   assert(!isAsynchronousEHPersonality(
   21880              classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
   21881          "SEH does not use catchret!");
   21882 
   21883   // Only 32-bit EH needs to worry about manually restoring stack pointers.
   21884   if (!Subtarget->is32Bit())
   21885     return BB;
   21886 
   21887   // C++ EH creates a new target block to hold the restore code, and wires up
   21888   // the new block to the return destination with a normal JMP_4.
   21889   MachineBasicBlock *RestoreMBB =
   21890       MF->CreateMachineBasicBlock(BB->getBasicBlock());
   21891   assert(BB->succ_size() == 1);
   21892   MF->insert(std::next(BB->getIterator()), RestoreMBB);
   21893   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
   21894   BB->addSuccessor(RestoreMBB);
   21895   MI->getOperand(0).setMBB(RestoreMBB);
   21896 
   21897   auto RestoreMBBI = RestoreMBB->begin();
   21898   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
   21899   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
   21900   return BB;
   21901 }
   21902 
   21903 MachineBasicBlock *
   21904 X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI,
   21905                                        MachineBasicBlock *BB) const {
   21906   MachineFunction *MF = BB->getParent();
   21907   const Constant *PerFn = MF->getFunction()->getPersonalityFn();
   21908   bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
   21909   // Only 32-bit SEH requires special handling for catchpad.
   21910   if (IsSEH && Subtarget->is32Bit()) {
   21911     const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   21912     DebugLoc DL = MI->getDebugLoc();
   21913     BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
   21914   }
   21915   MI->eraseFromParent();
   21916   return BB;
   21917 }
   21918 
   21919 MachineBasicBlock *
   21920 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
   21921                                       MachineBasicBlock *BB) const {
   21922   // This is pretty easy.  We're taking the value that we received from
   21923   // our load from the relocation, sticking it in either RDI (x86-64)
   21924   // or EAX and doing an indirect call.  The return value will then
   21925   // be in the normal return register.
   21926   MachineFunction *F = BB->getParent();
   21927   const X86InstrInfo *TII = Subtarget->getInstrInfo();
   21928   DebugLoc DL = MI->getDebugLoc();
   21929 
   21930   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
   21931   assert(MI->getOperand(3).isGlobal() && "This should be a global");
   21932 
   21933   // Get a register mask for the lowered call.
   21934   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   21935   // proper register mask.
   21936   const uint32_t *RegMask =
   21937       Subtarget->is64Bit() ?
   21938       Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() :
   21939       Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
   21940   if (Subtarget->is64Bit()) {
   21941     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   21942                                       TII->get(X86::MOV64rm), X86::RDI)
   21943     .addReg(X86::RIP)
   21944     .addImm(0).addReg(0)
   21945     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   21946                       MI->getOperand(3).getTargetFlags())
   21947     .addReg(0);
   21948     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
   21949     addDirectMem(MIB, X86::RDI);
   21950     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
   21951   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
   21952     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   21953                                       TII->get(X86::MOV32rm), X86::EAX)
   21954     .addReg(0)
   21955     .addImm(0).addReg(0)
   21956     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   21957                       MI->getOperand(3).getTargetFlags())
   21958     .addReg(0);
   21959     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   21960     addDirectMem(MIB, X86::EAX);
   21961     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   21962   } else {
   21963     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   21964                                       TII->get(X86::MOV32rm), X86::EAX)
   21965     .addReg(TII->getGlobalBaseReg(F))
   21966     .addImm(0).addReg(0)
   21967     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   21968                       MI->getOperand(3).getTargetFlags())
   21969     .addReg(0);
   21970     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   21971     addDirectMem(MIB, X86::EAX);
   21972     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   21973   }
   21974 
   21975   MI->eraseFromParent(); // The pseudo instruction is gone now.
   21976   return BB;
   21977 }
   21978 
   21979 MachineBasicBlock *
   21980 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   21981                                     MachineBasicBlock *MBB) const {
   21982   DebugLoc DL = MI->getDebugLoc();
   21983   MachineFunction *MF = MBB->getParent();
   21984   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   21985   MachineRegisterInfo &MRI = MF->getRegInfo();
   21986 
   21987   const BasicBlock *BB = MBB->getBasicBlock();
   21988   MachineFunction::iterator I = ++MBB->getIterator();
   21989 
   21990   // Memory Reference
   21991   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   21992   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   21993 
   21994   unsigned DstReg;
   21995   unsigned MemOpndSlot = 0;
   21996 
   21997   unsigned CurOp = 0;
   21998 
   21999   DstReg = MI->getOperand(CurOp++).getReg();
   22000   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
   22001   assert(RC->hasType(MVT::i32) && "Invalid destination!");
   22002   unsigned mainDstReg = MRI.createVirtualRegister(RC);
   22003   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
   22004 
   22005   MemOpndSlot = CurOp;
   22006 
   22007   MVT PVT = getPointerTy(MF->getDataLayout());
   22008   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   22009          "Invalid Pointer Size!");
   22010 
   22011   // For v = setjmp(buf), we generate
   22012   //
   22013   // thisMBB:
   22014   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
   22015   //  SjLjSetup restoreMBB
   22016   //
   22017   // mainMBB:
   22018   //  v_main = 0
   22019   //
   22020   // sinkMBB:
   22021   //  v = phi(main, restore)
   22022   //
   22023   // restoreMBB:
   22024   //  if the base pointer is being used, load it from the frame
   22025   //  v_restore = 1
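           //
           // Only the resume address is stored by this function (at LabelOffset, one
           // pointer size into the buffer).  emitEHSjLjLongJmp below reloads FP from
           // offset 0, the IP from LabelOffset, and SP from SPOffset, so the buffer is
           // effectively laid out in pointer-sized slots as:
           //   buf[0] = FP, buf[1] = resume IP (restoreMBB), buf[2] = SP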
   22026 
   22027   MachineBasicBlock *thisMBB = MBB;
   22028   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   22029   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   22030   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
   22031   MF->insert(I, mainMBB);
   22032   MF->insert(I, sinkMBB);
   22033   MF->push_back(restoreMBB);
   22034   restoreMBB->setHasAddressTaken();
   22035 
   22036   MachineInstrBuilder MIB;
   22037 
   22038   // Transfer the remainder of BB and its successor edges to sinkMBB.
   22039   sinkMBB->splice(sinkMBB->begin(), MBB,
   22040                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   22041   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   22042 
   22043   // thisMBB:
   22044   unsigned PtrStoreOpc = 0;
   22045   unsigned LabelReg = 0;
   22046   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   22047   Reloc::Model RM = MF->getTarget().getRelocationModel();
   22048   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
   22049                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
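           // The address of restoreMBB can only be stored as an immediate when it is a
           // link-time constant that fits in 32 bits, i.e. with the small code model
           // and a non-PIC relocation model; otherwise it is materialized with an LEA
           // below.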
   22050 
   22051   // Prepare IP either in reg or imm.
   22052   if (!UseImmLabel) {
   22053     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
   22054     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
   22055     LabelReg = MRI.createVirtualRegister(PtrRC);
   22056     if (Subtarget->is64Bit()) {
   22057       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
   22058               .addReg(X86::RIP)
   22059               .addImm(0)
   22060               .addReg(0)
   22061               .addMBB(restoreMBB)
   22062               .addReg(0);
   22063     } else {
   22064       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
   22065       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
   22066               .addReg(XII->getGlobalBaseReg(MF))
   22067               .addImm(0)
   22068               .addReg(0)
   22069               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
   22070               .addReg(0);
   22071     }
   22072   } else
   22073     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
   22074   // Store IP
   22075   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
   22076   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   22077     if (i == X86::AddrDisp)
   22078       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
   22079     else
   22080       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
   22081   }
   22082   if (!UseImmLabel)
   22083     MIB.addReg(LabelReg);
   22084   else
   22085     MIB.addMBB(restoreMBB);
   22086   MIB.setMemRefs(MMOBegin, MMOEnd);
   22087   // Setup
   22088   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
   22089           .addMBB(restoreMBB);
   22090 
   22091   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   22092   MIB.addRegMask(RegInfo->getNoPreservedMask());
   22093   thisMBB->addSuccessor(mainMBB);
   22094   thisMBB->addSuccessor(restoreMBB);
   22095 
   22096   // mainMBB:
   22097   //  v_main = 0 (in mainDstReg)
   22098   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
   22099   mainMBB->addSuccessor(sinkMBB);
   22100 
   22101   // sinkMBB:
   22102   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   22103           TII->get(X86::PHI), DstReg)
   22104     .addReg(mainDstReg).addMBB(mainMBB)
   22105     .addReg(restoreDstReg).addMBB(restoreMBB);
   22106 
   22107   // restoreMBB:
   22108   if (RegInfo->hasBasePointer(*MF)) {
   22109     const bool Uses64BitFramePtr =
   22110         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
   22111     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
   22112     X86FI->setRestoreBasePointer(MF);
   22113     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
   22114     unsigned BasePtr = RegInfo->getBaseRegister();
   22115     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
   22116     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
   22117                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
   22118       .setMIFlag(MachineInstr::FrameSetup);
   22119   }
   22120   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
   22121   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
   22122   restoreMBB->addSuccessor(sinkMBB);
   22123 
   22124   MI->eraseFromParent();
   22125   return sinkMBB;
   22126 }
   22127 
   22128 MachineBasicBlock *
   22129 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
   22130                                      MachineBasicBlock *MBB) const {
   22131   DebugLoc DL = MI->getDebugLoc();
   22132   MachineFunction *MF = MBB->getParent();
   22133   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   22134   MachineRegisterInfo &MRI = MF->getRegInfo();
   22135 
   22136   // Memory Reference
   22137   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   22138   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   22139 
   22140   MVT PVT = getPointerTy(MF->getDataLayout());
   22141   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   22142          "Invalid Pointer Size!");
   22143 
   22144   const TargetRegisterClass *RC =
   22145     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   22146   unsigned Tmp = MRI.createVirtualRegister(RC);
   22147   // Since FP is only updated here but NOT referenced, it's treated as a GPR.
   22148   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   22149   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
   22150   unsigned SP = RegInfo->getStackRegister();
   22151 
   22152   MachineInstrBuilder MIB;
   22153 
   22154   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   22155   const int64_t SPOffset = 2 * PVT.getStoreSize();
   22156 
   22157   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
   22158   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
   22159 
   22160   // Reload FP
   22161   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
   22162   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
   22163     MIB.addOperand(MI->getOperand(i));
   22164   MIB.setMemRefs(MMOBegin, MMOEnd);
   22165   // Reload IP
   22166   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
   22167   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   22168     if (i == X86::AddrDisp)
   22169       MIB.addDisp(MI->getOperand(i), LabelOffset);
   22170     else
   22171       MIB.addOperand(MI->getOperand(i));
   22172   }
   22173   MIB.setMemRefs(MMOBegin, MMOEnd);
   22174   // Reload SP
   22175   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
   22176   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   22177     if (i == X86::AddrDisp)
   22178       MIB.addDisp(MI->getOperand(i), SPOffset);
   22179     else
   22180       MIB.addOperand(MI->getOperand(i));
   22181   }
   22182   MIB.setMemRefs(MMOBegin, MMOEnd);
   22183   // Jump
   22184   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
   22185 
   22186   MI->eraseFromParent();
   22187   return MBB;
   22188 }
   22189 
   22190 // Replace 213-type (isel default) FMA3 instructions with 231-type for
   22191 // accumulator loops. Writing back to the accumulator allows the coalescer
   22192 // to remove extra copies in the loop.
   22193 // FIXME: Do this on AVX512.  We don't support 231 variants yet (PR23937).
   22194 MachineBasicBlock *
   22195 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
   22196                                  MachineBasicBlock *MBB) const {
   22197   MachineOperand &AddendOp = MI->getOperand(3);
   22198 
   22199   // Bail out early if the addend isn't a register - we can't switch these.
   22200   if (!AddendOp.isReg())
   22201     return MBB;
   22202 
   22203   MachineFunction &MF = *MBB->getParent();
   22204   MachineRegisterInfo &MRI = MF.getRegInfo();
   22205 
   22206   // Check whether the addend is defined by a PHI:
   22207   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
   22208   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
   22209   if (!AddendDef.isPHI())
   22210     return MBB;
   22211 
   22212   // Look for the following pattern:
   22213   // loop:
   22214   //   %addend = phi [%entry, 0], [%loop, %result]
   22215   //   ...
   22216   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
   22217 
   22218   // Replace with:
   22219   //   loop:
   22220   //   %addend = phi [%entry, 0], [%loop, %result]
   22221   //   ...
   22222   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
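           //
           // Both forms compute m1 * m2 + addend; the 231 form ties the destination to
           // the accumulator operand, so the loop-carried value can stay in one
           // register instead of needing a fresh copy on every iteration.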
   22223 
   22224   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
   22225     assert(AddendDef.getOperand(i).isReg());
   22226     MachineOperand PHISrcOp = AddendDef.getOperand(i);
   22227     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
   22228     if (&PHISrcInst == MI) {
   22229       // Found a matching instruction.
   22230       unsigned NewFMAOpc = 0;
   22231       switch (MI->getOpcode()) {
   22232         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
   22233         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
   22234         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
   22235         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
   22236         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
   22237         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
   22238         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
   22239         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
   22240         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
   22241         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
   22242         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
   22243         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
   22244         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
   22245         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
   22246         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
   22247         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
   22248         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
   22249         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
   22250         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
   22251         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
   22252 
   22253         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
   22254         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
   22255         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
   22256         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
   22257         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
   22258         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
   22259         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
   22260         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
   22261         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
   22262         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
   22263         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
   22264         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
   22265         default: llvm_unreachable("Unrecognized FMA variant.");
   22266       }
   22267 
   22268       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   22269       MachineInstrBuilder MIB =
   22270         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
   22271         .addOperand(MI->getOperand(0))
   22272         .addOperand(MI->getOperand(3))
   22273         .addOperand(MI->getOperand(2))
   22274         .addOperand(MI->getOperand(1));
   22275       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
   22276       MI->eraseFromParent();
   22277     }
   22278   }
   22279 
   22280   return MBB;
   22281 }
   22282 
   22283 MachineBasicBlock *
   22284 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   22285                                                MachineBasicBlock *BB) const {
   22286   switch (MI->getOpcode()) {
   22287   default: llvm_unreachable("Unexpected instr type to insert");
   22288   case X86::TAILJMPd64:
   22289   case X86::TAILJMPr64:
   22290   case X86::TAILJMPm64:
   22291   case X86::TAILJMPd64_REX:
   22292   case X86::TAILJMPr64_REX:
   22293   case X86::TAILJMPm64_REX:
   22294     llvm_unreachable("TAILJMP64 would not be touched here.");
   22295   case X86::TCRETURNdi64:
   22296   case X86::TCRETURNri64:
   22297   case X86::TCRETURNmi64:
   22298     return BB;
   22299   case X86::WIN_ALLOCA:
   22300     return EmitLoweredWinAlloca(MI, BB);
   22301   case X86::CATCHRET:
   22302     return EmitLoweredCatchRet(MI, BB);
   22303   case X86::CATCHPAD:
   22304     return EmitLoweredCatchPad(MI, BB);
   22305   case X86::SEG_ALLOCA_32:
   22306   case X86::SEG_ALLOCA_64:
   22307     return EmitLoweredSegAlloca(MI, BB);
   22308   case X86::TLSCall_32:
   22309   case X86::TLSCall_64:
   22310     return EmitLoweredTLSCall(MI, BB);
   22311   case X86::CMOV_FR32:
   22312   case X86::CMOV_FR64:
   22313   case X86::CMOV_FR128:
   22314   case X86::CMOV_GR8:
   22315   case X86::CMOV_GR16:
   22316   case X86::CMOV_GR32:
   22317   case X86::CMOV_RFP32:
   22318   case X86::CMOV_RFP64:
   22319   case X86::CMOV_RFP80:
   22320   case X86::CMOV_V2F64:
   22321   case X86::CMOV_V2I64:
   22322   case X86::CMOV_V4F32:
   22323   case X86::CMOV_V4F64:
   22324   case X86::CMOV_V4I64:
   22325   case X86::CMOV_V16F32:
   22326   case X86::CMOV_V8F32:
   22327   case X86::CMOV_V8F64:
   22328   case X86::CMOV_V8I64:
   22329   case X86::CMOV_V8I1:
   22330   case X86::CMOV_V16I1:
   22331   case X86::CMOV_V32I1:
   22332   case X86::CMOV_V64I1:
   22333     return EmitLoweredSelect(MI, BB);
   22334 
   22335   case X86::RELEASE_FADD32mr:
   22336   case X86::RELEASE_FADD64mr:
   22337     return EmitLoweredAtomicFP(MI, BB);
   22338 
   22339   case X86::FP32_TO_INT16_IN_MEM:
   22340   case X86::FP32_TO_INT32_IN_MEM:
   22341   case X86::FP32_TO_INT64_IN_MEM:
   22342   case X86::FP64_TO_INT16_IN_MEM:
   22343   case X86::FP64_TO_INT32_IN_MEM:
   22344   case X86::FP64_TO_INT64_IN_MEM:
   22345   case X86::FP80_TO_INT16_IN_MEM:
   22346   case X86::FP80_TO_INT32_IN_MEM:
   22347   case X86::FP80_TO_INT64_IN_MEM: {
   22348     MachineFunction *F = BB->getParent();
   22349     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   22350     DebugLoc DL = MI->getDebugLoc();
   22351 
   22352     // Change the floating point control register to use "round towards zero"
   22353     // mode when truncating to an integer value.
   22354     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
   22355     addFrameReference(BuildMI(*BB, MI, DL,
   22356                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
   22357 
   22358     // Load the old value of the high byte of the control word...
   22359     unsigned OldCW =
   22360       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
   22361     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
   22362                       CWFrameIdx);
   22363 
   22364     // Set the high part to be round to zero...
   22365     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
   22366       .addImm(0xC7F);
   22367 
   22368     // Reload the modified control word now...
   22369     addFrameReference(BuildMI(*BB, MI, DL,
   22370                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   22371 
   22372     // Restore the memory image of control word to original value
   22373     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
   22374       .addReg(OldCW);
   22375 
   22376     // Get the X86 opcode to use.
   22377     unsigned Opc;
   22378     switch (MI->getOpcode()) {
   22379     default: llvm_unreachable("illegal opcode!");
   22380     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
   22381     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
   22382     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
   22383     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
   22384     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
   22385     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
   22386     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
   22387     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
   22388     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
   22389     }
   22390 
   22391     X86AddressMode AM;
   22392     MachineOperand &Op = MI->getOperand(0);
   22393     if (Op.isReg()) {
   22394       AM.BaseType = X86AddressMode::RegBase;
   22395       AM.Base.Reg = Op.getReg();
   22396     } else {
   22397       AM.BaseType = X86AddressMode::FrameIndexBase;
   22398       AM.Base.FrameIndex = Op.getIndex();
   22399     }
   22400     Op = MI->getOperand(1);
   22401     if (Op.isImm())
   22402       AM.Scale = Op.getImm();
   22403     Op = MI->getOperand(2);
   22404     if (Op.isImm())
   22405       AM.IndexReg = Op.getImm();
   22406     Op = MI->getOperand(3);
   22407     if (Op.isGlobal()) {
   22408       AM.GV = Op.getGlobal();
   22409     } else {
   22410       AM.Disp = Op.getImm();
   22411     }
   22412     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
   22413                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
   22414 
   22415     // Reload the original control word now.
   22416     addFrameReference(BuildMI(*BB, MI, DL,
   22417                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   22418 
   22419     MI->eraseFromParent();   // The pseudo instruction is gone now.
   22420     return BB;
   22421   }
   22422     // String/text processing lowering.
   22423   case X86::PCMPISTRM128REG:
   22424   case X86::VPCMPISTRM128REG:
   22425   case X86::PCMPISTRM128MEM:
   22426   case X86::VPCMPISTRM128MEM:
   22427   case X86::PCMPESTRM128REG:
   22428   case X86::VPCMPESTRM128REG:
   22429   case X86::PCMPESTRM128MEM:
   22430   case X86::VPCMPESTRM128MEM:
   22431     assert(Subtarget->hasSSE42() &&
   22432            "Target must have SSE4.2 or AVX features enabled");
   22433     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
   22434 
   22435   // String/text processing lowering.
   22436   case X86::PCMPISTRIREG:
   22437   case X86::VPCMPISTRIREG:
   22438   case X86::PCMPISTRIMEM:
   22439   case X86::VPCMPISTRIMEM:
   22440   case X86::PCMPESTRIREG:
   22441   case X86::VPCMPESTRIREG:
   22442   case X86::PCMPESTRIMEM:
   22443   case X86::VPCMPESTRIMEM:
   22444     assert(Subtarget->hasSSE42() &&
   22445            "Target must have SSE4.2 or AVX features enabled");
   22446     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
   22447 
   22448   // Thread synchronization.
   22449   case X86::MONITOR:
   22450     return EmitMonitor(MI, BB, Subtarget);
   22451 
   22452   // xbegin
   22453   case X86::XBEGIN:
   22454     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
   22455 
   22456   case X86::VASTART_SAVE_XMM_REGS:
   22457     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
   22458 
   22459   case X86::VAARG_64:
   22460     return EmitVAARG64WithCustomInserter(MI, BB);
   22461 
   22462   case X86::EH_SjLj_SetJmp32:
   22463   case X86::EH_SjLj_SetJmp64:
   22464     return emitEHSjLjSetJmp(MI, BB);
   22465 
   22466   case X86::EH_SjLj_LongJmp32:
   22467   case X86::EH_SjLj_LongJmp64:
   22468     return emitEHSjLjLongJmp(MI, BB);
   22469 
   22470   case TargetOpcode::STATEPOINT:
   22471     // As an implementation detail, STATEPOINT shares the STACKMAP format at
   22472     // this point in the process.  We diverge later.
   22473     return emitPatchPoint(MI, BB);
   22474 
   22475   case TargetOpcode::STACKMAP:
   22476   case TargetOpcode::PATCHPOINT:
   22477     return emitPatchPoint(MI, BB);
   22478 
   22479   case X86::VFMADDPDr213r:
   22480   case X86::VFMADDPSr213r:
   22481   case X86::VFMADDSDr213r:
   22482   case X86::VFMADDSSr213r:
   22483   case X86::VFMSUBPDr213r:
   22484   case X86::VFMSUBPSr213r:
   22485   case X86::VFMSUBSDr213r:
   22486   case X86::VFMSUBSSr213r:
   22487   case X86::VFNMADDPDr213r:
   22488   case X86::VFNMADDPSr213r:
   22489   case X86::VFNMADDSDr213r:
   22490   case X86::VFNMADDSSr213r:
   22491   case X86::VFNMSUBPDr213r:
   22492   case X86::VFNMSUBPSr213r:
   22493   case X86::VFNMSUBSDr213r:
   22494   case X86::VFNMSUBSSr213r:
   22495   case X86::VFMADDSUBPDr213r:
   22496   case X86::VFMADDSUBPSr213r:
   22497   case X86::VFMSUBADDPDr213r:
   22498   case X86::VFMSUBADDPSr213r:
   22499   case X86::VFMADDPDr213rY:
   22500   case X86::VFMADDPSr213rY:
   22501   case X86::VFMSUBPDr213rY:
   22502   case X86::VFMSUBPSr213rY:
   22503   case X86::VFNMADDPDr213rY:
   22504   case X86::VFNMADDPSr213rY:
   22505   case X86::VFNMSUBPDr213rY:
   22506   case X86::VFNMSUBPSr213rY:
   22507   case X86::VFMADDSUBPDr213rY:
   22508   case X86::VFMADDSUBPSr213rY:
   22509   case X86::VFMSUBADDPDr213rY:
   22510   case X86::VFMSUBADDPSr213rY:
   22511     return emitFMA3Instr(MI, BB);
   22512   }
   22513 }
   22514 
   22515 //===----------------------------------------------------------------------===//
   22516 //                           X86 Optimization Hooks
   22517 //===----------------------------------------------------------------------===//
   22518 
   22519 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   22520                                                       APInt &KnownZero,
   22521                                                       APInt &KnownOne,
   22522                                                       const SelectionDAG &DAG,
   22523                                                       unsigned Depth) const {
   22524   unsigned BitWidth = KnownZero.getBitWidth();
   22525   unsigned Opc = Op.getOpcode();
   22526   assert((Opc >= ISD::BUILTIN_OP_END ||
   22527           Opc == ISD::INTRINSIC_WO_CHAIN ||
   22528           Opc == ISD::INTRINSIC_W_CHAIN ||
   22529           Opc == ISD::INTRINSIC_VOID) &&
   22530          "Should use MaskedValueIsZero if you don't know whether Op"
   22531          " is a target node!");
   22532 
   22533   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
   22534   switch (Opc) {
   22535   default: break;
   22536   case X86ISD::ADD:
   22537   case X86ISD::SUB:
   22538   case X86ISD::ADC:
   22539   case X86ISD::SBB:
   22540   case X86ISD::SMUL:
   22541   case X86ISD::UMUL:
   22542   case X86ISD::INC:
   22543   case X86ISD::DEC:
   22544   case X86ISD::OR:
   22545   case X86ISD::XOR:
   22546   case X86ISD::AND:
   22547     // These nodes' second result is a boolean.
   22548     if (Op.getResNo() == 0)
   22549       break;
   22550     // Fallthrough
   22551   case X86ISD::SETCC:
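             // These produce a boolean (0 or 1), so every bit above bit 0 is known to
             // be zero.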
   22552     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
   22553     break;
   22554   case ISD::INTRINSIC_WO_CHAIN: {
   22555     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   22556     unsigned NumLoBits = 0;
   22557     switch (IntId) {
   22558     default: break;
   22559     case Intrinsic::x86_sse_movmsk_ps:
   22560     case Intrinsic::x86_avx_movmsk_ps_256:
   22561     case Intrinsic::x86_sse2_movmsk_pd:
   22562     case Intrinsic::x86_avx_movmsk_pd_256:
   22563     case Intrinsic::x86_mmx_pmovmskb:
   22564     case Intrinsic::x86_sse2_pmovmskb_128:
   22565     case Intrinsic::x86_avx2_pmovmskb: {
   22566       // High bits of movmskp{s|d}, pmovmskb are known zero.
   22567       switch (IntId) {
   22568         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   22569         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
   22570         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
   22571         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
   22572         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
   22573         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
   22574         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
   22575         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
   22576       }
   22577       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
   22578       break;
   22579     }
   22580     }
   22581     break;
   22582   }
   22583   }
   22584 }
   22585 
   22586 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
   22587   SDValue Op,
   22588   const SelectionDAG &,
   22589   unsigned Depth) const {
   22590   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
   22591   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
   22592     return Op.getValueType().getScalarSizeInBits();
   22593 
   22594   // Fallback case.
   22595   return 1;
   22596 }
   22597 
   22598 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
   22599 /// node is a GlobalAddress + offset.
   22600 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   22601                                        const GlobalValue* &GA,
   22602                                        int64_t &Offset) const {
   22603   if (N->getOpcode() == X86ISD::Wrapper) {
   22604     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
   22605       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
   22606       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
   22607       return true;
   22608     }
   22609   }
   22610   return TargetLowering::isGAPlusOffset(N, GA, Offset);
   22611 }
   22612 
   22613 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
   22614 /// same as extracting the high 128-bit part of a 256-bit vector and then
   22615 /// inserting the result into the low part of a new 256-bit vector.
   22616 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
   22617   EVT VT = SVOp->getValueType(0);
   22618   unsigned NumElems = VT.getVectorNumElements();
   22619 
   22620   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   22621   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
   22622     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
   22623         SVOp->getMaskElt(j) >= 0)
   22624       return false;
   22625 
   22626   return true;
   22627 }
   22628 
   22629 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
   22630 /// same as extracting the low 128-bit part of a 256-bit vector and then
   22631 /// inserting the result into the high part of a new 256-bit vector.
   22632 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
   22633   EVT VT = SVOp->getValueType(0);
   22634   unsigned NumElems = VT.getVectorNumElements();
   22635 
   22636   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   22637   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
   22638     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
   22639         SVOp->getMaskElt(j) >= 0)
   22640       return false;
   22641 
   22642   return true;
   22643 }
   22644 
   22645 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
   22646 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
   22647                                         TargetLowering::DAGCombinerInfo &DCI,
   22648                                         const X86Subtarget* Subtarget) {
   22649   SDLoc dl(N);
   22650   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   22651   SDValue V1 = SVOp->getOperand(0);
   22652   SDValue V2 = SVOp->getOperand(1);
   22653   MVT VT = SVOp->getSimpleValueType(0);
   22654   unsigned NumElems = VT.getVectorNumElements();
   22655 
   22656   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
   22657       V2.getOpcode() == ISD::CONCAT_VECTORS) {
   22658     //
   22659     //                   0,0,0,...
   22660     //                      |
   22661     //    V      UNDEF    BUILD_VECTOR    UNDEF
   22662     //     \      /           \           /
   22663     //  CONCAT_VECTOR         CONCAT_VECTOR
   22664     //         \                  /
   22665     //          \                /
   22666     //          RESULT: V + zero extended
   22667     //
   22668     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
   22669         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
   22670         V1.getOperand(1).getOpcode() != ISD::UNDEF)
   22671       return SDValue();
   22672 
   22673     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
   22674       return SDValue();
   22675 
   22676     // To match the shuffle mask, the first half of the mask should select the
   22677     // elements of the first vector in order, and the rest should be a splat of
   22678     // the first element of the second one.
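             // For v8i32, for example, that is the mask <0, 1, 2, 3, 8, 8, 8, 8>
             // (undef entries are also accepted in any position).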
   22679     for (unsigned i = 0; i != NumElems/2; ++i)
   22680       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
   22681           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
   22682         return SDValue();
   22683 
   22684     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
   22685     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
   22686       if (Ld->hasNUsesOfValue(1, 0)) {
   22687         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
   22688         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
   22689         SDValue ResNode =
   22690           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
   22691                                   Ld->getMemoryVT(),
   22692                                   Ld->getPointerInfo(),
   22693                                   Ld->getAlignment(),
   22694                                   false/*isVolatile*/, true/*ReadMem*/,
   22695                                   false/*WriteMem*/);
   22696 
   22697         // Make sure the newly-created LOAD is in the same position as Ld in
   22698         // terms of dependency. We create a TokenFactor for Ld and ResNode,
   22699         // and update uses of Ld's output chain to use the TokenFactor.
   22700         if (Ld->hasAnyUseOfValue(1)) {
   22701           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   22702                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
   22703           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
   22704           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
   22705                                  SDValue(ResNode.getNode(), 1));
   22706         }
   22707 
   22708         return DAG.getBitcast(VT, ResNode);
   22709       }
   22710     }
   22711 
   22712     // Emit a zeroed vector and insert the desired subvector on its
   22713     // first half.
   22714     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   22715     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
   22716     return DCI.CombineTo(N, InsV);
   22717   }
   22718 
   22719   //===--------------------------------------------------------------------===//
   22720   // Combine some shuffles into subvector extracts and inserts:
   22721   //
   22722 
   22723   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   22724   if (isShuffleHigh128VectorInsertLow(SVOp)) {
   22725     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
   22726     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
   22727     return DCI.CombineTo(N, InsV);
   22728   }
   22729 
   22730   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   22731   if (isShuffleLow128VectorInsertHigh(SVOp)) {
   22732     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
   22733     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
   22734     return DCI.CombineTo(N, InsV);
   22735   }
   22736 
   22737   return SDValue();
   22738 }
   22739 
   22740 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
   22741 /// possible.
   22742 ///
   22743 /// This is the leaf of the recursive combine below. When we have found some
   22744 /// chain of single-use x86 shuffle instructions and accumulated the combined
   22745 /// shuffle mask represented by them, this will try to pattern match that mask
   22746 /// into either a single instruction if there is a special purpose instruction
   22747 /// for this operation, or into a PSHUFB instruction which is a fully general
   22748 /// instruction but should only be used to replace chains over a certain depth.
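         ///
         /// For example (matching the code below), an accumulated v2f64 mask of {0, 0}
         /// is emitted as a single MOVDDUP on SSE3 targets (MOVLHPS otherwise), and the
         /// repeated-element v8i16/v16i8 masks are emitted as UNPCKL/UNPCKH.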
   22749 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
   22750                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
   22751                                    TargetLowering::DAGCombinerInfo &DCI,
   22752                                    const X86Subtarget *Subtarget) {
   22753   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
   22754 
   22755   // Find the operand that enters the chain. Note that multiple uses are OK
   22756   // here; we're not going to remove the operand we find.
   22757   SDValue Input = Op.getOperand(0);
   22758   while (Input.getOpcode() == ISD::BITCAST)
   22759     Input = Input.getOperand(0);
   22760 
   22761   MVT VT = Input.getSimpleValueType();
   22762   MVT RootVT = Root.getSimpleValueType();
   22763   SDLoc DL(Root);
   22764 
   22765   if (Mask.size() == 1) {
   22766     int Index = Mask[0];
   22767     assert((Index >= 0 || Index == SM_SentinelUndef ||
   22768             Index == SM_SentinelZero) &&
   22769            "Invalid shuffle index found!");
   22770 
   22771     // We may end up with an accumulated mask of size 1 as a result of
   22772     // widening of shuffle operands (see function canWidenShuffleElements).
   22773     // If the only shuffle index is equal to SM_SentinelZero then propagate
   22774     // a zero vector. Otherwise, the combined shuffle mask is a no-op shuffle
   22775     // mask, and therefore the entire chain of shuffles can be folded away.
   22776     if (Index == SM_SentinelZero)
   22777       DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
   22778     else
   22779       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
   22780                     /*AddTo*/ true);
   22781     return true;
   22782   }
   22783 
   22784   // Use the float domain if the operand type is a floating point type.
   22785   bool FloatDomain = VT.isFloatingPoint();
   22786 
   22787   // For floating point shuffles, we don't have free copies in the shuffle
   22788   // instructions or the ability to load as part of the instruction, so
   22789   // canonicalize their shuffles to UNPCK or MOV variants.
   22790   //
   22791   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
   22792   // vectors because it can have a load folded into it that UNPCK cannot. This
   22793   // doesn't preclude something switching to the shorter encoding post-RA.
   22794   //
   22795   // FIXME: Should teach these routines about AVX vector widths.
   22796   if (FloatDomain && VT.is128BitVector()) {
   22797     if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
   22798       bool Lo = Mask.equals({0, 0});
   22799       unsigned Shuffle;
   22800       MVT ShuffleVT;
   22801       // Check if we have SSE3, which will let us use MOVDDUP. That instruction
   22802       // is no slower than UNPCKLPD, and it can fold its input operand even from
   22803       // an unaligned memory load.
   22804       if (Lo && Subtarget->hasSSE3()) {
   22805         Shuffle = X86ISD::MOVDDUP;
   22806         ShuffleVT = MVT::v2f64;
   22807       } else {
    22808         // We have MOVLHPS and MOVHLPS throughout SSE and they have smaller
    22809         // encodings than the UNPCK variants.
   22810         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
   22811         ShuffleVT = MVT::v4f32;
   22812       }
   22813       if (Depth == 1 && Root->getOpcode() == Shuffle)
   22814         return false; // Nothing to do!
   22815       Op = DAG.getBitcast(ShuffleVT, Input);
   22816       DCI.AddToWorklist(Op.getNode());
   22817       if (Shuffle == X86ISD::MOVDDUP)
   22818         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
   22819       else
   22820         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
   22821       DCI.AddToWorklist(Op.getNode());
   22822       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
   22823                     /*AddTo*/ true);
   22824       return true;
   22825     }
   22826     if (Subtarget->hasSSE3() &&
   22827         (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
   22828       bool Lo = Mask.equals({0, 0, 2, 2});
   22829       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
   22830       MVT ShuffleVT = MVT::v4f32;
   22831       if (Depth == 1 && Root->getOpcode() == Shuffle)
   22832         return false; // Nothing to do!
   22833       Op = DAG.getBitcast(ShuffleVT, Input);
   22834       DCI.AddToWorklist(Op.getNode());
   22835       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
   22836       DCI.AddToWorklist(Op.getNode());
   22837       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
   22838                     /*AddTo*/ true);
   22839       return true;
   22840     }
   22841     if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
   22842       bool Lo = Mask.equals({0, 0, 1, 1});
   22843       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
   22844       MVT ShuffleVT = MVT::v4f32;
   22845       if (Depth == 1 && Root->getOpcode() == Shuffle)
   22846         return false; // Nothing to do!
   22847       Op = DAG.getBitcast(ShuffleVT, Input);
   22848       DCI.AddToWorklist(Op.getNode());
   22849       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
   22850       DCI.AddToWorklist(Op.getNode());
   22851       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
   22852                     /*AddTo*/ true);
   22853       return true;
   22854     }
   22855   }
   22856 
   22857   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
   22858   // variants as none of these have single-instruction variants that are
   22859   // superior to the UNPCK formulation.
   22860   if (!FloatDomain && VT.is128BitVector() &&
   22861       (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
   22862        Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
   22863        Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
   22864        Mask.equals(
   22865            {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
   22866     bool Lo = Mask[0] == 0;
   22867     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
   22868     if (Depth == 1 && Root->getOpcode() == Shuffle)
   22869       return false; // Nothing to do!
   22870     MVT ShuffleVT;
   22871     switch (Mask.size()) {
   22872     case 8:
   22873       ShuffleVT = MVT::v8i16;
   22874       break;
   22875     case 16:
   22876       ShuffleVT = MVT::v16i8;
   22877       break;
   22878     default:
   22879       llvm_unreachable("Impossible mask size!");
    22880     }
   22881     Op = DAG.getBitcast(ShuffleVT, Input);
   22882     DCI.AddToWorklist(Op.getNode());
   22883     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
   22884     DCI.AddToWorklist(Op.getNode());
   22885     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
   22886                   /*AddTo*/ true);
   22887     return true;
   22888   }
   22889 
   22890   // Don't try to re-form single instruction chains under any circumstances now
   22891   // that we've done encoding canonicalization for them.
   22892   if (Depth < 2)
   22893     return false;
   22894 
   22895   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
   22896   // can replace them with a single PSHUFB instruction profitably. Intel's
    22897   // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
   22898   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
   22899   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
   22900     SmallVector<SDValue, 16> PSHUFBMask;
   22901     int NumBytes = VT.getSizeInBits() / 8;
   22902     int Ratio = NumBytes / Mask.size();
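              // Expand each mask element to Ratio consecutive byte indices. For
              // example, a 128-bit vector with an 8-element word mask has NumBytes = 16
              // and Ratio = 2, so word index m becomes the byte pair {2*m, 2*m+1};
              // zeroable lanes use 255, whose set high bit tells PSHUFB to write zero.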
   22903     for (int i = 0; i < NumBytes; ++i) {
   22904       if (Mask[i / Ratio] == SM_SentinelUndef) {
   22905         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
   22906         continue;
   22907       }
   22908       int M = Mask[i / Ratio] != SM_SentinelZero
   22909                   ? Ratio * Mask[i / Ratio] + i % Ratio
   22910                   : 255;
   22911       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
   22912     }
   22913     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
   22914     Op = DAG.getBitcast(ByteVT, Input);
   22915     DCI.AddToWorklist(Op.getNode());
   22916     SDValue PSHUFBMaskOp =
   22917         DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
   22918     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
   22919     Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
   22920     DCI.AddToWorklist(Op.getNode());
   22921     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
   22922                   /*AddTo*/ true);
   22923     return true;
   22924   }
   22925 
   22926   // Failed to find any combines.
   22927   return false;
   22928 }
   22929 
   22930 /// \brief Fully generic combining of x86 shuffle instructions.
   22931 ///
   22932 /// This should be the last combine run over the x86 shuffle instructions. Once
   22933 /// they have been fully optimized, this will recursively consider all chains
   22934 /// of single-use shuffle instructions, build a generic model of the cumulative
   22935 /// shuffle operation, and check for simpler instructions which implement this
   22936 /// operation. We use this primarily for two purposes:
   22937 ///
   22938 /// 1) Collapse generic shuffles to specialized single instructions when
   22939 ///    equivalent. In most cases, this is just an encoding size win, but
   22940 ///    sometimes we will collapse multiple generic shuffles into a single
   22941 ///    special-purpose shuffle.
   22942 /// 2) Look for sequences of shuffle instructions with 3 or more total
   22943 ///    instructions, and replace them with the slightly more expensive SSSE3
   22944 ///    PSHUFB instruction if available. We do this as the last combining step
   22945 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
    22946 ///    a suitable short sequence of other instructions. The PSHUFB will either
   22947 ///    use a register or have to read from memory and so is slightly (but only
   22948 ///    slightly) more expensive than the other shuffle instructions.
   22949 ///
   22950 /// Because this is inherently a quadratic operation (for each shuffle in
   22951 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
   22952 /// This should never be an issue in practice as the shuffle lowering doesn't
   22953 /// produce sequences of more than 8 instructions.
   22954 ///
   22955 /// FIXME: We will currently miss some cases where the redundant shuffling
   22956 /// would simplify under the threshold for PSHUFB formation because of
   22957 /// combine-ordering. To fix this, we should do the redundant instruction
   22958 /// combining in this recursive walk.
   22959 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
   22960                                           ArrayRef<int> RootMask,
   22961                                           int Depth, bool HasPSHUFB,
   22962                                           SelectionDAG &DAG,
   22963                                           TargetLowering::DAGCombinerInfo &DCI,
   22964                                           const X86Subtarget *Subtarget) {
   22965   // Bound the depth of our recursive combine because this is ultimately
   22966   // quadratic in nature.
   22967   if (Depth > 8)
   22968     return false;
   22969 
   22970   // Directly rip through bitcasts to find the underlying operand.
   22971   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
   22972     Op = Op.getOperand(0);
   22973 
   22974   MVT VT = Op.getSimpleValueType();
   22975   if (!VT.isVector())
   22976     return false; // Bail if we hit a non-vector.
   22977 
   22978   assert(Root.getSimpleValueType().isVector() &&
   22979          "Shuffles operate on vector types!");
   22980   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
   22981          "Can only combine shuffles of the same vector register size.");
   22982 
   22983   if (!isTargetShuffle(Op.getOpcode()))
   22984     return false;
   22985   SmallVector<int, 16> OpMask;
   22986   bool IsUnary;
   22987   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
   22988   // We only can combine unary shuffles which we can decode the mask for.
   22989   if (!HaveMask || !IsUnary)
   22990     return false;
   22991 
   22992   assert(VT.getVectorNumElements() == OpMask.size() &&
   22993          "Different mask size from vector size!");
   22994   assert(((RootMask.size() > OpMask.size() &&
   22995            RootMask.size() % OpMask.size() == 0) ||
   22996           (OpMask.size() > RootMask.size() &&
   22997            OpMask.size() % RootMask.size() == 0) ||
   22998           OpMask.size() == RootMask.size()) &&
   22999          "The smaller number of elements must divide the larger.");
   23000   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
   23001   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
   23002   assert(((RootRatio == 1 && OpRatio == 1) ||
   23003           (RootRatio == 1) != (OpRatio == 1)) &&
   23004          "Must not have a ratio for both incoming and op masks!");
   23005 
   23006   SmallVector<int, 16> Mask;
   23007   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
   23008 
   23009   // Merge this shuffle operation's mask into our accumulated mask. Note that
   23010   // this shuffle's mask will be the first applied to the input, followed by the
   23011   // root mask to get us all the way to the root value arrangement. The reason
   23012   // for this order is that we are recursing up the operation chain.
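            // For example, composing an operand mask of {2, 3, 0, 1} with a root mask
            // of {1, 0} (so RootRatio = 2, OpRatio = 1) yields {0, 1, 2, 3}: the two
            // half-swaps cancel and the accumulated shuffle is a no-op.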
   23013   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
   23014     int RootIdx = i / RootRatio;
   23015     if (RootMask[RootIdx] < 0) {
   23016       // This is a zero or undef lane, we're done.
   23017       Mask.push_back(RootMask[RootIdx]);
   23018       continue;
   23019     }
   23020 
   23021     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
   23022     int OpIdx = RootMaskedIdx / OpRatio;
   23023     if (OpMask[OpIdx] < 0) {
    23024       // The incoming lanes are zero or undef; it doesn't matter which ones we
   23025       // are using.
   23026       Mask.push_back(OpMask[OpIdx]);
   23027       continue;
   23028     }
   23029 
   23030     // Ok, we have non-zero lanes, map them through.
   23031     Mask.push_back(OpMask[OpIdx] * OpRatio +
   23032                    RootMaskedIdx % OpRatio);
   23033   }
   23034 
   23035   // See if we can recurse into the operand to combine more things.
   23036   switch (Op.getOpcode()) {
   23037   case X86ISD::PSHUFB:
    23038     HasPSHUFB = true; // Fallthrough!
   23039   case X86ISD::PSHUFD:
   23040   case X86ISD::PSHUFHW:
   23041   case X86ISD::PSHUFLW:
   23042     if (Op.getOperand(0).hasOneUse() &&
   23043         combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
   23044                                       HasPSHUFB, DAG, DCI, Subtarget))
   23045       return true;
   23046     break;
   23047 
   23048   case X86ISD::UNPCKL:
   23049   case X86ISD::UNPCKH:
   23050     assert(Op.getOperand(0) == Op.getOperand(1) &&
   23051            "We only combine unary shuffles!");
   23052     // We can't check for single use, we have to check that this shuffle is the
   23053     // only user.
   23054     if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
   23055         combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
   23056                                       HasPSHUFB, DAG, DCI, Subtarget))
   23057       return true;
   23058     break;
   23059   }
   23060 
   23061   // Minor canonicalization of the accumulated shuffle mask to make it easier
    23062   // to match below. All this does is detect masks with sequential pairs of
   23063   // elements, and shrink them to the half-width mask. It does this in a loop
   23064   // so it will reduce the size of the mask to the minimal width mask which
   23065   // performs an equivalent shuffle.
   23066   SmallVector<int, 16> WidenedMask;
   23067   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
   23068     Mask = std::move(WidenedMask);
   23069     WidenedMask.clear();
   23070   }
   23071 
   23072   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
   23073                                 Subtarget);
   23074 }
   23075 
   23076 /// \brief Get the PSHUF-style mask from PSHUF node.
   23077 ///
    23078 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
   23079 /// PSHUF-style masks that can be reused with such instructions.
   23080 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
   23081   MVT VT = N.getSimpleValueType();
   23082   SmallVector<int, 4> Mask;
   23083   bool IsUnary;
   23084   bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
   23085   (void)HaveMask;
   23086   assert(HaveMask);
   23087 
    23088   // If we have more than 128 bits, only the low 128 bits of the shuffle mask
   23089   // matter. Check that the upper masks are repeats and remove them.
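            // E.g. a v8i32 PSHUFD mask of {1, 0, 3, 2, 5, 4, 7, 6} repeats per 128-bit
            // lane and is reduced to {1, 0, 3, 2}.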
   23090   if (VT.getSizeInBits() > 128) {
   23091     int LaneElts = 128 / VT.getScalarSizeInBits();
   23092 #ifndef NDEBUG
   23093     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
   23094       for (int j = 0; j < LaneElts; ++j)
   23095         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
   23096                "Mask doesn't repeat in high 128-bit lanes!");
   23097 #endif
   23098     Mask.resize(LaneElts);
   23099   }
   23100 
   23101   switch (N.getOpcode()) {
   23102   case X86ISD::PSHUFD:
   23103     return Mask;
   23104   case X86ISD::PSHUFLW:
   23105     Mask.resize(4);
   23106     return Mask;
   23107   case X86ISD::PSHUFHW:
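              // Drop the identity low half and rebase the high-half indices to 0..3,
              // e.g. {0, 1, 2, 3, 7, 6, 5, 4} becomes {3, 2, 1, 0}.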
   23108     Mask.erase(Mask.begin(), Mask.begin() + 4);
   23109     for (int &M : Mask)
   23110       M -= 4;
   23111     return Mask;
   23112   default:
   23113     llvm_unreachable("No valid shuffle instruction found!");
   23114   }
   23115 }
   23116 
   23117 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
   23118 ///
   23119 /// We walk up the chain and look for a combinable shuffle, skipping over
   23120 /// shuffles that we could hoist this shuffle's transformation past without
   23121 /// altering anything.
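          ///
          /// For example, a pshufd whose (possibly bitcasted) source is another pshufd
          /// folds into a single pshufd whose mask is the composition of the two; any
          /// word shuffles or unpacks skipped along the way are rebuilt on top of the
          /// merged node.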
   23122 static SDValue
   23123 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
   23124                              SelectionDAG &DAG,
   23125                              TargetLowering::DAGCombinerInfo &DCI) {
   23126   assert(N.getOpcode() == X86ISD::PSHUFD &&
   23127          "Called with something other than an x86 128-bit half shuffle!");
   23128   SDLoc DL(N);
   23129 
   23130   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
   23131   // of the shuffles in the chain so that we can form a fresh chain to replace
   23132   // this one.
   23133   SmallVector<SDValue, 8> Chain;
   23134   SDValue V = N.getOperand(0);
   23135   for (; V.hasOneUse(); V = V.getOperand(0)) {
   23136     switch (V.getOpcode()) {
   23137     default:
   23138       return SDValue(); // Nothing combined!
   23139 
   23140     case ISD::BITCAST:
   23141       // Skip bitcasts as we always know the type for the target specific
   23142       // instructions.
   23143       continue;
   23144 
   23145     case X86ISD::PSHUFD:
   23146       // Found another dword shuffle.
   23147       break;
   23148 
   23149     case X86ISD::PSHUFLW:
   23150       // Check that the low words (being shuffled) are the identity in the
   23151       // dword shuffle, and the high words are self-contained.
   23152       if (Mask[0] != 0 || Mask[1] != 1 ||
   23153           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
   23154         return SDValue();
   23155 
   23156       Chain.push_back(V);
   23157       continue;
   23158 
   23159     case X86ISD::PSHUFHW:
   23160       // Check that the high words (being shuffled) are the identity in the
   23161       // dword shuffle, and the low words are self-contained.
   23162       if (Mask[2] != 2 || Mask[3] != 3 ||
   23163           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
   23164         return SDValue();
   23165 
   23166       Chain.push_back(V);
   23167       continue;
   23168 
   23169     case X86ISD::UNPCKL:
   23170     case X86ISD::UNPCKH:
   23171       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
   23172       // shuffle into a preceding word shuffle.
   23173       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
   23174           V.getSimpleValueType().getVectorElementType() != MVT::i16)
   23175         return SDValue();
   23176 
   23177       // Search for a half-shuffle which we can combine with.
   23178       unsigned CombineOp =
   23179           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
   23180       if (V.getOperand(0) != V.getOperand(1) ||
   23181           !V->isOnlyUserOf(V.getOperand(0).getNode()))
   23182         return SDValue();
   23183       Chain.push_back(V);
   23184       V = V.getOperand(0);
   23185       do {
   23186         switch (V.getOpcode()) {
   23187         default:
   23188           return SDValue(); // Nothing to combine.
   23189 
   23190         case X86ISD::PSHUFLW:
   23191         case X86ISD::PSHUFHW:
   23192           if (V.getOpcode() == CombineOp)
   23193             break;
   23194 
   23195           Chain.push_back(V);
   23196 
   23197           // Fallthrough!
   23198         case ISD::BITCAST:
   23199           V = V.getOperand(0);
   23200           continue;
   23201         }
   23202         break;
   23203       } while (V.hasOneUse());
   23204       break;
   23205     }
   23206     // Break out of the loop if we break out of the switch.
   23207     break;
   23208   }
   23209 
   23210   if (!V.hasOneUse())
   23211     // We fell out of the loop without finding a viable combining instruction.
   23212     return SDValue();
   23213 
   23214   // Merge this node's mask and our incoming mask.
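            // E.g. if V's mask is {2, 3, 0, 1} and our mask is {1, 0, 3, 2}, the merged
            // mask is {3, 2, 1, 0}.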
   23215   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   23216   for (int &M : Mask)
   23217     M = VMask[M];
   23218   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
   23219                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   23220 
   23221   // Rebuild the chain around this new shuffle.
   23222   while (!Chain.empty()) {
   23223     SDValue W = Chain.pop_back_val();
   23224 
   23225     if (V.getValueType() != W.getOperand(0).getValueType())
   23226       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
   23227 
   23228     switch (W.getOpcode()) {
   23229     default:
   23230       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
   23231 
   23232     case X86ISD::UNPCKL:
   23233     case X86ISD::UNPCKH:
   23234       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
   23235       break;
   23236 
   23237     case X86ISD::PSHUFD:
   23238     case X86ISD::PSHUFLW:
   23239     case X86ISD::PSHUFHW:
   23240       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
   23241       break;
   23242     }
   23243   }
   23244   if (V.getValueType() != N.getValueType())
   23245     V = DAG.getBitcast(N.getValueType(), V);
   23246 
   23247   // Return the new chain to replace N.
   23248   return V;
   23249 }
   23250 
   23251 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
   23252 /// pshufhw.
   23253 ///
   23254 /// We walk up the chain, skipping shuffles of the other half and looking
   23255 /// through shuffles which switch halves trying to find a shuffle of the same
   23256 /// pair of dwords.
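          ///
          /// For example, a pshuflw whose source (looking through bitcasts and any
          /// pshufhw shuffles, which never touch the low half) is another pshuflw can be
          /// removed by folding its word mask into that earlier pshuflw; the same holds
          /// with the roles of pshuflw and pshufhw swapped.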
   23257 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
   23258                                         SelectionDAG &DAG,
   23259                                         TargetLowering::DAGCombinerInfo &DCI) {
   23260   assert(
   23261       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
   23262       "Called with something other than an x86 128-bit half shuffle!");
   23263   SDLoc DL(N);
   23264   unsigned CombineOpcode = N.getOpcode();
   23265 
   23266   // Walk up a single-use chain looking for a combinable shuffle.
   23267   SDValue V = N.getOperand(0);
   23268   for (; V.hasOneUse(); V = V.getOperand(0)) {
   23269     switch (V.getOpcode()) {
   23270     default:
   23271       return false; // Nothing combined!
   23272 
   23273     case ISD::BITCAST:
   23274       // Skip bitcasts as we always know the type for the target specific
   23275       // instructions.
   23276       continue;
   23277 
   23278     case X86ISD::PSHUFLW:
   23279     case X86ISD::PSHUFHW:
   23280       if (V.getOpcode() == CombineOpcode)
   23281         break;
   23282 
   23283       // Other-half shuffles are no-ops.
   23284       continue;
   23285     }
   23286     // Break out of the loop if we break out of the switch.
   23287     break;
   23288   }
   23289 
   23290   if (!V.hasOneUse())
   23291     // We fell out of the loop without finding a viable combining instruction.
   23292     return false;
   23293 
   23294   // Combine away the bottom node as its shuffle will be accumulated into
   23295   // a preceding shuffle.
   23296   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
   23297 
   23298   // Record the old value.
   23299   SDValue Old = V;
   23300 
   23301   // Merge this node's mask and our incoming mask (adjusted to account for all
   23302   // the pshufd instructions encountered).
   23303   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   23304   for (int &M : Mask)
   23305     M = VMask[M];
   23306   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
   23307                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   23308 
   23309   // Check that the shuffles didn't cancel each other out. If not, we need to
   23310   // combine to the new one.
   23311   if (Old != V)
   23312     // Replace the combinable shuffle with the combined one, updating all users
   23313     // so that we re-evaluate the chain here.
   23314     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
   23315 
   23316   return true;
   23317 }
   23318 
   23319 /// \brief Try to combine x86 target specific shuffles.
   23320 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
   23321                                            TargetLowering::DAGCombinerInfo &DCI,
   23322                                            const X86Subtarget *Subtarget) {
   23323   SDLoc DL(N);
   23324   MVT VT = N.getSimpleValueType();
   23325   SmallVector<int, 4> Mask;
   23326 
   23327   switch (N.getOpcode()) {
   23328   case X86ISD::PSHUFD:
   23329   case X86ISD::PSHUFLW:
   23330   case X86ISD::PSHUFHW:
   23331     Mask = getPSHUFShuffleMask(N);
   23332     assert(Mask.size() == 4);
   23333     break;
   23334   case X86ISD::UNPCKL: {
   23335     // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
   23336     // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
   23337     // moves upper half elements into the lower half part. For example:
   23338     //
   23339     // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
   23340     //     undef:v16i8
   23341     // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
   23342     //
   23343     // will be combined to:
   23344     //
   23345     // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
   23346 
    23347     // This is only done for 128-bit vectors. From SSE4.1 onward this combine
    23348     // may not happen because more advanced instructions are used instead.
   23349     if (!VT.is128BitVector())
   23350       return SDValue();
   23351 
   23352     auto Op0 = N.getOperand(0);
   23353     auto Op1 = N.getOperand(1);
   23354     if (Op0.getOpcode() == ISD::UNDEF &&
   23355         Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
   23356       ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
   23357 
   23358       unsigned NumElts = VT.getVectorNumElements();
   23359       SmallVector<int, 8> ExpectedMask(NumElts, -1);
   23360       std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
   23361                 NumElts / 2);
   23362 
   23363       auto ShufOp = Op1.getOperand(0);
   23364       if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
   23365         return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
   23366     }
   23367     return SDValue();
   23368   }
   23369   default:
   23370     return SDValue();
   23371   }
   23372 
   23373   // Nuke no-op shuffles that show up after combining.
   23374   if (isNoopShuffleMask(Mask))
   23375     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
   23376 
   23377   // Look for simplifications involving one or two shuffle instructions.
   23378   SDValue V = N.getOperand(0);
   23379   switch (N.getOpcode()) {
   23380   default:
   23381     break;
   23382   case X86ISD::PSHUFLW:
   23383   case X86ISD::PSHUFHW:
   23384     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
   23385 
   23386     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
   23387       return SDValue(); // We combined away this shuffle, so we're done.
   23388 
   23389     // See if this reduces to a PSHUFD which is no more expensive and can
   23390     // combine with more operations. Note that it has to at least flip the
   23391     // dwords as otherwise it would have been removed as a no-op.
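              // E.g. a PSHUFLW with word mask {2, 3, 0, 1} only swaps the two low
              // dwords, so it is the PSHUFD with dword mask {1, 0, 2, 3} built below.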
   23392     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
   23393       int DMask[] = {0, 1, 2, 3};
   23394       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
   23395       DMask[DOffset + 0] = DOffset + 1;
   23396       DMask[DOffset + 1] = DOffset + 0;
   23397       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
   23398       V = DAG.getBitcast(DVT, V);
   23399       DCI.AddToWorklist(V.getNode());
   23400       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
   23401                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
   23402       DCI.AddToWorklist(V.getNode());
   23403       return DAG.getBitcast(VT, V);
   23404     }
   23405 
   23406     // Look for shuffle patterns which can be implemented as a single unpack.
   23407     // FIXME: This doesn't handle the location of the PSHUFD generically, and
   23408     // only works when we have a PSHUFD followed by two half-shuffles.
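              // E.g. if mapping the two half-shuffle masks through the PSHUFD's dword
              // mask yields the word mask {0, 0, 1, 1, 2, 2, 3, 3}, the whole triple is
              // a single UNPCKL (punpcklwd) of the PSHUFD's input with itself.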
   23409     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
   23410         (V.getOpcode() == X86ISD::PSHUFLW ||
   23411          V.getOpcode() == X86ISD::PSHUFHW) &&
   23412         V.getOpcode() != N.getOpcode() &&
   23413         V.hasOneUse()) {
   23414       SDValue D = V.getOperand(0);
   23415       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
   23416         D = D.getOperand(0);
   23417       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
   23418         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   23419         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
   23420         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   23421         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   23422         int WordMask[8];
   23423         for (int i = 0; i < 4; ++i) {
   23424           WordMask[i + NOffset] = Mask[i] + NOffset;
   23425           WordMask[i + VOffset] = VMask[i] + VOffset;
   23426         }
   23427         // Map the word mask through the DWord mask.
   23428         int MappedMask[8];
   23429         for (int i = 0; i < 8; ++i)
   23430           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
   23431         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
   23432             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
   23433           // We can replace all three shuffles with an unpack.
   23434           V = DAG.getBitcast(VT, D.getOperand(0));
   23435           DCI.AddToWorklist(V.getNode());
   23436           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
   23437                                                 : X86ISD::UNPCKH,
   23438                              DL, VT, V, V);
   23439         }
   23440       }
   23441     }
   23442 
   23443     break;
   23444 
   23445   case X86ISD::PSHUFD:
   23446     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
   23447       return NewN;
   23448 
   23449     break;
   23450   }
   23451 
   23452   return SDValue();
   23453 }
   23454 
   23455 /// \brief Try to combine a shuffle into a target-specific add-sub node.
   23456 ///
   23457 /// We combine this directly on the abstract vector shuffle nodes so it is
   23458 /// easier to generically match. We also insert dummy vector shuffle nodes for
   23459 /// the operands which explicitly discard the lanes which are unused by this
    23460 /// operation, so that the fact that those lanes are unused flows through the
    23461 /// rest of the combiner.
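          ///
          /// For example, (vector_shuffle<0,5,2,7> (fsub A, B), (fadd A, B)) takes the
          /// even lanes from the subtract and the odd lanes from the add, which is
          /// exactly what a v4f32 X86ISD::ADDSUB of A and B computes.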
   23462 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
   23463   SDLoc DL(N);
   23464   EVT VT = N->getValueType(0);
   23465 
   23466   // We only handle target-independent shuffles.
   23467   // FIXME: It would be easy and harmless to use the target shuffle mask
   23468   // extraction tool to support more.
   23469   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
   23470     return SDValue();
   23471 
   23472   auto *SVN = cast<ShuffleVectorSDNode>(N);
   23473   SmallVector<int, 8> Mask;
   23474   for (int M : SVN->getMask())
   23475     Mask.push_back(M);
   23476 
   23477   SDValue V1 = N->getOperand(0);
   23478   SDValue V2 = N->getOperand(1);
   23479 
   23480   // We require the first shuffle operand to be the FSUB node, and the second to
   23481   // be the FADD node.
   23482   if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
   23483     ShuffleVectorSDNode::commuteMask(Mask);
   23484     std::swap(V1, V2);
   23485   } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
   23486     return SDValue();
   23487 
   23488   // If there are other uses of these operations we can't fold them.
   23489   if (!V1->hasOneUse() || !V2->hasOneUse())
   23490     return SDValue();
   23491 
   23492   // Ensure that both operations have the same operands. Note that we can
   23493   // commute the FADD operands.
   23494   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
   23495   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
   23496       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
   23497     return SDValue();
   23498 
   23499   // We're looking for blends between FADD and FSUB nodes. We insist on these
   23500   // nodes being lined up in a specific expected pattern.
   23501   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
   23502         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
   23503         isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
   23504     return SDValue();
   23505 
   23506   // Only specific types are legal at this point, assert so we notice if and
   23507   // when these change.
   23508   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
   23509           VT == MVT::v4f64) &&
   23510          "Unknown vector type encountered!");
   23511 
   23512   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
   23513 }
   23514 
   23515 /// PerformShuffleCombine - Performs several different shuffle combines.
   23516 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   23517                                      TargetLowering::DAGCombinerInfo &DCI,
   23518                                      const X86Subtarget *Subtarget) {
   23519   SDLoc dl(N);
   23520   SDValue N0 = N->getOperand(0);
   23521   SDValue N1 = N->getOperand(1);
   23522   EVT VT = N->getValueType(0);
   23523 
   23524   // Don't create instructions with illegal types after legalize types has run.
   23525   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   23526   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
   23527     return SDValue();
   23528 
   23529   // If we have legalized the vector types, look for blends of FADD and FSUB
   23530   // nodes that we can fuse into an ADDSUB node.
   23531   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
   23532     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
   23533       return AddSub;
   23534 
   23535   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
   23536   if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
   23537       N->getOpcode() == ISD::VECTOR_SHUFFLE)
   23538     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
   23539 
   23540   // During Type Legalization, when promoting illegal vector types,
   23541   // the backend might introduce new shuffle dag nodes and bitcasts.
   23542   //
   23543   // This code performs the following transformation:
   23544   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
   23545   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
   23546   //
   23547   // We do this only if both the bitcast and the BINOP dag nodes have
   23548   // one use. Also, perform this transformation only if the new binary
   23549   // operation is legal. This is to avoid introducing dag nodes that
   23550   // potentially need to be further expanded (or custom lowered) into a
   23551   // less optimal sequence of dag nodes.
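            // The fold only fires when the shuffle takes every other lane of the bitcast
            // result for the first half of the output (mask element i equals 2 * i) and
            // leaves the remaining lanes undef, as checked below.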
   23552   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
   23553       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
   23554       N0.getOpcode() == ISD::BITCAST) {
   23555     SDValue BC0 = N0.getOperand(0);
   23556     EVT SVT = BC0.getValueType();
   23557     unsigned Opcode = BC0.getOpcode();
   23558     unsigned NumElts = VT.getVectorNumElements();
   23559 
   23560     if (BC0.hasOneUse() && SVT.isVector() &&
   23561         SVT.getVectorNumElements() * 2 == NumElts &&
   23562         TLI.isOperationLegal(Opcode, VT)) {
   23563       bool CanFold = false;
   23564       switch (Opcode) {
   23565       default : break;
   23566       case ISD::ADD :
   23567       case ISD::FADD :
   23568       case ISD::SUB :
   23569       case ISD::FSUB :
   23570       case ISD::MUL :
   23571       case ISD::FMUL :
   23572         CanFold = true;
   23573       }
   23574 
   23575       unsigned SVTNumElts = SVT.getVectorNumElements();
   23576       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   23577       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
   23578         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
   23579       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
   23580         CanFold = SVOp->getMaskElt(i) < 0;
   23581 
   23582       if (CanFold) {
   23583         SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
   23584         SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
   23585         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
   23586         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
   23587       }
   23588     }
   23589   }
   23590 
   23591   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
   23592   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
   23593   // consecutive, non-overlapping, and in the right order.
   23594   SmallVector<SDValue, 16> Elts;
   23595   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
   23596     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
   23597 
   23598   if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
   23599     return LD;
   23600 
   23601   if (isTargetShuffle(N->getOpcode())) {
   23602     SDValue Shuffle =
   23603         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
   23604     if (Shuffle.getNode())
   23605       return Shuffle;
   23606 
   23607     // Try recursively combining arbitrary sequences of x86 shuffle
   23608     // instructions into higher-order shuffles. We do this after combining
   23609     // specific PSHUF instruction sequences into their minimal form so that we
   23610     // can evaluate how many specialized shuffle instructions are involved in
   23611     // a particular chain.
   23612     SmallVector<int, 1> NonceMask; // Just a placeholder.
   23613     NonceMask.push_back(0);
   23614     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
   23615                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
   23616                                       DCI, Subtarget))
   23617       return SDValue(); // This routine will use CombineTo to replace N.
   23618   }
   23619 
   23620   return SDValue();
   23621 }
   23622 
   23623 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
   23624 /// specific shuffle of a load can be folded into a single element load.
   23625 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
   23626 /// shuffles have been custom lowered so we need to handle those here.
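          ///
          /// Rather than folding the load directly, this rewrites the target shuffle
          /// back into a generic VECTOR_SHUFFLE so that the existing DAGCombiner fold
          /// can then narrow the extract to a scalar element load.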
   23627 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   23628                                          TargetLowering::DAGCombinerInfo &DCI) {
   23629   if (DCI.isBeforeLegalizeOps())
   23630     return SDValue();
   23631 
   23632   SDValue InVec = N->getOperand(0);
   23633   SDValue EltNo = N->getOperand(1);
   23634 
   23635   if (!isa<ConstantSDNode>(EltNo))
   23636     return SDValue();
   23637 
   23638   EVT OriginalVT = InVec.getValueType();
   23639 
   23640   if (InVec.getOpcode() == ISD::BITCAST) {
   23641     // Don't duplicate a load with other uses.
   23642     if (!InVec.hasOneUse())
   23643       return SDValue();
   23644     EVT BCVT = InVec.getOperand(0).getValueType();
   23645     if (!BCVT.isVector() ||
   23646         BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
   23647       return SDValue();
   23648     InVec = InVec.getOperand(0);
   23649   }
   23650 
   23651   EVT CurrentVT = InVec.getValueType();
   23652 
   23653   if (!isTargetShuffle(InVec.getOpcode()))
   23654     return SDValue();
   23655 
   23656   // Don't duplicate a load with other uses.
   23657   if (!InVec.hasOneUse())
   23658     return SDValue();
   23659 
   23660   SmallVector<int, 16> ShuffleMask;
   23661   bool UnaryShuffle;
   23662   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
   23663                             ShuffleMask, UnaryShuffle))
   23664     return SDValue();
   23665 
   23666   // Select the input vector, guarding against out of range extract vector.
   23667   unsigned NumElems = CurrentVT.getVectorNumElements();
   23668   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   23669   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
   23670   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
   23671                                          : InVec.getOperand(1);
   23672 
   23673   // If inputs to shuffle are the same for both ops, then allow 2 uses
   23674   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
   23675                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
   23676 
   23677   if (LdNode.getOpcode() == ISD::BITCAST) {
   23678     // Don't duplicate a load with other uses.
   23679     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
   23680       return SDValue();
   23681 
   23682     AllowedUses = 1; // only allow 1 load use if we have a bitcast
   23683     LdNode = LdNode.getOperand(0);
   23684   }
   23685 
   23686   if (!ISD::isNormalLoad(LdNode.getNode()))
   23687     return SDValue();
   23688 
   23689   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
   23690 
    23691   if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
   23692     return SDValue();
   23693 
   23694   EVT EltVT = N->getValueType(0);
   23695   // If there's a bitcast before the shuffle, check if the load type and
   23696   // alignment is valid.
   23697   unsigned Align = LN0->getAlignment();
   23698   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   23699   unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
   23700       EltVT.getTypeForEVT(*DAG.getContext()));
   23701 
   23702   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
   23703     return SDValue();
   23704 
   23705   // All checks match so transform back to vector_shuffle so that DAG combiner
   23706   // can finish the job
   23707   SDLoc dl(N);
   23708 
    23709   // Create shuffle node taking into account the case that it's a unary shuffle
   23710   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
   23711                                    : InVec.getOperand(1);
   23712   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
   23713                                  InVec.getOperand(0), Shuffle,
   23714                                  &ShuffleMask[0]);
   23715   Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
   23716   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
   23717                      EltNo);
   23718 }
   23719 
   23720 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
   23721                                      const X86Subtarget *Subtarget) {
   23722   SDValue N0 = N->getOperand(0);
   23723   EVT VT = N->getValueType(0);
   23724 
    23725   // Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
   23726   // special and don't usually play with other vector types, it's better to
   23727   // handle them early to be sure we emit efficient code by avoiding
   23728   // store-load conversions.
   23729   if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
   23730       N0.getValueType() == MVT::v2i32 &&
   23731       isNullConstant(N0.getOperand(1))) {
   23732     SDValue N00 = N0->getOperand(0);
   23733     if (N00.getValueType() == MVT::i32)
   23734       return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
   23735   }
   23736 
   23737   // Convert a bitcasted integer logic operation that has one bitcasted
   23738   // floating-point operand and one constant operand into a floating-point
   23739   // logic operation. This may create a load of the constant, but that is
   23740   // cheaper than materializing the constant in an integer register and
   23741   // transferring it to an SSE register or transferring the SSE operand to
    23742   // an integer register and back.
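            // For example, (f32 (bitcast (and (i32 (bitcast X)), C))) becomes
            // (X86ISD::FAND X, (f32 (bitcast C))).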
   23743   unsigned FPOpcode;
   23744   switch (N0.getOpcode()) {
   23745     case ISD::AND: FPOpcode = X86ISD::FAND; break;
   23746     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
   23747     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
   23748     default: return SDValue();
   23749   }
   23750   if (((Subtarget->hasSSE1() && VT == MVT::f32) ||
   23751        (Subtarget->hasSSE2() && VT == MVT::f64)) &&
   23752       isa<ConstantSDNode>(N0.getOperand(1)) &&
   23753       N0.getOperand(0).getOpcode() == ISD::BITCAST &&
   23754       N0.getOperand(0).getOperand(0).getValueType() == VT) {
   23755     SDValue N000 = N0.getOperand(0).getOperand(0);
   23756     SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
   23757     return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
   23758   }
   23759 
   23760   return SDValue();
   23761 }
   23762 
   23763 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
   23764 /// generation and convert it from being a bunch of shuffles and extracts
   23765 /// into a somewhat faster sequence. For i686, the best sequence is apparently
   23766 /// storing the value and loading scalars back, while for x64 we should
   23767 /// use 64-bit extracts and shifts.
   23768 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   23769                                          TargetLowering::DAGCombinerInfo &DCI) {
   23770   if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
   23771     return NewOp;
   23772 
   23773   SDValue InputVector = N->getOperand(0);
   23774   SDLoc dl(InputVector);
   23775   // Detect mmx to i32 conversion through a v2i32 elt extract.
   23776   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
   23777       N->getValueType(0) == MVT::i32 &&
   23778       InputVector.getValueType() == MVT::v2i32) {
   23779 
   23780     // The bitcast source is a direct mmx result.
   23781     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
   23782     if (MMXSrc.getValueType() == MVT::x86mmx)
   23783       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
   23784                          N->getValueType(0),
   23785                          InputVector.getNode()->getOperand(0));
   23786 
   23787     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
   23788     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
   23789         MMXSrc.getValueType() == MVT::i64) {
   23790       SDValue MMXSrcOp = MMXSrc.getOperand(0);
   23791       if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
   23792           MMXSrcOp.getValueType() == MVT::v1i64 &&
   23793           MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
   23794         return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
   23795                            N->getValueType(0), MMXSrcOp.getOperand(0));
   23796     }
   23797   }
   23798 
   23799   EVT VT = N->getValueType(0);
   23800 
   23801   if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
   23802       InputVector.getOpcode() == ISD::BITCAST &&
   23803       isa<ConstantSDNode>(InputVector.getOperand(0))) {
   23804     uint64_t ExtractedElt =
   23805         cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   23806     uint64_t InputValue =
   23807         cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
   23808     uint64_t Res = (InputValue >> ExtractedElt) & 1;
   23809     return DAG.getConstant(Res, dl, MVT::i1);
   23810   }
   23811   // Only operate on vectors of 4 elements, where the alternative shuffling
   23812   // gets to be more expensive.
   23813   if (InputVector.getValueType() != MVT::v4i32)
   23814     return SDValue();
   23815 
   23816   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
   23817   // single use which is a sign-extend or zero-extend, and all elements are
   23818   // used.
   23819   SmallVector<SDNode *, 4> Uses;
   23820   unsigned ExtractedElements = 0;
   23821   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
   23822        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
   23823     if (UI.getUse().getResNo() != InputVector.getResNo())
   23824       return SDValue();
   23825 
   23826     SDNode *Extract = *UI;
   23827     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   23828       return SDValue();
   23829 
   23830     if (Extract->getValueType(0) != MVT::i32)
   23831       return SDValue();
   23832     if (!Extract->hasOneUse())
   23833       return SDValue();
   23834     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
   23835         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
   23836       return SDValue();
   23837     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
   23838       return SDValue();
   23839 
   23840     // Record which element was extracted.
   23841     ExtractedElements |=
   23842       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
   23843 
   23844     Uses.push_back(Extract);
   23845   }
   23846 
   23847   // If not all the elements were used, this may not be worthwhile.
   23848   if (ExtractedElements != 15)
   23849     return SDValue();
   23850 
   23851   // Ok, we've now decided to do the transformation.
   23852   // If 64-bit shifts are legal, use the extract-shift sequence,
   23853   // otherwise bounce the vector off the cache.
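            // With legal 64-bit shifts, the v4i32 value is bitcast to v2i64 and split
            // into four i32 values: value 2*k is the truncation of 64-bit half k, and
            // value 2*k+1 is half k shifted right by 32 and truncated.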
   23854   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   23855   SDValue Vals[4];
   23856 
   23857   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
   23858     SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
   23859     auto &DL = DAG.getDataLayout();
   23860     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
   23861     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
   23862       DAG.getConstant(0, dl, VecIdxTy));
   23863     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
   23864       DAG.getConstant(1, dl, VecIdxTy));
   23865 
   23866     SDValue ShAmt = DAG.getConstant(
   23867         32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
   23868     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
   23869     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   23870       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
   23871     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
   23872     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   23873       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
   23874   } else {
   23875     // Store the value to a temporary stack slot.
   23876     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
   23877     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
   23878       MachinePointerInfo(), false, false, 0);
   23879 
   23880     EVT ElementType = InputVector.getValueType().getVectorElementType();
   23881     unsigned EltSize = ElementType.getSizeInBits() / 8;
   23882 
   23883     // Replace each use (extract) with a load of the appropriate element.
   23884     for (unsigned i = 0; i < 4; ++i) {
   23885       uint64_t Offset = EltSize * i;
   23886       auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   23887       SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
   23888 
   23889       SDValue ScalarAddr =
   23890           DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
   23891 
   23892       // Load the scalar.
   23893       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
   23894                             ScalarAddr, MachinePointerInfo(),
   23895                             false, false, false, 0);
   23896 
   23897     }
   23898   }
   23899 
   23900   // Replace the extracts
   23901   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
   23902     UE = Uses.end(); UI != UE; ++UI) {
   23903     SDNode *Extract = *UI;
   23904 
   23905     SDValue Idx = Extract->getOperand(1);
   23906     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   23907     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
   23908   }
   23909 
   23910   // The replacement was made in place; don't return anything.
   23911   return SDValue();
   23912 }
   23913 
   23914 static SDValue
   23915 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
   23916                                       const X86Subtarget *Subtarget) {
   23917   SDLoc dl(N);
   23918   SDValue Cond = N->getOperand(0);
   23919   SDValue LHS = N->getOperand(1);
   23920   SDValue RHS = N->getOperand(2);
   23921 
   23922   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
   23923     SDValue CondSrc = Cond->getOperand(0);
   23924     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
   23925       Cond = CondSrc->getOperand(0);
   23926   }
   23927 
   23928   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
   23929     return SDValue();
   23930 
   23931   // A vselect where all conditions and data are constants can be optimized into
   23932   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   23933   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
   23934       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
   23935     return SDValue();
   23936 
   23937   unsigned MaskValue = 0;
   23938   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
   23939     return SDValue();
   23940 
   23941   MVT VT = N->getSimpleValueType(0);
   23942   unsigned NumElems = VT.getVectorNumElements();
   23943   SmallVector<int, 8> ShuffleMask(NumElems, -1);
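            // Bit i of MaskValue selects lane i of the result: shuffle index i (LHS)
            // when the bit is clear, NumElems + i (RHS) when it is set. E.g. MaskValue
            // 0b0101 on a 4-element vector gives the mask {4, 1, 6, 3}.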
   23944   for (unsigned i = 0; i < NumElems; ++i) {
   23945     // Be sure we emit undef where we can.
   23946     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
   23947       ShuffleMask[i] = -1;
   23948     else
   23949       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
   23950   }
   23951 
   23952   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   23953   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
   23954     return SDValue();
   23955   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
   23956 }
   23957 
   23958 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
   23959 /// nodes.
   23960 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   23961                                     TargetLowering::DAGCombinerInfo &DCI,
   23962                                     const X86Subtarget *Subtarget) {
   23963   SDLoc DL(N);
   23964   SDValue Cond = N->getOperand(0);
   23965   // Get the LHS/RHS of the select.
   23966   SDValue LHS = N->getOperand(1);
   23967   SDValue RHS = N->getOperand(2);
   23968   EVT VT = LHS.getValueType();
   23969   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   23970 
   23971   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   23972   // instructions match the semantics of the common C idiom x<y?x:y but not
   23973   // x<=y?x:y, because of how they handle negative zero (which can be
   23974   // ignored in unsafe-math mode).
   23975   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
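            // (Roughly, MINPS(a, b) computes a < b ? a : b, returning its second operand
            // both when an input is NaN and when the inputs are +0.0 and -0.0, which is
            // why the strict '<' form maps directly but '<=' does not.)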
   23976   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
   23977       VT != MVT::f80 && VT != MVT::f128 &&
   23978       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
   23979       (Subtarget->hasSSE2() ||
   23980        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
   23981     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   23982 
   23983     unsigned Opcode = 0;
   23984     // Check for x CC y ? x : y.
   23985     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   23986         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   23987       switch (CC) {
   23988       default: break;
   23989       case ISD::SETULT:
   23990         // Converting this to a min would handle NaNs incorrectly, and swapping
   23991         // the operands would cause it to handle comparisons between positive
   23992         // and negative zero incorrectly.
   23993         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   23994           if (!DAG.getTarget().Options.UnsafeFPMath &&
   23995               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   23996             break;
   23997           std::swap(LHS, RHS);
   23998         }
   23999         Opcode = X86ISD::FMIN;
   24000         break;
   24001       case ISD::SETOLE:
   24002         // Converting this to a min would handle comparisons between positive
   24003         // and negative zero incorrectly.
   24004         if (!DAG.getTarget().Options.UnsafeFPMath &&
   24005             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   24006           break;
   24007         Opcode = X86ISD::FMIN;
   24008         break;
   24009       case ISD::SETULE:
   24010         // Converting this to a min would handle both negative zeros and NaNs
   24011         // incorrectly, but we can swap the operands to fix both.
   24012         std::swap(LHS, RHS);
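                  // FALL THROUGH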
   24013       case ISD::SETOLT:
   24014       case ISD::SETLT:
   24015       case ISD::SETLE:
   24016         Opcode = X86ISD::FMIN;
   24017         break;
   24018 
   24019       case ISD::SETOGE:
   24020         // Converting this to a max would handle comparisons between positive
   24021         // and negative zero incorrectly.
   24022         if (!DAG.getTarget().Options.UnsafeFPMath &&
   24023             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   24024           break;
   24025         Opcode = X86ISD::FMAX;
   24026         break;
   24027       case ISD::SETUGT:
   24028         // Converting this to a max would handle NaNs incorrectly, and swapping
   24029         // the operands would cause it to handle comparisons between positive
   24030         // and negative zero incorrectly.
   24031         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   24032           if (!DAG.getTarget().Options.UnsafeFPMath &&
   24033               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   24034             break;
   24035           std::swap(LHS, RHS);
   24036         }
   24037         Opcode = X86ISD::FMAX;
   24038         break;
   24039       case ISD::SETUGE:
   24040         // Converting this to a max would handle both negative zeros and NaNs
   24041         // incorrectly, but we can swap the operands to fix both.
   24042         std::swap(LHS, RHS);
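                  // FALL THROUGH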
   24043       case ISD::SETOGT:
   24044       case ISD::SETGT:
   24045       case ISD::SETGE:
   24046         Opcode = X86ISD::FMAX;
   24047         break;
   24048       }
   24049     // Check for x CC y ? y : x -- a min/max with reversed arms.
   24050     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
   24051                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
   24052       switch (CC) {
   24053       default: break;
   24054       case ISD::SETOGE:
   24055         // Converting this to a min would handle comparisons between positive
   24056         // and negative zero incorrectly, and swapping the operands would
   24057         // cause it to handle NaNs incorrectly.
   24058         if (!DAG.getTarget().Options.UnsafeFPMath &&
   24059             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
   24060           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   24061             break;
   24062           std::swap(LHS, RHS);
   24063         }
   24064         Opcode = X86ISD::FMIN;
   24065         break;
   24066       case ISD::SETUGT:
   24067         // Converting this to a min would handle NaNs incorrectly.
   24068         if (!DAG.getTarget().Options.UnsafeFPMath &&
   24069             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
   24070           break;
   24071         Opcode = X86ISD::FMIN;
   24072         break;
   24073       case ISD::SETUGE:
   24074         // Converting this to a min would handle both negative zeros and NaNs
   24075         // incorrectly, but we can swap the operands to fix both.
   24076         std::swap(LHS, RHS);
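                  // FALL THROUGH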
   24077       case ISD::SETOGT:
   24078       case ISD::SETGT:
   24079       case ISD::SETGE:
   24080         Opcode = X86ISD::FMIN;
   24081         break;
   24082 
   24083       case ISD::SETULT:
   24084         // Converting this to a max would handle NaNs incorrectly.
   24085         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   24086           break;
   24087         Opcode = X86ISD::FMAX;
   24088         break;
   24089       case ISD::SETOLE:
   24090         // Converting this to a max would handle comparisons between positive
   24091         // and negative zero incorrectly, and swapping the operands would
   24092         // cause it to handle NaNs incorrectly.
   24093         if (!DAG.getTarget().Options.UnsafeFPMath &&
   24094             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
   24095           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   24096             break;
   24097           std::swap(LHS, RHS);
   24098         }
   24099         Opcode = X86ISD::FMAX;
   24100         break;
   24101       case ISD::SETULE:
   24102         // Converting this to a max would handle both negative zeros and NaNs
   24103         // incorrectly, but we can swap the operands to fix both.
   24104         std::swap(LHS, RHS);
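                  // FALL THROUGH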
   24105       case ISD::SETOLT:
   24106       case ISD::SETLT:
   24107       case ISD::SETLE:
   24108         Opcode = X86ISD::FMAX;
   24109         break;
   24110       }
   24111     }
   24112 
   24113     if (Opcode)
   24114       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   24115   }
   24116 
   24117   EVT CondVT = Cond.getValueType();
   24118   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
   24119       CondVT.getVectorElementType() == MVT::i1) {
    24120     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
    24121     // lowering on KNL. In this case we convert it to
    24122     // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
    24123     // The same applies to all 128-bit and 256-bit vectors of i8 and i16.
    24124     // On SKX and later these selects have a proper lowering.
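              // For example, on KNL (no BWI/VLX) a (v32i8 (vselect v32i1:$mask, $a, $b))
              // is rewritten below as
              // (v32i8 (vselect (v32i8 (sign_extend $mask)), $a, $b)).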
   24125     EVT OpVT = LHS.getValueType();
   24126     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
   24127         (OpVT.getVectorElementType() == MVT::i8 ||
   24128          OpVT.getVectorElementType() == MVT::i16) &&
   24129         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
   24130       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
   24131       DCI.AddToWorklist(Cond.getNode());
   24132       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
   24133     }
   24134   }
   24135   // If this is a select between two integer constants, try to do some
   24136   // optimizations.
   24137   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
   24138     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
   24139       // Don't do this for crazy integer types.
   24140       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
   24141         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
   24142         // so that TrueC (the true value) is larger than FalseC.
   24143         bool NeedsCondInvert = false;
   24144 
   24145         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
   24146             // Efficiently invertible.
   24147             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
   24148              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
   24149               isa<ConstantSDNode>(Cond.getOperand(1))))) {
   24150           NeedsCondInvert = true;
   24151           std::swap(TrueC, FalseC);
   24152         }
   24153 
   24154         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
   24155         if (FalseC->getAPIntValue() == 0 &&
   24156             TrueC->getAPIntValue().isPowerOf2()) {
   24157           if (NeedsCondInvert) // Invert the condition if needed.
   24158             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   24159                                DAG.getConstant(1, DL, Cond.getValueType()));
   24160 
   24161           // Zero extend the condition if needed.
   24162           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
   24163 
   24164           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   24165           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
   24166                              DAG.getConstant(ShAmt, DL, MVT::i8));
   24167         }
   24168 
    24169         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
   24170         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   24171           if (NeedsCondInvert) // Invert the condition if needed.
   24172             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   24173                                DAG.getConstant(1, DL, Cond.getValueType()));
   24174 
   24175           // Zero extend the condition if needed.
   24176           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   24177                              FalseC->getValueType(0), Cond);
   24178           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   24179                              SDValue(FalseC, 0));
   24180         }
   24181 
   24182         // Optimize cases that will turn into an LEA instruction.  This requires
   24183         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   24184         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   24185           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   24186           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   24187 
   24188           bool isFastMultiplier = false;
   24189           if (Diff < 10) {
   24190             switch ((unsigned char)Diff) {
   24191               default: break;
   24192               case 1:  // result = add base, cond
   24193               case 2:  // result = lea base(    , cond*2)
   24194               case 3:  // result = lea base(cond, cond*2)
   24195               case 4:  // result = lea base(    , cond*4)
   24196               case 5:  // result = lea base(cond, cond*4)
   24197               case 8:  // result = lea base(    , cond*8)
   24198               case 9:  // result = lea base(cond, cond*8)
   24199                 isFastMultiplier = true;
   24200                 break;
   24201             }
   24202           }
   24203 
   24204           if (isFastMultiplier) {
   24205             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   24206             if (NeedsCondInvert) // Invert the condition if needed.
   24207               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   24208                                  DAG.getConstant(1, DL, Cond.getValueType()));
   24209 
   24210             // Zero extend the condition if needed.
   24211             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   24212                                Cond);
   24213             // Scale the condition by the difference.
   24214             if (Diff != 1)
   24215               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   24216                                  DAG.getConstant(Diff, DL,
   24217                                                  Cond.getValueType()));
   24218 
   24219             // Add the base if non-zero.
   24220             if (FalseC->getAPIntValue() != 0)
   24221               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   24222                                  SDValue(FalseC, 0));
   24223             return Cond;
   24224           }
   24225         }
   24226       }
   24227   }
   24228 
   24229   // Canonicalize max and min:
   24230   // (x > y) ? x : y -> (x >= y) ? x : y
   24231   // (x < y) ? x : y -> (x <= y) ? x : y
   24232   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
   24233   // the need for an extra compare
   24234   // against zero. e.g.
    24235   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
   24236   // subl   %esi, %edi
   24237   // testl  %edi, %edi
   24238   // movl   $0, %eax
   24239   // cmovgl %edi, %eax
   24240   // =>
   24241   // xorl   %eax, %eax
    24242   // subl   %esi, %edi
   24243   // cmovsl %eax, %edi
   24244   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
   24245       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   24246       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   24247     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   24248     switch (CC) {
   24249     default: break;
   24250     case ISD::SETLT:
   24251     case ISD::SETGT: {
   24252       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
   24253       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
   24254                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
   24255       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
   24256     }
   24257     }
   24258   }
   24259 
   24260   // Early exit check
   24261   if (!TLI.isTypeLegal(VT))
   24262     return SDValue();
   24263 
   24264   // Match VSELECTs into subs with unsigned saturation.
   24265   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
   24266       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
   24267       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
   24268        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
   24269     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   24270 
   24271     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
   24272     // left side invert the predicate to simplify logic below.
   24273     SDValue Other;
   24274     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
   24275       Other = RHS;
   24276       CC = ISD::getSetCCInverse(CC, true);
   24277     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
   24278       Other = LHS;
   24279     }
   24280 
   24281     if (Other.getNode() && Other->getNumOperands() == 2 &&
   24282         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
   24283       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
   24284       SDValue CondRHS = Cond->getOperand(1);
   24285 
   24286       // Look for a general sub with unsigned saturation first.
   24287       // x >= y ? x-y : 0 --> subus x, y
   24288       // x >  y ? x-y : 0 --> subus x, y
   24289       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
   24290           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
   24291         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
   24292 
   24293       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
   24294         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
   24295           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
   24296             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
   24297               // If the RHS is a constant we have to reverse the const
   24298               // canonicalization.
    24299               // x > C-1 ? x + (-C) : 0 --> subus x, C
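                        // For example, with i8 elements and C == 64 this matches
                        // x > 63 ? x + 0xC0 : 0 and emits (subus x, 64).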
   24300               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
   24301                   CondRHSConst->getAPIntValue() ==
   24302                       (-OpRHSConst->getAPIntValue() - 1))
   24303                 return DAG.getNode(
   24304                     X86ISD::SUBUS, DL, VT, OpLHS,
   24305                     DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
   24306 
   24307           // Another special case: If C was a sign bit, the sub has been
   24308           // canonicalized into a xor.
   24309           // FIXME: Would it be better to use computeKnownBits to determine
   24310           //        whether it's safe to decanonicalize the xor?
   24311           // x s< 0 ? x^C : 0 --> subus x, C
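                    // For i8 elements with C == 0x80 this is x s< 0 ? x^0x80 : 0,
                    // which is equivalent to (subus x, 0x80).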
   24312           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
   24313               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
   24314               OpRHSConst->getAPIntValue().isSignBit())
   24315             // Note that we have to rebuild the RHS constant here to ensure we
   24316             // don't rely on particular values of undef lanes.
   24317             return DAG.getNode(
   24318                 X86ISD::SUBUS, DL, VT, OpLHS,
   24319                 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
   24320         }
   24321     }
   24322   }
   24323 
   24324   // Simplify vector selection if condition value type matches vselect
   24325   // operand type
   24326   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
   24327     assert(Cond.getValueType().isVector() &&
   24328            "vector select expects a vector selector!");
   24329 
   24330     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
   24331     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
   24332 
    24333     // Try to invert the condition if the true value is not all 1s and
    24334     // the false value is not all 0s.
   24335     if (!TValIsAllOnes && !FValIsAllZeros &&
   24336         // Check if the selector will be produced by CMPP*/PCMP*
   24337         Cond.getOpcode() == ISD::SETCC &&
   24338         // Check if SETCC has already been promoted
   24339         TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
   24340             CondVT) {
   24341       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
   24342       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
   24343 
   24344       if (TValIsAllZeros || FValIsAllOnes) {
   24345         SDValue CC = Cond.getOperand(2);
   24346         ISD::CondCode NewCC =
   24347           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
   24348                                Cond.getOperand(0).getValueType().isInteger());
   24349         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
   24350         std::swap(LHS, RHS);
   24351         TValIsAllOnes = FValIsAllOnes;
   24352         FValIsAllZeros = TValIsAllZeros;
   24353       }
   24354     }
   24355 
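              // vselect Cond, -1, 0 --> Cond
              // vselect Cond, -1, X --> (or Cond, X)
              // vselect Cond, X, 0  --> (and Cond, X)
              // (with bitcasts as needed to match the condition type).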
   24356     if (TValIsAllOnes || FValIsAllZeros) {
   24357       SDValue Ret;
   24358 
   24359       if (TValIsAllOnes && FValIsAllZeros)
   24360         Ret = Cond;
   24361       else if (TValIsAllOnes)
   24362         Ret =
   24363             DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
   24364       else if (FValIsAllZeros)
   24365         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
   24366                           DAG.getBitcast(CondVT, LHS));
   24367 
   24368       return DAG.getBitcast(VT, Ret);
   24369     }
   24370   }
   24371 
   24372   // We should generate an X86ISD::BLENDI from a vselect if its argument
   24373   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
   24374   // constants. This specific pattern gets generated when we split a
    24375   // selector for a 512-bit vector on a machine without AVX512 (but with
   24376   // 256-bit vectors), during legalization:
   24377   //
   24378   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
   24379   //
   24380   // Iff we find this pattern and the build_vectors are built from
   24381   // constants, we translate the vselect into a shuffle_vector that we
   24382   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
   24383   if ((N->getOpcode() == ISD::VSELECT ||
   24384        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
   24385       !DCI.isBeforeLegalize() && !VT.is512BitVector()) {
   24386     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
   24387     if (Shuffle.getNode())
   24388       return Shuffle;
   24389   }
   24390 
   24391   // If this is a *dynamic* select (non-constant condition) and we can match
   24392   // this node with one of the variable blend instructions, restructure the
   24393   // condition so that the blends can use the high bit of each element and use
   24394   // SimplifyDemandedBits to simplify the condition operand.
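            // For example, a v16i8 blend lowered to PBLENDVB only reads the sign bit
            // of each mask byte, so only the high bit of each condition element is
            // demanded below.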
   24395   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
   24396       !DCI.isBeforeLegalize() &&
   24397       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
   24398     unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
   24399 
   24400     // Don't optimize vector selects that map to mask-registers.
   24401     if (BitWidth == 1)
   24402       return SDValue();
   24403 
   24404     // We can only handle the cases where VSELECT is directly legal on the
   24405     // subtarget. We custom lower VSELECT nodes with constant conditions and
   24406     // this makes it hard to see whether a dynamic VSELECT will correctly
   24407     // lower, so we both check the operation's status and explicitly handle the
   24408     // cases where a *dynamic* blend will fail even though a constant-condition
   24409     // blend could be custom lowered.
   24410     // FIXME: We should find a better way to handle this class of problems.
   24411     // Potentially, we should combine constant-condition vselect nodes
   24412     // pre-legalization into shuffles and not mark as many types as custom
   24413     // lowered.
   24414     if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
   24415       return SDValue();
   24416     // FIXME: We don't support i16-element blends currently. We could and
   24417     // should support them by making *all* the bits in the condition be set
   24418     // rather than just the high bit and using an i8-element blend.
   24419     if (VT.getVectorElementType() == MVT::i16)
   24420       return SDValue();
   24421     // Dynamic blending was only available from SSE4.1 onward.
   24422     if (VT.is128BitVector() && !Subtarget->hasSSE41())
   24423       return SDValue();
    24424     // Byte blends are only available in AVX2.
   24425     if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
   24426       return SDValue();
   24427 
   24428     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
   24429     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
   24430 
   24431     APInt KnownZero, KnownOne;
   24432     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
   24433                                           DCI.isBeforeLegalizeOps());
   24434     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
   24435         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
   24436                                  TLO)) {
   24437       // If we changed the computation somewhere in the DAG, this change
   24438       // will affect all users of Cond.
   24439       // Make sure it is fine and update all the nodes so that we do not
   24440       // use the generic VSELECT anymore. Otherwise, we may perform
   24441       // wrong optimizations as we messed up with the actual expectation
   24442       // for the vector boolean values.
   24443       if (Cond != TLO.Old) {
    24444         // Check all uses of the condition operand to see whether it will be
    24445         // consumed by non-BLEND instructions, which may depend on all bits
    24446         // being set properly.
   24447         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
   24448              I != E; ++I)
   24449           if (I->getOpcode() != ISD::VSELECT)
   24450             // TODO: Add other opcodes eventually lowered into BLEND.
   24451             return SDValue();
   24452 
   24453         // Update all the users of the condition, before committing the change,
   24454         // so that the VSELECT optimizations that expect the correct vector
   24455         // boolean value will not be triggered.
   24456         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
   24457              I != E; ++I)
   24458           DAG.ReplaceAllUsesOfValueWith(
   24459               SDValue(*I, 0),
   24460               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
   24461                           Cond, I->getOperand(1), I->getOperand(2)));
   24462         DCI.CommitTargetLoweringOpt(TLO);
   24463         return SDValue();
   24464       }
   24465       // At this point, only Cond is changed. Change the condition
   24466       // just for N to keep the opportunity to optimize all other
    24467       // users in their own way.
   24468       DAG.ReplaceAllUsesOfValueWith(
   24469           SDValue(N, 0),
   24470           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
   24471                       TLO.New, N->getOperand(1), N->getOperand(2)));
   24472       return SDValue();
   24473     }
   24474   }
   24475 
   24476   return SDValue();
   24477 }
   24478 
   24479 // Check whether a boolean test is testing a boolean value generated by
   24480 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
   24481 // code.
   24482 //
   24483 // Simplify the following patterns:
   24484 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
   24485 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
   24486 // to (Op EFLAGS Cond)
   24487 //
   24488 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
   24489 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
   24490 // to (Op EFLAGS !Cond)
   24491 //
   24492 // where Op could be BRCOND or CMOV.
   24493 //
   24494 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
    24495   // Quit unless this is a CMP, or a SUB whose value result is unused.
   24496   if (Cmp.getOpcode() != X86ISD::CMP &&
   24497       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
   24498       return SDValue();
   24499 
   24500   // Quit if not used as a boolean value.
   24501   if (CC != X86::COND_E && CC != X86::COND_NE)
   24502     return SDValue();
   24503 
   24504   // Check CMP operands. One of them should be 0 or 1 and the other should be
    24505   // a SetCC or extended from it.
   24506   SDValue Op1 = Cmp.getOperand(0);
   24507   SDValue Op2 = Cmp.getOperand(1);
   24508 
   24509   SDValue SetCC;
   24510   const ConstantSDNode* C = nullptr;
   24511   bool needOppositeCond = (CC == X86::COND_E);
   24512   bool checkAgainstTrue = false; // Is it a comparison against 1?
   24513 
   24514   if ((C = dyn_cast<ConstantSDNode>(Op1)))
   24515     SetCC = Op2;
   24516   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
   24517     SetCC = Op1;
    24518   else // Quit if neither operand is a constant.
   24519     return SDValue();
   24520 
   24521   if (C->getZExtValue() == 1) {
   24522     needOppositeCond = !needOppositeCond;
   24523     checkAgainstTrue = true;
   24524   } else if (C->getZExtValue() != 0)
    24525     // Quit if the constant is neither 0 nor 1.
   24526     return SDValue();
   24527 
   24528   bool truncatedToBoolWithAnd = false;
   24529   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
   24530   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
   24531          SetCC.getOpcode() == ISD::TRUNCATE ||
   24532          SetCC.getOpcode() == ISD::AND) {
   24533     if (SetCC.getOpcode() == ISD::AND) {
   24534       int OpIdx = -1;
   24535       if (isOneConstant(SetCC.getOperand(0)))
   24536         OpIdx = 1;
   24537       if (isOneConstant(SetCC.getOperand(1)))
   24538         OpIdx = 0;
   24539       if (OpIdx == -1)
   24540         break;
   24541       SetCC = SetCC.getOperand(OpIdx);
   24542       truncatedToBoolWithAnd = true;
   24543     } else
   24544       SetCC = SetCC.getOperand(0);
   24545   }
   24546 
   24547   switch (SetCC.getOpcode()) {
   24548   case X86ISD::SETCC_CARRY:
   24549     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
   24550     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
   24551     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
   24552     // truncated to i1 using 'and'.
   24553     if (checkAgainstTrue && !truncatedToBoolWithAnd)
   24554       break;
   24555     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
   24556            "Invalid use of SETCC_CARRY!");
   24557     // FALL THROUGH
   24558   case X86ISD::SETCC:
   24559     // Set the condition code or opposite one if necessary.
   24560     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
   24561     if (needOppositeCond)
   24562       CC = X86::GetOppositeBranchCondition(CC);
   24563     return SetCC.getOperand(1);
   24564   case X86ISD::CMOV: {
   24565     // Check whether false/true value has canonical one, i.e. 0 or 1.
   24566     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
   24567     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
   24568     // Quit if true value is not a constant.
   24569     if (!TVal)
   24570       return SDValue();
   24571     // Quit if false value is not a constant.
   24572     if (!FVal) {
   24573       SDValue Op = SetCC.getOperand(0);
   24574       // Skip 'zext' or 'trunc' node.
   24575       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
   24576           Op.getOpcode() == ISD::TRUNCATE)
   24577         Op = Op.getOperand(0);
   24578       // A special case for rdrand/rdseed, where 0 is set if false cond is
   24579       // found.
   24580       if ((Op.getOpcode() != X86ISD::RDRAND &&
   24581            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
   24582         return SDValue();
   24583     }
   24584     // Quit if false value is not the constant 0 or 1.
   24585     bool FValIsFalse = true;
   24586     if (FVal && FVal->getZExtValue() != 0) {
   24587       if (FVal->getZExtValue() != 1)
   24588         return SDValue();
   24589       // If FVal is 1, opposite cond is needed.
   24590       needOppositeCond = !needOppositeCond;
   24591       FValIsFalse = false;
   24592     }
   24593     // Quit if TVal is not the constant opposite of FVal.
   24594     if (FValIsFalse && TVal->getZExtValue() != 1)
   24595       return SDValue();
   24596     if (!FValIsFalse && TVal->getZExtValue() != 0)
   24597       return SDValue();
   24598     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
   24599     if (needOppositeCond)
   24600       CC = X86::GetOppositeBranchCondition(CC);
   24601     return SetCC.getOperand(3);
   24602   }
   24603   }
   24604 
   24605   return SDValue();
   24606 }
   24607 
   24608 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
   24609 /// Match:
   24610 ///   (X86or (X86setcc) (X86setcc))
   24611 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
   24612 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
   24613                                            X86::CondCode &CC1, SDValue &Flags,
   24614                                            bool &isAnd) {
   24615   if (Cond->getOpcode() == X86ISD::CMP) {
   24616     if (!isNullConstant(Cond->getOperand(1)))
   24617       return false;
   24618 
   24619     Cond = Cond->getOperand(0);
   24620   }
   24621 
   24622   isAnd = false;
   24623 
   24624   SDValue SetCC0, SetCC1;
   24625   switch (Cond->getOpcode()) {
   24626   default: return false;
   24627   case ISD::AND:
   24628   case X86ISD::AND:
   24629     isAnd = true;
   24630     // fallthru
   24631   case ISD::OR:
   24632   case X86ISD::OR:
   24633     SetCC0 = Cond->getOperand(0);
   24634     SetCC1 = Cond->getOperand(1);
   24635     break;
    24636   }
   24637 
   24638   // Make sure we have SETCC nodes, using the same flags value.
   24639   if (SetCC0.getOpcode() != X86ISD::SETCC ||
   24640       SetCC1.getOpcode() != X86ISD::SETCC ||
   24641       SetCC0->getOperand(1) != SetCC1->getOperand(1))
   24642     return false;
   24643 
   24644   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
   24645   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
   24646   Flags = SetCC0->getOperand(1);
   24647   return true;
   24648 }
   24649 
   24650 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
   24651 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
   24652                                   TargetLowering::DAGCombinerInfo &DCI,
   24653                                   const X86Subtarget *Subtarget) {
   24654   SDLoc DL(N);
   24655 
   24656   // If the flag operand isn't dead, don't touch this CMOV.
   24657   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
   24658     return SDValue();
   24659 
   24660   SDValue FalseOp = N->getOperand(0);
   24661   SDValue TrueOp = N->getOperand(1);
   24662   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
   24663   SDValue Cond = N->getOperand(3);
   24664 
   24665   if (CC == X86::COND_E || CC == X86::COND_NE) {
   24666     switch (Cond.getOpcode()) {
   24667     default: break;
   24668     case X86ISD::BSR:
   24669     case X86ISD::BSF:
    24670       // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
   24671       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
   24672         return (CC == X86::COND_E) ? FalseOp : TrueOp;
   24673     }
   24674   }
   24675 
   24676   SDValue Flags;
   24677 
   24678   Flags = checkBoolTestSetCCCombine(Cond, CC);
   24679   if (Flags.getNode() &&
   24680       // Extra check as FCMOV only supports a subset of X86 cond.
   24681       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
   24682     SDValue Ops[] = { FalseOp, TrueOp,
   24683                       DAG.getConstant(CC, DL, MVT::i8), Flags };
   24684     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   24685   }
   24686 
   24687   // If this is a select between two integer constants, try to do some
   24688   // optimizations.  Note that the operands are ordered the opposite of SELECT
   24689   // operands.
   24690   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
   24691     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
   24692       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
   24693       // larger than FalseC (the false value).
   24694       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
   24695         CC = X86::GetOppositeBranchCondition(CC);
   24696         std::swap(TrueC, FalseC);
   24697         std::swap(TrueOp, FalseOp);
   24698       }
   24699 
   24700       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
   24701       // This is efficient for any integer data type (including i8/i16) and
   24702       // shift amount.
   24703       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
   24704         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   24705                            DAG.getConstant(CC, DL, MVT::i8), Cond);
   24706 
   24707         // Zero extend the condition if needed.
   24708         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
   24709 
   24710         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   24711         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
   24712                            DAG.getConstant(ShAmt, DL, MVT::i8));
   24713         if (N->getNumValues() == 2)  // Dead flag value?
   24714           return DCI.CombineTo(N, Cond, SDValue());
   24715         return Cond;
   24716       }
   24717 
    24718       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is efficient
   24719       // for any integer data type, including i8/i16.
   24720       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   24721         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   24722                            DAG.getConstant(CC, DL, MVT::i8), Cond);
   24723 
   24724         // Zero extend the condition if needed.
   24725         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   24726                            FalseC->getValueType(0), Cond);
   24727         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   24728                            SDValue(FalseC, 0));
   24729 
   24730         if (N->getNumValues() == 2)  // Dead flag value?
   24731           return DCI.CombineTo(N, Cond, SDValue());
   24732         return Cond;
   24733       }
   24734 
   24735       // Optimize cases that will turn into an LEA instruction.  This requires
   24736       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   24737       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   24738         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   24739         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   24740 
   24741         bool isFastMultiplier = false;
   24742         if (Diff < 10) {
   24743           switch ((unsigned char)Diff) {
   24744           default: break;
   24745           case 1:  // result = add base, cond
   24746           case 2:  // result = lea base(    , cond*2)
   24747           case 3:  // result = lea base(cond, cond*2)
   24748           case 4:  // result = lea base(    , cond*4)
   24749           case 5:  // result = lea base(cond, cond*4)
   24750           case 8:  // result = lea base(    , cond*8)
   24751           case 9:  // result = lea base(cond, cond*8)
   24752             isFastMultiplier = true;
   24753             break;
   24754           }
   24755         }
   24756 
   24757         if (isFastMultiplier) {
   24758           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   24759           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   24760                              DAG.getConstant(CC, DL, MVT::i8), Cond);
   24761           // Zero extend the condition if needed.
   24762           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   24763                              Cond);
   24764           // Scale the condition by the difference.
   24765           if (Diff != 1)
   24766             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   24767                                DAG.getConstant(Diff, DL, Cond.getValueType()));
   24768 
   24769           // Add the base if non-zero.
   24770           if (FalseC->getAPIntValue() != 0)
   24771             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   24772                                SDValue(FalseC, 0));
   24773           if (N->getNumValues() == 2)  // Dead flag value?
   24774             return DCI.CombineTo(N, Cond, SDValue());
   24775           return Cond;
   24776         }
   24777       }
   24778     }
   24779   }
   24780 
   24781   // Handle these cases:
    24782   //   (select (x != c), e, c) -> (select (x != c), e, x),
    24783   //   (select (x == c), c, e) -> (select (x == c), x, e)
   24784   // where the c is an integer constant, and the "select" is the combination
   24785   // of CMOV and CMP.
   24786   //
   24787   // The rationale for this change is that the conditional-move from a constant
   24788   // needs two instructions, however, conditional-move from a register needs
   24789   // only one instruction.
   24790   //
   24791   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
   24792   //  some instruction-combining opportunities. This opt needs to be
   24793   //  postponed as late as possible.
   24794   //
   24795   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
   24796     // the DCI.xxxx conditions are provided to postpone the optimization as
   24797     // late as possible.
   24798 
   24799     ConstantSDNode *CmpAgainst = nullptr;
   24800     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
   24801         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
   24802         !isa<ConstantSDNode>(Cond.getOperand(0))) {
   24803 
   24804       if (CC == X86::COND_NE &&
   24805           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
   24806         CC = X86::GetOppositeBranchCondition(CC);
   24807         std::swap(TrueOp, FalseOp);
   24808       }
   24809 
   24810       if (CC == X86::COND_E &&
   24811           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
   24812         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
   24813                           DAG.getConstant(CC, DL, MVT::i8), Cond };
    24814         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   24815       }
   24816     }
   24817   }
   24818 
   24819   // Fold and/or of setcc's to double CMOV:
   24820   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
   24821   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
   24822   //
   24823   // This combine lets us generate:
   24824   //   cmovcc1 (jcc1 if we don't have CMOV)
   24825   //   cmovcc2 (same)
   24826   // instead of:
   24827   //   setcc1
   24828   //   setcc2
   24829   //   and/or
   24830   //   cmovne (jne if we don't have CMOV)
   24831   // When we can't use the CMOV instruction, it might increase branch
   24832   // mispredicts.
   24833   // When we can use CMOV, or when there is no mispredict, this improves
   24834   // throughput and reduces register pressure.
   24835   //
   24836   if (CC == X86::COND_NE) {
   24837     SDValue Flags;
   24838     X86::CondCode CC0, CC1;
   24839     bool isAndSetCC;
   24840     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
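                // For the AND form, apply De Morgan's law: (cc1 & cc2) holds exactly
                // when (!cc1 | !cc2) does not, so swap the arms and invert both
                // condition codes to reuse the OR-style lowering below.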
   24841       if (isAndSetCC) {
   24842         std::swap(FalseOp, TrueOp);
   24843         CC0 = X86::GetOppositeBranchCondition(CC0);
   24844         CC1 = X86::GetOppositeBranchCondition(CC1);
   24845       }
   24846 
   24847       SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
   24848         Flags};
   24849       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
   24850       SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
   24851       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   24852       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
   24853       return CMOV;
   24854     }
   24855   }
   24856 
   24857   return SDValue();
   24858 }
   24859 
   24860 /// PerformMulCombine - Optimize a single multiply with constant into two
   24861 /// in order to implement it with two cheaper instructions, e.g.
   24862 /// LEA + SHL, LEA + LEA.
   24863 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
   24864                                  TargetLowering::DAGCombinerInfo &DCI) {
   24865   // An imul is usually smaller than the alternative sequence.
   24866   if (DAG.getMachineFunction().getFunction()->optForMinSize())
   24867     return SDValue();
   24868 
   24869   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
   24870     return SDValue();
   24871 
   24872   EVT VT = N->getValueType(0);
   24873   if (VT != MVT::i64 && VT != MVT::i32)
   24874     return SDValue();
   24875 
   24876   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   24877   if (!C)
   24878     return SDValue();
   24879   uint64_t MulAmt = C->getZExtValue();
   24880   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
   24881     return SDValue();
   24882 
   24883   uint64_t MulAmt1 = 0;
   24884   uint64_t MulAmt2 = 0;
   24885   if ((MulAmt % 9) == 0) {
   24886     MulAmt1 = 9;
   24887     MulAmt2 = MulAmt / 9;
   24888   } else if ((MulAmt % 5) == 0) {
   24889     MulAmt1 = 5;
   24890     MulAmt2 = MulAmt / 5;
   24891   } else if ((MulAmt % 3) == 0) {
   24892     MulAmt1 = 3;
   24893     MulAmt2 = MulAmt / 3;
   24894   }
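            // For example, MulAmt == 45 decomposes into 9 * 5 (two LEAs), and
            // MulAmt == 40 decomposes into 5 * 8 (an LEA plus a shift).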
   24895 
   24896   SDLoc DL(N);
   24897   SDValue NewMul;
   24898   if (MulAmt2 &&
   24899       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
   24900 
   24901     if (isPowerOf2_64(MulAmt2) &&
   24902         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
    24903       // If the second multiplier is pow2, issue it first. We want the multiply by
   24904       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
   24905       // is an add.
   24906       std::swap(MulAmt1, MulAmt2);
   24907 
   24908     if (isPowerOf2_64(MulAmt1))
   24909       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   24910                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
   24911     else
   24912       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
   24913                            DAG.getConstant(MulAmt1, DL, VT));
   24914 
   24915     if (isPowerOf2_64(MulAmt2))
   24916       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
   24917                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
   24918     else
   24919       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
   24920                            DAG.getConstant(MulAmt2, DL, VT));
   24921   }
   24922 
   24923   if (!NewMul) {
   24924     assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
   24925            && "Both cases that could cause potential overflows should have "
   24926               "already been handled.");
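              // For example, (mul x, 17) becomes (add (shl x, 4), x) and
              // (mul x, 31) becomes (sub (shl x, 5), x).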
   24927     if (isPowerOf2_64(MulAmt - 1))
   24928       // (mul x, 2^N + 1) => (add (shl x, N), x)
   24929       NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
   24930                                 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   24931                                 DAG.getConstant(Log2_64(MulAmt - 1), DL,
   24932                                 MVT::i8)));
   24933 
   24934     else if (isPowerOf2_64(MulAmt + 1))
   24935       // (mul x, 2^N - 1) => (sub (shl x, N), x)
   24936       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
   24937                                 N->getOperand(0),
   24938                                 DAG.getConstant(Log2_64(MulAmt + 1),
   24939                                 DL, MVT::i8)), N->getOperand(0));
   24940   }
   24941 
   24942   if (NewMul)
   24943     // Do not add new nodes to DAG combiner worklist.
   24944     DCI.CombineTo(N, NewMul, false);
   24945 
   24946   return SDValue();
   24947 }
   24948 
   24949 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
   24950   SDValue N0 = N->getOperand(0);
   24951   SDValue N1 = N->getOperand(1);
   24952   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   24953   EVT VT = N0.getValueType();
   24954 
   24955   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
   24956   // since the result of setcc_c is all zero's or all ones.
   24957   if (VT.isInteger() && !VT.isVector() &&
   24958       N1C && N0.getOpcode() == ISD::AND &&
   24959       N0.getOperand(1).getOpcode() == ISD::Constant) {
   24960     SDValue N00 = N0.getOperand(0);
   24961     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
   24962     APInt ShAmt = N1C->getAPIntValue();
   24963     Mask = Mask.shl(ShAmt);
   24964     bool MaskOK = false;
   24965     // We can handle cases concerning bit-widening nodes containing setcc_c if
    24966     // we carefully interrogate the mask to make sure the transform is
    24967     // semantics preserving.
   24968     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
   24969     // of the underlying setcc_c operation if the setcc_c was zero extended.
   24970     // Consider the following example:
   24971     //   zext(setcc_c)                 -> i32 0x0000FFFF
   24972     //   c1                            -> i32 0x0000FFFF
   24973     //   c2                            -> i32 0x00000001
   24974     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
   24975     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
   24976     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   24977       MaskOK = true;
   24978     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
   24979                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   24980       MaskOK = true;
   24981     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
   24982                 N00.getOpcode() == ISD::ANY_EXTEND) &&
   24983                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   24984       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
   24985     }
   24986     if (MaskOK && Mask != 0) {
   24987       SDLoc DL(N);
   24988       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
   24989     }
   24990   }
   24991 
   24992   // Hardware support for vector shifts is sparse which makes us scalarize the
   24993   // vector operations in many cases. Also, on sandybridge ADD is faster than
   24994   // shl.
   24995   // (shl V, 1) -> add V,V
   24996   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
   24997     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
   24998       assert(N0.getValueType().isVector() && "Invalid vector shift type");
   24999       // We shift all of the values by one. In many cases we do not have
   25000       // hardware support for this operation. This is better expressed as an ADD
   25001       // of two values.
   25002       if (N1SplatC->getAPIntValue() == 1)
   25003         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
   25004     }
   25005 
   25006   return SDValue();
   25007 }
   25008 
   25009 static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
   25010   SDValue N0 = N->getOperand(0);
   25011   SDValue N1 = N->getOperand(1);
   25012   EVT VT = N0.getValueType();
   25013   unsigned Size = VT.getSizeInBits();
   25014 
    25015   // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
    25016   // into (shl (sext_inreg a), [56,48,32,24,16] - SarConst) or
    25017   // into (sra (sext_inreg a), SarConst - [56,48,32,24,16]),
    25018   // depending on the sign of (SarConst - [56,48,32,24,16]).
   25019 
   25020   // sexts in X86 are MOVs. The MOVs have the same code size
    25021   // as the above SHIFTs (only a SHIFT by 1 has smaller code size).
    25022   // However the MOVs have 2 advantages over a SHIFT:
    25023   // 1. MOVs can write to a register that differs from the source
   25024   // 2. MOVs accept memory operands
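          //
          // For example, for i32 (ashr (shl x, 24), 25) becomes
          // (sra (sext_inreg x, i8), 1), and (ashr (shl x, 24), 23) becomes
          // (shl (sext_inreg x, i8), 1).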
   25025 
   25026   if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
   25027       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
   25028       N0.getOperand(1).getOpcode() != ISD::Constant)
   25029     return SDValue();
   25030 
   25031   SDValue N00 = N0.getOperand(0);
   25032   SDValue N01 = N0.getOperand(1);
   25033   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
   25034   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
   25035   EVT CVT = N1.getValueType();
   25036 
   25037   if (SarConst.isNegative())
   25038     return SDValue();
   25039 
   25040   for (MVT SVT : MVT::integer_valuetypes()) {
   25041     unsigned ShiftSize = SVT.getSizeInBits();
    25042     // Skip types without a corresponding sext/zext and ShlConst values
    25043     // that are not one of [56,48,32,24,16].
   25044     if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
   25045       continue;
   25046     SDLoc DL(N);
   25047     SDValue NN =
   25048         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
   25049     SarConst = SarConst - (Size - ShiftSize);
   25050     if (SarConst == 0)
   25051       return NN;
   25052     else if (SarConst.isNegative())
   25053       return DAG.getNode(ISD::SHL, DL, VT, NN,
   25054                          DAG.getConstant(-SarConst, DL, CVT));
   25055     else
   25056       return DAG.getNode(ISD::SRA, DL, VT, NN,
   25057                          DAG.getConstant(SarConst, DL, CVT));
   25058   }
   25059   return SDValue();
   25060 }
   25061 
    25062 /// \brief Returns a vector of 0s if the input node is a vector logical
   25063 /// shift by a constant amount which is known to be bigger than or equal
   25064 /// to the vector element size in bits.
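          /// For example, (srl v4i32:$x, <i32 32, i32 32, i32 32, i32 32>) is replaced
          /// with a zero vector.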
   25065 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
   25066                                       const X86Subtarget *Subtarget) {
   25067   EVT VT = N->getValueType(0);
   25068 
   25069   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
   25070       (!Subtarget->hasInt256() ||
   25071        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
   25072     return SDValue();
   25073 
   25074   SDValue Amt = N->getOperand(1);
   25075   SDLoc DL(N);
   25076   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
   25077     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
   25078       APInt ShiftAmt = AmtSplat->getAPIntValue();
   25079       unsigned MaxAmount =
   25080         VT.getSimpleVT().getVectorElementType().getSizeInBits();
   25081 
   25082       // SSE2/AVX2 logical shifts always return a vector of 0s
   25083       // if the shift amount is bigger than or equal to
   25084       // the element size. The constant shift amount will be
    25085       // encoded as an 8-bit immediate.
   25086       if (ShiftAmt.trunc(8).uge(MaxAmount))
   25087         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
   25088     }
   25089 
   25090   return SDValue();
   25091 }
   25092 
   25093 /// PerformShiftCombine - Combine shifts.
   25094 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   25095                                    TargetLowering::DAGCombinerInfo &DCI,
   25096                                    const X86Subtarget *Subtarget) {
   25097   if (N->getOpcode() == ISD::SHL)
   25098     if (SDValue V = PerformSHLCombine(N, DAG))
   25099       return V;
   25100 
   25101   if (N->getOpcode() == ISD::SRA)
   25102     if (SDValue V = PerformSRACombine(N, DAG))
   25103       return V;
   25104 
   25105   // Try to fold this logical shift into a zero vector.
   25106   if (N->getOpcode() != ISD::SRA)
   25107     if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
   25108       return V;
   25109 
   25110   return SDValue();
   25111 }
   25112 
    25113 // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ...))
   25114 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
   25115 // and friends.  Likewise for OR -> CMPNEQSS.
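          //
          // For example, (and (setcc E, (cmp x, y)) (setcc NP, (cmp x, y))) with f32/f64
          // operands becomes an X86ISD::FSETCC (CMPEQSS/CMPEQSD) of x and y whose
          // all-ones/all-zeroes result is then masked down to a single bit.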
   25116 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
   25117                             TargetLowering::DAGCombinerInfo &DCI,
   25118                             const X86Subtarget *Subtarget) {
   25119   unsigned opcode;
   25120 
   25121   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
   25122   // we're requiring SSE2 for both.
   25123   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
   25124     SDValue N0 = N->getOperand(0);
   25125     SDValue N1 = N->getOperand(1);
   25126     SDValue CMP0 = N0->getOperand(1);
   25127     SDValue CMP1 = N1->getOperand(1);
   25128     SDLoc DL(N);
   25129 
   25130     // The SETCCs should both refer to the same CMP.
   25131     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
   25132       return SDValue();
   25133 
   25134     SDValue CMP00 = CMP0->getOperand(0);
   25135     SDValue CMP01 = CMP0->getOperand(1);
   25136     EVT     VT    = CMP00.getValueType();
   25137 
   25138     if (VT == MVT::f32 || VT == MVT::f64) {
   25139       bool ExpectingFlags = false;
   25140       // Check for any users that want flags:
   25141       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
   25142            !ExpectingFlags && UI != UE; ++UI)
   25143         switch (UI->getOpcode()) {
   25144         default:
   25145         case ISD::BR_CC:
   25146         case ISD::BRCOND:
   25147         case ISD::SELECT:
   25148           ExpectingFlags = true;
   25149           break;
   25150         case ISD::CopyToReg:
   25151         case ISD::SIGN_EXTEND:
   25152         case ISD::ZERO_EXTEND:
   25153         case ISD::ANY_EXTEND:
   25154           break;
   25155         }
   25156 
   25157       if (!ExpectingFlags) {
   25158         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
   25159         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
   25160 
   25161         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
   25162           X86::CondCode tmp = cc0;
   25163           cc0 = cc1;
   25164           cc1 = tmp;
   25165         }
   25166 
   25167         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
   25168             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
   25169           // FIXME: need symbolic constants for these magic numbers.
   25170           // See X86ATTInstPrinter.cpp:printSSECC().
   25171           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
   25172           if (Subtarget->hasAVX512()) {
   25173             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
   25174                                          CMP01,
   25175                                          DAG.getConstant(x86cc, DL, MVT::i8));
   25176             if (N->getValueType(0) != MVT::i1)
   25177               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
   25178                                  FSetCC);
   25179             return FSetCC;
   25180           }
   25181           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
   25182                                               CMP00.getValueType(), CMP00, CMP01,
   25183                                               DAG.getConstant(x86cc, DL,
   25184                                                               MVT::i8));
   25185 
   25186           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
   25187           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
   25188 
   25189           if (is64BitFP && !Subtarget->is64Bit()) {
   25190             // On a 32-bit target, we cannot bitcast the 64-bit float to a
   25191             // 64-bit integer, since that's not a legal type. Since
    25192             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
   25193             // bits, but can do this little dance to extract the lowest 32 bits
   25194             // and work with those going forward.
   25195             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
   25196                                            OnesOrZeroesF);
   25197             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
   25198             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
   25199                                         Vector32, DAG.getIntPtrConstant(0, DL));
   25200             IntVT = MVT::i32;
   25201           }
   25202 
   25203           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
   25204           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
   25205                                       DAG.getConstant(1, DL, IntVT));
   25206           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
   25207                                               ANDed);
   25208           return OneBitOfTruth;
   25209         }
   25210       }
   25211     }
   25212   }
   25213   return SDValue();
   25214 }
   25215 
    25216 /// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
   25217 /// so it can be folded inside ANDNP.
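          /// This lets (and (xor X, all-ones), Y) become (X86ISD::ANDNP X, Y) in
          /// PerformAndCombine below for the v2i64/v4i64 cases.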
   25218 static bool CanFoldXORWithAllOnes(const SDNode *N) {
   25219   EVT VT = N->getValueType(0);
   25220 
    25221   // Match direct AllOnes for 128- and 256-bit vectors
   25222   if (ISD::isBuildVectorAllOnes(N))
   25223     return true;
   25224 
   25225   // Look through a bit convert.
   25226   if (N->getOpcode() == ISD::BITCAST)
   25227     N = N->getOperand(0).getNode();
   25228 
    25229   // Sometimes the operand may come from an insert_subvector building a 256-bit
    25230   // allones vector.
   25231   if (VT.is256BitVector() &&
   25232       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
   25233     SDValue V1 = N->getOperand(0);
   25234     SDValue V2 = N->getOperand(1);
   25235 
   25236     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
   25237         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
   25238         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
   25239         ISD::isBuildVectorAllOnes(V2.getNode()))
   25240       return true;
   25241   }
   25242 
   25243   return false;
   25244 }
   25245 
    25246 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
   25247 // register. In most cases we actually compare or select YMM-sized registers
   25248 // and mixing the two types creates horrible code. This method optimizes
   25249 // some of the transition sequences.
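          // For example, (v8i32 (sext (v8i16 (and (trunc A), (trunc B))))) is
          // rewritten to operate on the wide type directly:
          //   (v8i32 (sign_extend_inreg (and A, B), v8i16))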
   25250 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
   25251                                  TargetLowering::DAGCombinerInfo &DCI,
   25252                                  const X86Subtarget *Subtarget) {
   25253   EVT VT = N->getValueType(0);
   25254   if (!VT.is256BitVector())
   25255     return SDValue();
   25256 
   25257   assert((N->getOpcode() == ISD::ANY_EXTEND ||
   25258           N->getOpcode() == ISD::ZERO_EXTEND ||
   25259           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
   25260 
   25261   SDValue Narrow = N->getOperand(0);
   25262   EVT NarrowVT = Narrow->getValueType(0);
   25263   if (!NarrowVT.is128BitVector())
   25264     return SDValue();
   25265 
   25266   if (Narrow->getOpcode() != ISD::XOR &&
   25267       Narrow->getOpcode() != ISD::AND &&
   25268       Narrow->getOpcode() != ISD::OR)
   25269     return SDValue();
   25270 
   25271   SDValue N0  = Narrow->getOperand(0);
   25272   SDValue N1  = Narrow->getOperand(1);
   25273   SDLoc DL(Narrow);
   25274 
    25275   // The left side has to be a trunc.
   25276   if (N0.getOpcode() != ISD::TRUNCATE)
   25277     return SDValue();
   25278 
   25279   // The type of the truncated inputs.
   25280   EVT WideVT = N0->getOperand(0)->getValueType(0);
   25281   if (WideVT != VT)
   25282     return SDValue();
   25283 
   25284   // The right side has to be a 'trunc' or a constant vector.
   25285   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
   25286   ConstantSDNode *RHSConstSplat = nullptr;
   25287   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
   25288     RHSConstSplat = RHSBV->getConstantSplatNode();
   25289   if (!RHSTrunc && !RHSConstSplat)
   25290     return SDValue();
   25291 
   25292   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   25293 
   25294   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
   25295     return SDValue();
   25296 
   25297   // Set N0 and N1 to hold the inputs to the new wide operation.
   25298   N0 = N0->getOperand(0);
   25299   if (RHSConstSplat) {
   25300     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
   25301                      SDValue(RHSConstSplat, 0));
   25302     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
   25303     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
   25304   } else if (RHSTrunc) {
   25305     N1 = N1->getOperand(0);
   25306   }
   25307 
   25308   // Generate the wide operation.
   25309   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
   25310   unsigned Opcode = N->getOpcode();
   25311   switch (Opcode) {
   25312   case ISD::ANY_EXTEND:
   25313     return Op;
   25314   case ISD::ZERO_EXTEND: {
   25315     unsigned InBits = NarrowVT.getScalarSizeInBits();
   25316     APInt Mask = APInt::getAllOnesValue(InBits);
   25317     Mask = Mask.zext(VT.getScalarSizeInBits());
   25318     return DAG.getNode(ISD::AND, DL, VT,
   25319                        Op, DAG.getConstant(Mask, DL, VT));
   25320   }
   25321   case ISD::SIGN_EXTEND:
   25322     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
   25323                        Op, DAG.getValueType(NarrowVT));
   25324   default:
   25325     llvm_unreachable("Unexpected opcode");
   25326   }
   25327 }
   25328 
   25329 static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
   25330                                  TargetLowering::DAGCombinerInfo &DCI,
   25331                                  const X86Subtarget *Subtarget) {
   25332   SDValue N0 = N->getOperand(0);
   25333   SDValue N1 = N->getOperand(1);
   25334   SDLoc DL(N);
   25335 
   25336   // A vector zext_in_reg may be represented as a shuffle,
   25337   // feeding into a bitcast (this represents anyext) feeding into
   25338   // an and with a mask.
   25339   // We'd like to try to combine that into a shuffle with zero
   25340   // plus a bitcast, removing the and.
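            // For example, with a v8i16 source and a v4i32 result, roughly:
            //   (and (bitcast (shuffle X, undef, <0,u,1,u,2,u,3,u>)), (splat 0xFFFF))
            // becomes
            //   (bitcast (shuffle X, zero, <0,8,1,8,2,8,3,8>))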
   25341   if (N0.getOpcode() != ISD::BITCAST ||
   25342       N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
   25343     return SDValue();
   25344 
    25345   // The other side of the AND should be a splat of 2^C - 1, where C
   25346   // is the number of bits in the source type.
   25347   if (N1.getOpcode() == ISD::BITCAST)
   25348     N1 = N1.getOperand(0);
   25349   if (N1.getOpcode() != ISD::BUILD_VECTOR)
   25350     return SDValue();
   25351   BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
   25352 
   25353   ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
   25354   EVT SrcType = Shuffle->getValueType(0);
   25355 
   25356   // We expect a single-source shuffle
   25357   if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
   25358     return SDValue();
   25359 
   25360   unsigned SrcSize = SrcType.getScalarSizeInBits();
   25361 
   25362   APInt SplatValue, SplatUndef;
   25363   unsigned SplatBitSize;
   25364   bool HasAnyUndefs;
   25365   if (!Vector->isConstantSplat(SplatValue, SplatUndef,
   25366                                 SplatBitSize, HasAnyUndefs))
   25367     return SDValue();
   25368 
   25369   unsigned ResSize = N1.getValueType().getScalarSizeInBits();
   25370   // Make sure the splat matches the mask we expect
   25371   if (SplatBitSize > ResSize ||
   25372       (SplatValue + 1).exactLogBase2() != (int)SrcSize)
   25373     return SDValue();
   25374 
   25375   // Make sure the input and output size make sense
   25376   if (SrcSize >= ResSize || ResSize % SrcSize)
   25377     return SDValue();
   25378 
   25379   // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
    25380   // The number of u's between consecutive values depends on the ratio between
   25381   // the source and dest type.
   25382   unsigned ZextRatio = ResSize / SrcSize;
   25383   bool IsZext = true;
   25384   for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
   25385     if (i % ZextRatio) {
   25386       if (Shuffle->getMaskElt(i) > 0) {
   25387         // Expected undef
   25388         IsZext = false;
   25389         break;
   25390       }
   25391     } else {
   25392       if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
   25393         // Expected element number
   25394         IsZext = false;
   25395         break;
   25396       }
   25397     }
   25398   }
   25399 
   25400   if (!IsZext)
   25401     return SDValue();
   25402 
   25403   // Ok, perform the transformation - replace the shuffle with
   25404   // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
   25405   // (instead of undef) where the k elements come from the zero vector.
   25406   SmallVector<int, 8> Mask;
   25407   unsigned NumElems = SrcType.getVectorNumElements();
   25408   for (unsigned i = 0; i < NumElems; ++i)
   25409     if (i % ZextRatio)
   25410       Mask.push_back(NumElems);
   25411     else
   25412       Mask.push_back(i / ZextRatio);
   25413 
   25414   SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
   25415     Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
   25416   return DAG.getBitcast(N0.getValueType(), NewShuffle);
   25417 }
   25418 
   25419 /// If both input operands of a logic op are being cast from floating point
   25420 /// types, try to convert this into a floating point logic node to avoid
   25421 /// unnecessary moves from SSE to integer registers.
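          /// For example, (i32 (and (bitcast f32:X), (bitcast f32:Y))) becomes
          /// (bitcast (f32 (X86ISD::FAND X, Y))), keeping the values in XMM registers.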
   25422 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
   25423                                         const X86Subtarget *Subtarget) {
   25424   unsigned FPOpcode = ISD::DELETED_NODE;
   25425   if (N->getOpcode() == ISD::AND)
   25426     FPOpcode = X86ISD::FAND;
   25427   else if (N->getOpcode() == ISD::OR)
   25428     FPOpcode = X86ISD::FOR;
   25429   else if (N->getOpcode() == ISD::XOR)
   25430     FPOpcode = X86ISD::FXOR;
   25431 
   25432   assert(FPOpcode != ISD::DELETED_NODE &&
   25433          "Unexpected input node for FP logic conversion");
   25434 
   25435   EVT VT = N->getValueType(0);
   25436   SDValue N0 = N->getOperand(0);
   25437   SDValue N1 = N->getOperand(1);
   25438   SDLoc DL(N);
   25439   if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
   25440       ((Subtarget->hasSSE1() && VT == MVT::i32) ||
   25441        (Subtarget->hasSSE2() && VT == MVT::i64))) {
   25442     SDValue N00 = N0.getOperand(0);
   25443     SDValue N10 = N1.getOperand(0);
   25444     EVT N00Type = N00.getValueType();
   25445     EVT N10Type = N10.getValueType();
   25446     if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
   25447       SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
   25448       return DAG.getBitcast(VT, FPLogic);
   25449     }
   25450   }
   25451   return SDValue();
   25452 }
   25453 
   25454 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   25455                                  TargetLowering::DAGCombinerInfo &DCI,
   25456                                  const X86Subtarget *Subtarget) {
   25457   if (DCI.isBeforeLegalizeOps())
   25458     return SDValue();
   25459 
   25460   if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
   25461     return Zext;
   25462 
   25463   if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
   25464     return R;
   25465 
   25466   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
   25467     return FPLogic;
   25468 
   25469   EVT VT = N->getValueType(0);
   25470   SDValue N0 = N->getOperand(0);
   25471   SDValue N1 = N->getOperand(1);
   25472   SDLoc DL(N);
   25473 
   25474   // Create BEXTR instructions
   25475   // BEXTR is ((X >> imm) & (2**size-1))
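            // The immediate below packs the start bit into bits 7:0 and the length
            // into bits 15:8 of the BEXTR control value; e.g. (and (srl x, 4), 0xFFF)
            // becomes (bextr x, (12 << 8) | 4).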
   25476   if (VT == MVT::i32 || VT == MVT::i64) {
   25477     // Check for BEXTR.
   25478     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
   25479         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
   25480       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
   25481       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   25482       if (MaskNode && ShiftNode) {
   25483         uint64_t Mask = MaskNode->getZExtValue();
   25484         uint64_t Shift = ShiftNode->getZExtValue();
   25485         if (isMask_64(Mask)) {
   25486           uint64_t MaskSize = countPopulation(Mask);
   25487           if (Shift + MaskSize <= VT.getSizeInBits())
   25488             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
   25489                                DAG.getConstant(Shift | (MaskSize << 8), DL,
   25490                                                VT));
   25491         }
   25492       }
   25493     } // BEXTR
   25494 
   25495     return SDValue();
   25496   }
   25497 
   25498   // Want to form ANDNP nodes:
   25499   // 1) In the hopes of then easily combining them with OR and AND nodes
   25500   //    to form PBLEND/PSIGN.
   25501   // 2) To match ANDN packed intrinsics
   25502   if (VT != MVT::v2i64 && VT != MVT::v4i64)
   25503     return SDValue();
   25504 
   25505   // Check LHS for vnot
   25506   if (N0.getOpcode() == ISD::XOR &&
   25507       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
   25508       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
   25509     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
   25510 
   25511   // Check RHS for vnot
   25512   if (N1.getOpcode() == ISD::XOR &&
   25513       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
   25514       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
   25515     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
   25516 
   25517   return SDValue();
   25518 }
   25519 
   25520 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   25521                                 TargetLowering::DAGCombinerInfo &DCI,
   25522                                 const X86Subtarget *Subtarget) {
   25523   if (DCI.isBeforeLegalizeOps())
   25524     return SDValue();
   25525 
   25526   if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
   25527     return R;
   25528 
   25529   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
   25530     return FPLogic;
   25531 
   25532   SDValue N0 = N->getOperand(0);
   25533   SDValue N1 = N->getOperand(1);
   25534   EVT VT = N->getValueType(0);
   25535 
   25536   // look for psign/blend
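            // (or (and M, Y), (andnp M, X)) is a bitwise select: Y is taken where the
            // mask M is all ones and X where it is all zeroes. When M is a sign-bit
            // splat (an sra by EltBits-1), this maps onto PSIGN or PBLENDVB below.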
   25537   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
   25538     if (!Subtarget->hasSSSE3() ||
   25539         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
   25540       return SDValue();
   25541 
   25542     // Canonicalize pandn to RHS
   25543     if (N0.getOpcode() == X86ISD::ANDNP)
   25544       std::swap(N0, N1);
   25545     // or (and (m, y), (pandn m, x))
   25546     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
   25547       SDValue Mask = N1.getOperand(0);
   25548       SDValue X    = N1.getOperand(1);
   25549       SDValue Y;
   25550       if (N0.getOperand(0) == Mask)
   25551         Y = N0.getOperand(1);
   25552       if (N0.getOperand(1) == Mask)
   25553         Y = N0.getOperand(0);
   25554 
    25555       // Check to see if the mask appeared in both the AND and the ANDNP.
   25556       if (!Y.getNode())
   25557         return SDValue();
   25558 
   25559       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
   25560       // Look through mask bitcast.
   25561       if (Mask.getOpcode() == ISD::BITCAST)
   25562         Mask = Mask.getOperand(0);
   25563       if (X.getOpcode() == ISD::BITCAST)
   25564         X = X.getOperand(0);
   25565       if (Y.getOpcode() == ISD::BITCAST)
   25566         Y = Y.getOperand(0);
   25567 
   25568       EVT MaskVT = Mask.getValueType();
   25569 
   25570       // Validate that the Mask operand is a vector sra node.
   25571       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
   25572       // there is no psrai.b
   25573       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
   25574       unsigned SraAmt = ~0;
   25575       if (Mask.getOpcode() == ISD::SRA) {
   25576         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
   25577           if (auto *AmtConst = AmtBV->getConstantSplatNode())
   25578             SraAmt = AmtConst->getZExtValue();
   25579       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
   25580         SDValue SraC = Mask.getOperand(1);
   25581         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
   25582       }
   25583       if ((SraAmt + 1) != EltBits)
   25584         return SDValue();
   25585 
   25586       SDLoc DL(N);
   25587 
    25588       // Now we know we at least have a pblendvb with the mask val.  See if
   25589       // we can form a psignb/w/d.
   25590       // psign = x.type == y.type == mask.type && y = sub(0, x);
   25591       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
   25592           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
   25593           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
   25594         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
   25595                "Unsupported VT for PSIGN");
   25596         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
   25597         return DAG.getBitcast(VT, Mask);
   25598       }
   25599       // PBLENDVB only available on SSE 4.1
   25600       if (!Subtarget->hasSSE41())
   25601         return SDValue();
   25602 
   25603       MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
   25604 
   25605       X = DAG.getBitcast(BlendVT, X);
   25606       Y = DAG.getBitcast(BlendVT, Y);
   25607       Mask = DAG.getBitcast(BlendVT, Mask);
   25608       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
   25609       return DAG.getBitcast(VT, Mask);
   25610     }
   25611   }
   25612 
   25613   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
   25614     return SDValue();
   25615 
   25616   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
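            // Likewise, (or (x >> c) | (y << (64 - c))) ==> (shrd64 x, y, c).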
   25617   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
   25618 
   25619   // SHLD/SHRD instructions have lower register pressure, but on some
   25620   // platforms they have higher latency than the equivalent
   25621   // series of shifts/or that would otherwise be generated.
   25622   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
   25623   // have higher latencies and we are not optimizing for size.
   25624   if (!OptForSize && Subtarget->isSHLDSlow())
   25625     return SDValue();
   25626 
   25627   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
   25628     std::swap(N0, N1);
   25629   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
   25630     return SDValue();
   25631   if (!N0.hasOneUse() || !N1.hasOneUse())
   25632     return SDValue();
   25633 
   25634   SDValue ShAmt0 = N0.getOperand(1);
   25635   if (ShAmt0.getValueType() != MVT::i8)
   25636     return SDValue();
   25637   SDValue ShAmt1 = N1.getOperand(1);
   25638   if (ShAmt1.getValueType() != MVT::i8)
   25639     return SDValue();
   25640   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
   25641     ShAmt0 = ShAmt0.getOperand(0);
   25642   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
   25643     ShAmt1 = ShAmt1.getOperand(0);
   25644 
   25645   SDLoc DL(N);
   25646   unsigned Opc = X86ISD::SHLD;
   25647   SDValue Op0 = N0.getOperand(0);
   25648   SDValue Op1 = N1.getOperand(0);
   25649   if (ShAmt0.getOpcode() == ISD::SUB) {
   25650     Opc = X86ISD::SHRD;
   25651     std::swap(Op0, Op1);
   25652     std::swap(ShAmt0, ShAmt1);
   25653   }
   25654 
   25655   unsigned Bits = VT.getSizeInBits();
   25656   if (ShAmt1.getOpcode() == ISD::SUB) {
   25657     SDValue Sum = ShAmt1.getOperand(0);
   25658     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
   25659       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
   25660       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
   25661         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
   25662       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
   25663         return DAG.getNode(Opc, DL, VT,
   25664                            Op0, Op1,
   25665                            DAG.getNode(ISD::TRUNCATE, DL,
   25666                                        MVT::i8, ShAmt0));
   25667     }
   25668   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
   25669     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
   25670     if (ShAmt0C &&
   25671         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
   25672       return DAG.getNode(Opc, DL, VT,
   25673                          N0.getOperand(0), N1.getOperand(0),
   25674                          DAG.getNode(ISD::TRUNCATE, DL,
   25675                                        MVT::i8, ShAmt0));
   25676   }
   25677 
   25678   return SDValue();
   25679 }
   25680 
   25681 // Generate NEG and CMOV for integer abs.
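          // For i32, abs(x) in the form (x + y) ^ y with y = sra(x, 31) is rewritten
          // as a subtract from zero plus a CMOV on the subtract's flags, selecting
          // either x or 0 - x.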
   25682 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
   25683   EVT VT = N->getValueType(0);
   25684 
   25685   // Since X86 does not have CMOV for 8-bit integer, we don't convert
   25686   // 8-bit integer abs to NEG and CMOV.
   25687   if (VT.isInteger() && VT.getSizeInBits() == 8)
   25688     return SDValue();
   25689 
   25690   SDValue N0 = N->getOperand(0);
   25691   SDValue N1 = N->getOperand(1);
   25692   SDLoc DL(N);
   25693 
   25694   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
   25695   // and change it to SUB and CMOV.
   25696   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
   25697       N0.getOpcode() == ISD::ADD &&
   25698       N0.getOperand(1) == N1 &&
   25699       N1.getOpcode() == ISD::SRA &&
   25700       N1.getOperand(0) == N0.getOperand(0))
   25701     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
   25702       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
   25703         // Generate SUB & CMOV.
   25704         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
   25705                                   DAG.getConstant(0, DL, VT), N0.getOperand(0));
   25706 
   25707         SDValue Ops[] = { N0.getOperand(0), Neg,
   25708                           DAG.getConstant(X86::COND_GE, DL, MVT::i8),
   25709                           SDValue(Neg.getNode(), 1) };
   25710         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
   25711       }
   25712   return SDValue();
   25713 }
   25714 
   25715 // Try to turn tests against the signbit in the form of:
   25716 //   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
   25717 // into:
   25718 //   SETGT(X, -1)
   25719 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
   25720   // This is only worth doing if the output type is i8.
   25721   if (N->getValueType(0) != MVT::i8)
   25722     return SDValue();
   25723 
   25724   SDValue N0 = N->getOperand(0);
   25725   SDValue N1 = N->getOperand(1);
   25726 
   25727   // We should be performing an xor against a truncated shift.
   25728   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
   25729     return SDValue();
   25730 
   25731   // Make sure we are performing an xor against one.
   25732   if (!isOneConstant(N1))
   25733     return SDValue();
   25734 
   25735   // SetCC on x86 zero extends so only act on this if it's a logical shift.
   25736   SDValue Shift = N0.getOperand(0);
   25737   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
   25738     return SDValue();
   25739 
   25740   // Make sure we are truncating from one of i16, i32 or i64.
   25741   EVT ShiftTy = Shift.getValueType();
   25742   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
   25743     return SDValue();
   25744 
   25745   // Make sure the shift amount extracts the sign bit.
   25746   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
   25747       Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
   25748     return SDValue();
   25749 
   25750   // Create a greater-than comparison against -1.
   25751   // N.B. Using SETGE against 0 works but we want a canonical looking
    25752   // comparison; using SETGT matches up with what TranslateX86CC does.
   25753   SDLoc DL(N);
   25754   SDValue ShiftOp = Shift.getOperand(0);
   25755   EVT ShiftOpTy = ShiftOp.getValueType();
   25756   SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp,
   25757                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
   25758   return Cond;
   25759 }
   25760 
   25761 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
   25762                                  TargetLowering::DAGCombinerInfo &DCI,
   25763                                  const X86Subtarget *Subtarget) {
   25764   if (DCI.isBeforeLegalizeOps())
   25765     return SDValue();
   25766 
   25767   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
   25768     return RV;
   25769 
   25770   if (Subtarget->hasCMov())
   25771     if (SDValue RV = performIntegerAbsCombine(N, DAG))
   25772       return RV;
   25773 
   25774   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
   25775     return FPLogic;
   25776 
   25777   return SDValue();
   25778 }
   25779 
   25780 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
    25781 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
   25782 /// X86ISD::AVG instruction.
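          /// For example, for <16 x i8> operands,
          ///   (trunc (lshr (add (add (zext a), (zext b)), splat 1), splat 1))
          /// is matched and emitted as (X86ISD::AVG a, b), which selects to pavgb.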
   25783 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
   25784                                 const X86Subtarget *Subtarget, SDLoc DL) {
   25785   if (!VT.isVector() || !VT.isSimple())
   25786     return SDValue();
   25787   EVT InVT = In.getValueType();
   25788   unsigned NumElems = VT.getVectorNumElements();
   25789 
   25790   EVT ScalarVT = VT.getVectorElementType();
   25791   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
   25792         isPowerOf2_32(NumElems)))
   25793     return SDValue();
   25794 
    25795   // InScalarVT is the intermediate type in the AVG pattern; it should be wider
   25796   // than the original input type (i8/i16).
   25797   EVT InScalarVT = InVT.getVectorElementType();
   25798   if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
   25799     return SDValue();
   25800 
   25801   if (Subtarget->hasAVX512()) {
   25802     if (VT.getSizeInBits() > 512)
   25803       return SDValue();
   25804   } else if (Subtarget->hasAVX2()) {
   25805     if (VT.getSizeInBits() > 256)
   25806       return SDValue();
   25807   } else {
   25808     if (VT.getSizeInBits() > 128)
   25809       return SDValue();
   25810   }
   25811 
   25812   // Detect the following pattern:
   25813   //
   25814   //   %1 = zext <N x i8> %a to <N x i32>
   25815   //   %2 = zext <N x i8> %b to <N x i32>
   25816   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
   25817   //   %4 = add nuw nsw <N x i32> %3, %2
    25818   //   %5 = lshr <N x i32> %4, <i32 1 x N>
   25819   //   %6 = trunc <N x i32> %5 to <N x i8>
   25820   //
   25821   // In AVX512, the last instruction can also be a trunc store.
   25822 
   25823   if (In.getOpcode() != ISD::SRL)
   25824     return SDValue();
   25825 
    25826   // A lambda that checks whether the given SDValue is a constant vector and
    25827   // each element is in the range [Min, Max].
   25828   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
   25829     BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
   25830     if (!BV || !BV->isConstant())
   25831       return false;
   25832     for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
   25833       ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
   25834       if (!C)
   25835         return false;
   25836       uint64_t Val = C->getZExtValue();
   25837       if (Val < Min || Val > Max)
   25838         return false;
   25839     }
   25840     return true;
   25841   };
   25842 
    25843   // Check if each element of the vector is logically right-shifted by one.
   25844   auto LHS = In.getOperand(0);
   25845   auto RHS = In.getOperand(1);
   25846   if (!IsConstVectorInRange(RHS, 1, 1))
   25847     return SDValue();
   25848   if (LHS.getOpcode() != ISD::ADD)
   25849     return SDValue();
   25850 
   25851   // Detect a pattern of a + b + 1 where the order doesn't matter.
   25852   SDValue Operands[3];
   25853   Operands[0] = LHS.getOperand(0);
   25854   Operands[1] = LHS.getOperand(1);
   25855 
   25856   // Take care of the case when one of the operands is a constant vector whose
    25857   // element is in the range [1, 256] for i8, or [1, 65536] for i16.
   25858   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
   25859       Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
   25860       Operands[0].getOperand(0).getValueType() == VT) {
   25861     // The pattern is detected. Subtract one from the constant vector, then
   25862     // demote it and emit X86ISD::AVG instruction.
   25863     SDValue One = DAG.getConstant(1, DL, InScalarVT);
   25864     SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
   25865                                SmallVector<SDValue, 8>(NumElems, One));
   25866     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
   25867     Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
   25868     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
   25869                        Operands[1]);
   25870   }
   25871 
   25872   if (Operands[0].getOpcode() == ISD::ADD)
   25873     std::swap(Operands[0], Operands[1]);
   25874   else if (Operands[1].getOpcode() != ISD::ADD)
   25875     return SDValue();
   25876   Operands[2] = Operands[1].getOperand(0);
   25877   Operands[1] = Operands[1].getOperand(1);
   25878 
   25879   // Now we have three operands of two additions. Check that one of them is a
   25880   // constant vector with ones, and the other two are promoted from i8/i16.
   25881   for (int i = 0; i < 3; ++i) {
   25882     if (!IsConstVectorInRange(Operands[i], 1, 1))
   25883       continue;
   25884     std::swap(Operands[i], Operands[2]);
   25885 
   25886     // Check if Operands[0] and Operands[1] are results of type promotion.
   25887     for (int j = 0; j < 2; ++j)
   25888       if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
   25889           Operands[j].getOperand(0).getValueType() != VT)
   25890         return SDValue();
   25891 
   25892     // The pattern is detected, emit X86ISD::AVG instruction.
   25893     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
   25894                        Operands[1].getOperand(0));
   25895   }
   25896 
   25897   return SDValue();
   25898 }
   25899 
   25900 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
   25901 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   25902                                   TargetLowering::DAGCombinerInfo &DCI,
   25903                                   const X86Subtarget *Subtarget) {
   25904   LoadSDNode *Ld = cast<LoadSDNode>(N);
   25905   EVT RegVT = Ld->getValueType(0);
   25906   EVT MemVT = Ld->getMemoryVT();
   25907   SDLoc dl(Ld);
   25908   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   25909 
   25910   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
   25911   // into two 16-byte operations.
   25912   ISD::LoadExtType Ext = Ld->getExtensionType();
   25913   bool Fast;
   25914   unsigned AddressSpace = Ld->getAddressSpace();
   25915   unsigned Alignment = Ld->getAlignment();
   25916   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
   25917       Ext == ISD::NON_EXTLOAD &&
   25918       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
   25919                              AddressSpace, Alignment, &Fast) && !Fast) {
   25920     unsigned NumElems = RegVT.getVectorNumElements();
   25921     if (NumElems < 2)
   25922       return SDValue();
   25923 
   25924     SDValue Ptr = Ld->getBasePtr();
   25925     SDValue Increment =
   25926         DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
   25927 
   25928     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   25929                                   NumElems/2);
   25930     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
   25931                                 Ld->getPointerInfo(), Ld->isVolatile(),
   25932                                 Ld->isNonTemporal(), Ld->isInvariant(),
   25933                                 Alignment);
   25934     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   25935     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
   25936                                 Ld->getPointerInfo(), Ld->isVolatile(),
   25937                                 Ld->isNonTemporal(), Ld->isInvariant(),
   25938                                 std::min(16U, Alignment));
   25939     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   25940                              Load1.getValue(1),
   25941                              Load2.getValue(1));
   25942 
   25943     SDValue NewVec = DAG.getUNDEF(RegVT);
   25944     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
   25945     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
   25946     return DCI.CombineTo(N, NewVec, TF, true);
   25947   }
   25948 
   25949   return SDValue();
   25950 }
   25951 
   25952 /// PerformMLOADCombine - Resolve extending loads
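          /// For example, a masked sign-extending load from v8i16 to v8i32 becomes a
          /// v16i16 masked load (with the mask and pass-through value widened to match)
          /// followed by an X86ISD::VSEXT back to v8i32.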
   25953 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
   25954                                    TargetLowering::DAGCombinerInfo &DCI,
   25955                                    const X86Subtarget *Subtarget) {
   25956   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
   25957   if (Mld->getExtensionType() != ISD::SEXTLOAD)
   25958     return SDValue();
   25959 
   25960   EVT VT = Mld->getValueType(0);
   25961   unsigned NumElems = VT.getVectorNumElements();
   25962   EVT LdVT = Mld->getMemoryVT();
   25963   SDLoc dl(Mld);
   25964 
   25965   assert(LdVT != VT && "Cannot extend to the same type");
   25966   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
   25967   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
   25968   // From, To sizes and ElemCount must be pow of two
   25969   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
   25970     "Unexpected size for extending masked load");
   25971 
   25972   unsigned SizeRatio  = ToSz / FromSz;
   25973   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
   25974 
   25975   // Create a type on which we perform the shuffle
   25976   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   25977           LdVT.getScalarType(), NumElems*SizeRatio);
   25978   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   25979 
   25980   // Convert Src0 value
   25981   SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
   25982   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
   25983     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   25984     for (unsigned i = 0; i != NumElems; ++i)
   25985       ShuffleVec[i] = i * SizeRatio;
   25986 
   25987     // Can't shuffle using an illegal type.
   25988     assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
   25989            "WideVecVT should be legal");
   25990     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
   25991                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
   25992   }
   25993   // Prepare the new mask
   25994   SDValue NewMask;
   25995   SDValue Mask = Mld->getMask();
   25996   if (Mask.getValueType() == VT) {
   25997     // Mask and original value have the same type
   25998     NewMask = DAG.getBitcast(WideVecVT, Mask);
   25999     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   26000     for (unsigned i = 0; i != NumElems; ++i)
   26001       ShuffleVec[i] = i * SizeRatio;
   26002     for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
   26003       ShuffleVec[i] = NumElems * SizeRatio;
   26004     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
   26005                                    DAG.getConstant(0, dl, WideVecVT),
   26006                                    &ShuffleVec[0]);
   26007   }
   26008   else {
   26009     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
   26010     unsigned WidenNumElts = NumElems*SizeRatio;
   26011     unsigned MaskNumElts = VT.getVectorNumElements();
   26012     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
   26013                                      WidenNumElts);
   26014 
   26015     unsigned NumConcat = WidenNumElts / MaskNumElts;
   26016     SmallVector<SDValue, 16> Ops(NumConcat);
   26017     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
   26018     Ops[0] = Mask;
   26019     for (unsigned i = 1; i != NumConcat; ++i)
   26020       Ops[i] = ZeroVal;
   26021 
   26022     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
   26023   }
   26024 
   26025   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
   26026                                      Mld->getBasePtr(), NewMask, WideSrc0,
   26027                                      Mld->getMemoryVT(), Mld->getMemOperand(),
   26028                                      ISD::NON_EXTLOAD);
   26029   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
   26030   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
    26031 }

   26032 /// PerformMSTORECombine - Resolve truncating stores
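          /// For example, a masked truncating store of a v8i32 value as v8i16 becomes a
          /// shuffle that packs the low i16 halves of the elements into a v16i16,
          /// followed by a normal (non-truncating) masked store of that vector.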
   26033 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
   26034                                     const X86Subtarget *Subtarget) {
   26035   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
   26036   if (!Mst->isTruncatingStore())
   26037     return SDValue();
   26038 
   26039   EVT VT = Mst->getValue().getValueType();
   26040   unsigned NumElems = VT.getVectorNumElements();
   26041   EVT StVT = Mst->getMemoryVT();
   26042   SDLoc dl(Mst);
   26043 
   26044   assert(StVT != VT && "Cannot truncate to the same type");
   26045   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
   26046   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
   26047 
   26048   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   26049 
    26050   // The truncating store is already legal in some cases. For example,
    26051   // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw are dedicated
    26052   // truncating-store instructions, so no further transformation is
    26053   // needed here.
   26054   if (TLI.isTruncStoreLegal(VT, StVT))
   26055     return SDValue();
   26056 
   26057   // From, To sizes and ElemCount must be pow of two
   26058   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
   26059     "Unexpected size for truncating masked store");
   26060   // We are going to use the original vector elt for storing.
   26061   // Accumulated smaller vector elements must be a multiple of the store size.
   26062   assert (((NumElems * FromSz) % ToSz) == 0 &&
   26063           "Unexpected ratio for truncating masked store");
   26064 
   26065   unsigned SizeRatio  = FromSz / ToSz;
   26066   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   26067 
   26068   // Create a type on which we perform the shuffle
   26069   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   26070           StVT.getScalarType(), NumElems*SizeRatio);
   26071 
   26072   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   26073 
   26074   SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
   26075   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   26076   for (unsigned i = 0; i != NumElems; ++i)
   26077     ShuffleVec[i] = i * SizeRatio;
   26078 
   26079   // Can't shuffle using an illegal type.
   26080   assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
   26081          "WideVecVT should be legal");
   26082 
   26083   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   26084                                               DAG.getUNDEF(WideVecVT),
   26085                                               &ShuffleVec[0]);
   26086 
   26087   SDValue NewMask;
   26088   SDValue Mask = Mst->getMask();
   26089   if (Mask.getValueType() == VT) {
   26090     // Mask and original value have the same type
   26091     NewMask = DAG.getBitcast(WideVecVT, Mask);
   26092     for (unsigned i = 0; i != NumElems; ++i)
   26093       ShuffleVec[i] = i * SizeRatio;
   26094     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
   26095       ShuffleVec[i] = NumElems*SizeRatio;
   26096     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
   26097                                    DAG.getConstant(0, dl, WideVecVT),
   26098                                    &ShuffleVec[0]);
   26099   }
   26100   else {
   26101     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
   26102     unsigned WidenNumElts = NumElems*SizeRatio;
   26103     unsigned MaskNumElts = VT.getVectorNumElements();
   26104     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
   26105                                      WidenNumElts);
   26106 
   26107     unsigned NumConcat = WidenNumElts / MaskNumElts;
   26108     SmallVector<SDValue, 16> Ops(NumConcat);
   26109     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
   26110     Ops[0] = Mask;
   26111     for (unsigned i = 1; i != NumConcat; ++i)
   26112       Ops[i] = ZeroVal;
   26113 
   26114     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
   26115   }
   26116 
   26117   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
   26118                             Mst->getBasePtr(), NewMask, StVT,
   26119                             Mst->getMemOperand(), false);
    26120 }

   26121 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
   26122 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   26123                                    const X86Subtarget *Subtarget) {
   26124   StoreSDNode *St = cast<StoreSDNode>(N);
   26125   EVT VT = St->getValue().getValueType();
   26126   EVT StVT = St->getMemoryVT();
   26127   SDLoc dl(St);
   26128   SDValue StoredVal = St->getOperand(1);
   26129   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   26130 
   26131   // If we are saving a concatenation of two XMM registers and 32-byte stores
   26132   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
   26133   bool Fast;
   26134   unsigned AddressSpace = St->getAddressSpace();
   26135   unsigned Alignment = St->getAlignment();
   26136   if (VT.is256BitVector() && StVT == VT &&
   26137       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
   26138                              AddressSpace, Alignment, &Fast) && !Fast) {
   26139     unsigned NumElems = VT.getVectorNumElements();
   26140     if (NumElems < 2)
   26141       return SDValue();
   26142 
   26143     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
   26144     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
   26145 
   26146     SDValue Stride =
   26147         DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
   26148     SDValue Ptr0 = St->getBasePtr();
   26149     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
   26150 
   26151     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
   26152                                 St->getPointerInfo(), St->isVolatile(),
   26153                                 St->isNonTemporal(), Alignment);
   26154     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
   26155                                 St->getPointerInfo(), St->isVolatile(),
   26156                                 St->isNonTemporal(),
   26157                                 std::min(16U, Alignment));
   26158     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   26159   }
   26160 
   26161   // Optimize trunc store (of multiple scalars) to shuffle and store.
   26162   // First, pack all of the elements in one place. Next, store to memory
   26163   // in fewer chunks.
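            // For example, a v8i32-to-v8i16 truncating store is turned into a v16i16
            // shuffle that packs the low halves into the bottom 128 bits, which are
            // then written out as a couple of scalar (i64 or f64) stores.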
   26164   if (St->isTruncatingStore() && VT.isVector()) {
   26165     // Check if we can detect an AVG pattern from the truncation. If yes,
   26166     // replace the trunc store by a normal store with the result of X86ISD::AVG
   26167     // instruction.
   26168     SDValue Avg =
   26169         detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl);
   26170     if (Avg.getNode())
   26171       return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
   26172                           St->getPointerInfo(), St->isVolatile(),
   26173                           St->isNonTemporal(), St->getAlignment());
   26174 
   26175     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   26176     unsigned NumElems = VT.getVectorNumElements();
   26177     assert(StVT != VT && "Cannot truncate to the same type");
   26178     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
   26179     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
   26180 
    26181     // The truncating store is already legal in some cases. For example,
    26182     // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw are dedicated
    26183     // truncating-store instructions, so no further transformation is
    26184     // needed here.
   26185     if (TLI.isTruncStoreLegal(VT, StVT))
   26186       return SDValue();
   26187 
   26188     // From, To sizes and ElemCount must be pow of two
   26189     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
   26190     // We are going to use the original vector elt for storing.
   26191     // Accumulated smaller vector elements must be a multiple of the store size.
   26192     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
   26193 
   26194     unsigned SizeRatio  = FromSz / ToSz;
   26195 
   26196     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   26197 
   26198     // Create a type on which we perform the shuffle
   26199     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   26200             StVT.getScalarType(), NumElems*SizeRatio);
   26201 
   26202     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   26203 
   26204     SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
   26205     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   26206     for (unsigned i = 0; i != NumElems; ++i)
   26207       ShuffleVec[i] = i * SizeRatio;
   26208 
   26209     // Can't shuffle using an illegal type.
   26210     if (!TLI.isTypeLegal(WideVecVT))
   26211       return SDValue();
   26212 
   26213     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   26214                                          DAG.getUNDEF(WideVecVT),
   26215                                          &ShuffleVec[0]);
   26216     // At this point all of the data is stored at the bottom of the
   26217     // register. We now need to save it to mem.
   26218 
   26219     // Find the largest store unit
   26220     MVT StoreType = MVT::i8;
   26221     for (MVT Tp : MVT::integer_valuetypes()) {
   26222       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
   26223         StoreType = Tp;
   26224     }
   26225 
    26226     // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
   26227     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
   26228         (64 <= NumElems * ToSz))
   26229       StoreType = MVT::f64;
   26230 
   26231     // Bitcast the original vector into a vector of store-size units
   26232     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
   26233             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
   26234     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
   26235     SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
   26236     SmallVector<SDValue, 8> Chains;
   26237     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl,
   26238                                         TLI.getPointerTy(DAG.getDataLayout()));
   26239     SDValue Ptr = St->getBasePtr();
   26240 
   26241     // Perform one or more big stores into memory.
   26242     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
   26243       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   26244                                    StoreType, ShuffWide,
   26245                                    DAG.getIntPtrConstant(i, dl));
   26246       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
   26247                                 St->getPointerInfo(), St->isVolatile(),
   26248                                 St->isNonTemporal(), St->getAlignment());
   26249       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   26250       Chains.push_back(Ch);
   26251     }
   26252 
   26253     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   26254   }
   26255 
   26256   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   26257   // the FP state in cases where an emms may be missing.
   26258   // A preferable solution to the general problem is to figure out the right
   26259   // places to insert EMMS.  This qualifies as a quick hack.
   26260 
   26261   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
   26262   if (VT.getSizeInBits() != 64)
   26263     return SDValue();
   26264 
   26265   const Function *F = DAG.getMachineFunction().getFunction();
   26266   bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
   26267   bool F64IsLegal =
   26268       !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2();
   26269   if ((VT.isVector() ||
   26270        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
   26271       isa<LoadSDNode>(St->getValue()) &&
   26272       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
   26273       St->getChain().hasOneUse() && !St->isVolatile()) {
   26274     SDNode* LdVal = St->getValue().getNode();
   26275     LoadSDNode *Ld = nullptr;
   26276     int TokenFactorIndex = -1;
   26277     SmallVector<SDValue, 8> Ops;
   26278     SDNode* ChainVal = St->getChain().getNode();
   26279     // Must be a store of a load.  We currently handle two cases:  the load
    26280     // is a direct child, or it's under an intervening TokenFactor.  It is
   26281     // possible to dig deeper under nested TokenFactors.
   26282     if (ChainVal == LdVal)
   26283       Ld = cast<LoadSDNode>(St->getChain());
   26284     else if (St->getValue().hasOneUse() &&
   26285              ChainVal->getOpcode() == ISD::TokenFactor) {
   26286       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
   26287         if (ChainVal->getOperand(i).getNode() == LdVal) {
   26288           TokenFactorIndex = i;
   26289           Ld = cast<LoadSDNode>(St->getValue());
   26290         } else
   26291           Ops.push_back(ChainVal->getOperand(i));
   26292       }
   26293     }
   26294 
   26295     if (!Ld || !ISD::isNormalLoad(Ld))
   26296       return SDValue();
   26297 
   26298     // If this is not the MMX case, i.e. we are just turning i64 load/store
   26299     // into f64 load/store, avoid the transformation if there are multiple
   26300     // uses of the loaded value.
   26301     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
   26302       return SDValue();
   26303 
   26304     SDLoc LdDL(Ld);
   26305     SDLoc StDL(N);
   26306     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
   26307     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
   26308     // pair instead.
   26309     if (Subtarget->is64Bit() || F64IsLegal) {
   26310       MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
   26311       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
   26312                                   Ld->getPointerInfo(), Ld->isVolatile(),
   26313                                   Ld->isNonTemporal(), Ld->isInvariant(),
   26314                                   Ld->getAlignment());
   26315       SDValue NewChain = NewLd.getValue(1);
   26316       if (TokenFactorIndex != -1) {
   26317         Ops.push_back(NewChain);
   26318         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
   26319       }
   26320       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
   26321                           St->getPointerInfo(),
   26322                           St->isVolatile(), St->isNonTemporal(),
   26323                           St->getAlignment());
   26324     }
   26325 
   26326     // Otherwise, lower to two pairs of 32-bit loads / stores.
   26327     SDValue LoAddr = Ld->getBasePtr();
   26328     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
   26329                                  DAG.getConstant(4, LdDL, MVT::i32));
   26330 
   26331     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
   26332                                Ld->getPointerInfo(),
   26333                                Ld->isVolatile(), Ld->isNonTemporal(),
   26334                                Ld->isInvariant(), Ld->getAlignment());
   26335     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
   26336                                Ld->getPointerInfo().getWithOffset(4),
   26337                                Ld->isVolatile(), Ld->isNonTemporal(),
   26338                                Ld->isInvariant(),
   26339                                MinAlign(Ld->getAlignment(), 4));
   26340 
   26341     SDValue NewChain = LoLd.getValue(1);
   26342     if (TokenFactorIndex != -1) {
   26343       Ops.push_back(LoLd);
   26344       Ops.push_back(HiLd);
   26345       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
   26346     }
   26347 
   26348     LoAddr = St->getBasePtr();
   26349     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
   26350                          DAG.getConstant(4, StDL, MVT::i32));
   26351 
   26352     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
   26353                                 St->getPointerInfo(),
   26354                                 St->isVolatile(), St->isNonTemporal(),
   26355                                 St->getAlignment());
   26356     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
   26357                                 St->getPointerInfo().getWithOffset(4),
   26358                                 St->isVolatile(),
   26359                                 St->isNonTemporal(),
   26360                                 MinAlign(St->getAlignment(), 4));
   26361     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   26362   }
   26363 
   26364   // This is similar to the above case, but here we handle a scalar 64-bit
   26365   // integer store that is extracted from a vector on a 32-bit target.
   26366   // If we have SSE2, then we can treat it like a floating-point double
   26367   // to get past legalization. The execution dependencies fixup pass will
   26368   // choose the optimal machine instruction for the store if this really is
   26369   // an integer or v2f32 rather than an f64.
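           // For example:
           //   (store (i64 extract_vector_elt (v2i64 X), C), Ptr)
           // is rewritten as
           //   (store (f64 extract_vector_elt (v2f64 bitcast X), C), Ptr)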
   26370   if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() &&
   26371       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   26372     SDValue OldExtract = St->getOperand(1);
   26373     SDValue ExtOp0 = OldExtract.getOperand(0);
   26374     unsigned VecSize = ExtOp0.getValueSizeInBits();
   26375     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
   26376     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
   26377     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   26378                                      BitCast, OldExtract.getOperand(1));
   26379     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
   26380                         St->getPointerInfo(), St->isVolatile(),
   26381                         St->isNonTemporal(), St->getAlignment());
   26382   }
   26383 
   26384   return SDValue();
   26385 }
   26386 
   26387 /// Return 'true' if this vector operation is "horizontal"
   26388 /// and return the operands for the horizontal operation in LHS and RHS.  A
   26389 /// horizontal operation performs the binary operation on successive elements
   26390 /// of its first operand, then on successive elements of its second operand,
   26391 /// returning the resulting values in a vector.  For example, if
   26392 ///   A = < float a0, float a1, float a2, float a3 >
   26393 /// and
   26394 ///   B = < float b0, float b1, float b2, float b3 >
   26395 /// then the result of doing a horizontal operation on A and B is
   26396 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
   26397 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
   26398 /// A horizontal-op B, for some already available A and B, and if so then LHS is
   26399 /// set to A, RHS to B, and the routine returns 'true'.
   26400 /// Note that the binary operation should have the property that if one of the
   26401 /// operands is UNDEF then the result is UNDEF.
   26402 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   26403   // Look for the following pattern: if
   26404   //   A = < float a0, float a1, float a2, float a3 >
   26405   //   B = < float b0, float b1, float b2, float b3 >
   26406   // and
   26407   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
   26408   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
   26409   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
   26410   // which is A horizontal-op B.
   26411 
   26412   // At least one of the operands should be a vector shuffle.
   26413   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
   26414       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
   26415     return false;
   26416 
   26417   MVT VT = LHS.getSimpleValueType();
   26418 
   26419   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   26420          "Unsupported vector type for horizontal add/sub");
   26421 
   26422   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
   26423   // operate independently on 128-bit lanes.
   26424   unsigned NumElts = VT.getVectorNumElements();
   26425   unsigned NumLanes = VT.getSizeInBits()/128;
   26426   unsigned NumLaneElts = NumElts / NumLanes;
   26427   assert((NumLaneElts % 2 == 0) &&
   26428          "Vector type should have an even number of elements in each lane");
   26429   unsigned HalfLaneElts = NumLaneElts/2;
   26430 
   26431   // View LHS in the form
   26432   //   LHS = VECTOR_SHUFFLE A, B, LMask
   26433   // If LHS is not a shuffle then pretend it is the shuffle
   26434   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
   26435   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
   26436   // type VT.
   26437   SDValue A, B;
   26438   SmallVector<int, 16> LMask(NumElts);
   26439   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   26440     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
   26441       A = LHS.getOperand(0);
   26442     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
   26443       B = LHS.getOperand(1);
   26444     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
   26445     std::copy(Mask.begin(), Mask.end(), LMask.begin());
   26446   } else {
   26447     if (LHS.getOpcode() != ISD::UNDEF)
   26448       A = LHS;
   26449     for (unsigned i = 0; i != NumElts; ++i)
   26450       LMask[i] = i;
   26451   }
   26452 
   26453   // Likewise, view RHS in the form
   26454   //   RHS = VECTOR_SHUFFLE C, D, RMask
   26455   SDValue C, D;
   26456   SmallVector<int, 16> RMask(NumElts);
   26457   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   26458     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
   26459       C = RHS.getOperand(0);
   26460     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
   26461       D = RHS.getOperand(1);
   26462     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
   26463     std::copy(Mask.begin(), Mask.end(), RMask.begin());
   26464   } else {
   26465     if (RHS.getOpcode() != ISD::UNDEF)
   26466       C = RHS;
   26467     for (unsigned i = 0; i != NumElts; ++i)
   26468       RMask[i] = i;
   26469   }
   26470 
   26471   // Check that the shuffles are both shuffling the same vectors.
   26472   if (!(A == C && B == D) && !(A == D && B == C))
   26473     return false;
   26474 
   26475   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
   26476   if (!A.getNode() && !B.getNode())
   26477     return false;
   26478 
   26479   // If A and B occur in reverse order in RHS, then "swap" them (which means
   26480   // rewriting the mask).
   26481   if (A != C)
   26482     ShuffleVectorSDNode::commuteMask(RMask);
   26483 
   26484   // At this point LHS and RHS are equivalent to
   26485   //   LHS = VECTOR_SHUFFLE A, B, LMask
   26486   //   RHS = VECTOR_SHUFFLE A, B, RMask
   26487   // Check that the masks correspond to performing a horizontal operation.
   26488   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   26489     for (unsigned i = 0; i != NumLaneElts; ++i) {
   26490       int LIdx = LMask[i+l], RIdx = RMask[i+l];
   26491 
   26492       // Ignore any UNDEF components.
   26493       if (LIdx < 0 || RIdx < 0 ||
   26494           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
   26495           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
   26496         continue;
   26497 
   26498       // Check that successive elements are being operated on.  If not, this is
   26499       // not a horizontal operation.
   26500       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
   26501       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
   26502       if (!(LIdx == Index && RIdx == Index + 1) &&
   26503           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
   26504         return false;
   26505     }
   26506   }
   26507 
   26508   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   26509   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
   26510   return true;
   26511 }
   26512 
   26513 /// Do target-specific dag combines on floating point adds.
   26514 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
   26515                                   const X86Subtarget *Subtarget) {
   26516   EVT VT = N->getValueType(0);
   26517   SDValue LHS = N->getOperand(0);
   26518   SDValue RHS = N->getOperand(1);
   26519 
   26520   // Try to synthesize horizontal adds from adds of shuffles.
   26521   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   26522        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   26523       isHorizontalBinOp(LHS, RHS, true))
   26524     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
   26525   return SDValue();
   26526 }
   26527 
   26528 /// Do target-specific dag combines on floating point subs.
   26529 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
   26530                                   const X86Subtarget *Subtarget) {
   26531   EVT VT = N->getValueType(0);
   26532   SDValue LHS = N->getOperand(0);
   26533   SDValue RHS = N->getOperand(1);
   26534 
   26535   // Try to synthesize horizontal subs from subs of shuffles.
   26536   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   26537        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   26538       isHorizontalBinOp(LHS, RHS, false))
   26539     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
   26540   return SDValue();
   26541 }
   26542 
   26543 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
   26544 static SDValue
   26545 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
   26546                                   SmallVector<SDValue, 8> &Regs) {
   26547   assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
   26548                              Regs[0].getValueType() == MVT::v2i64));
   26549   EVT OutVT = N->getValueType(0);
   26550   EVT OutSVT = OutVT.getVectorElementType();
   26551   EVT InVT = Regs[0].getValueType();
   26552   EVT InSVT = InVT.getVectorElementType();
   26553   SDLoc DL(N);
   26554 
   26555   // First, use mask to unset all bits that won't appear in the result.
   26556   assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
   26557          "OutSVT can only be either i8 or i16.");
   26558   SDValue MaskVal =
   26559       DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT);
   26560   SDValue MaskVec = DAG.getNode(
   26561       ISD::BUILD_VECTOR, DL, InVT,
   26562       SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
   26563   for (auto &Reg : Regs)
   26564     Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
   26565 
   26566   MVT UnpackedVT, PackedVT;
   26567   if (OutSVT == MVT::i8) {
   26568     UnpackedVT = MVT::v8i16;
   26569     PackedVT = MVT::v16i8;
   26570   } else {
   26571     UnpackedVT = MVT::v4i32;
   26572     PackedVT = MVT::v8i16;
   26573   }
   26574 
   26575   // In each iteration, truncate the type by a half size.
   26576   auto RegNum = Regs.size();
   26577   for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
   26578        j < e; j *= 2, RegNum /= 2) {
   26579     for (unsigned i = 0; i < RegNum; i++)
   26580       Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
   26581     for (unsigned i = 0; i < RegNum / 2; i++)
   26582       Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
   26583                             Regs[i * 2 + 1]);
   26584   }
   26585 
   26586   // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
   26587   // then extract a subvector as the result since v8i8 is not a legal type.
   26588   if (OutVT == MVT::v8i8) {
   26589     Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
   26590     Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
   26591                           DAG.getIntPtrConstant(0, DL));
   26592     return Regs[0];
   26593   } else if (RegNum > 1) {
   26594     Regs.resize(RegNum);
   26595     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
   26596   } else
   26597     return Regs[0];
   26598 }
   26599 
   26600 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
   26601 static SDValue
   26602 combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
   26603                                   SmallVector<SDValue, 8> &Regs) {
   26604   assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
   26605   EVT OutVT = N->getValueType(0);
   26606   SDLoc DL(N);
   26607 
   26608   // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
   26609   SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
   26610   for (auto &Reg : Regs) {
   26611     Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
   26612     Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
   26613   }
   26614 
   26615   for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
   26616     Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
   26617                           Regs[i * 2 + 1]);
   26618 
   26619   if (Regs.size() > 2) {
   26620     Regs.resize(Regs.size() / 2);
   26621     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
   26622   } else
   26623     return Regs[0];
   26624 }
   26625 
   26626 /// This function transforms a truncation from vXi32/vXi64 to vXi8/vXi16 into
   26627 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
   26628 /// legalization the truncation will be translated into a BUILD_VECTOR whose
   26629 /// elements are individually extracted from a vector and then truncated, and
   26630 /// it is difficult to do this optimization based on that form.
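          /// For example, a v8i32 -> v8i16 truncation is split into two v4i32 halves
          /// that are packed back together, using PACKUS on SSE4.1 or PACKSS on plain
          /// SSE2 (with SSSE3 but no SSE4.1, the pshufb lowering is preferred instead).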
   26631 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
   26632                                        const X86Subtarget *Subtarget) {
   26633   EVT OutVT = N->getValueType(0);
   26634   if (!OutVT.isVector())
   26635     return SDValue();
   26636 
   26637   SDValue In = N->getOperand(0);
   26638   if (!In.getValueType().isSimple())
   26639     return SDValue();
   26640 
   26641   EVT InVT = In.getValueType();
   26642   unsigned NumElems = OutVT.getVectorNumElements();
   26643 
   26644   // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
   26645   // SSE2, and we would need to handle it specially.
   26646   // AVX512 provides vpmovdb.
   26647   if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
   26648     return SDValue();
   26649 
   26650   EVT OutSVT = OutVT.getVectorElementType();
   26651   EVT InSVT = InVT.getVectorElementType();
   26652   if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
   26653         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
   26654         NumElems >= 8))
   26655     return SDValue();
   26656 
   26657   // SSSE3's pshufb results in less instructions in the cases below.
   26658   if (Subtarget->hasSSSE3() && NumElems == 8 &&
   26659       ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
   26660        (InSVT == MVT::i32 && OutSVT == MVT::i16)))
   26661     return SDValue();
   26662 
   26663   SDLoc DL(N);
   26664 
   26665   // Split a long vector into vectors of legal type.
   26666   unsigned RegNum = InVT.getSizeInBits() / 128;
   26667   SmallVector<SDValue, 8> SubVec(RegNum);
   26668   if (InSVT == MVT::i32) {
   26669     for (unsigned i = 0; i < RegNum; i++)
   26670       SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   26671                               DAG.getIntPtrConstant(i * 4, DL));
   26672   } else {
   26673     for (unsigned i = 0; i < RegNum; i++)
   26674       SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   26675                               DAG.getIntPtrConstant(i * 2, DL));
   26676   }
   26677 
   26678   // SSE2 provides PACKUS only for 2 x v8i16 -> v16i8, and SSE4.1 provides PACKUS
   26679   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
   26680   // truncate 2 x v4i32 to v8i16.
   26681   if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
   26682     return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
   26683   else if (InSVT == MVT::i32)
   26684     return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
   26685   else
   26686     return SDValue();
   26687 }
   26688 
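          /// Combine TRUNCATE: first try to recognize an averaging (AVG) pattern,
          /// then fall back to lowering wide vector truncations through
          /// PACKSS/PACKUS (see combineVectorTruncation above).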
   26689 static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
   26690                                       const X86Subtarget *Subtarget) {
   26691   // Try to detect AVG pattern first.
   26692   SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
   26693                                  Subtarget, SDLoc(N));
   26694   if (Avg.getNode())
   26695     return Avg;
   26696 
   26697   return combineVectorTruncation(N, DAG, Subtarget);
   26698 }
   26699 
   26700 /// Do target-specific dag combines on floating point negations.
   26701 static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
   26702                                   const X86Subtarget *Subtarget) {
   26703   EVT VT = N->getValueType(0);
   26704   EVT SVT = VT.getScalarType();
   26705   SDValue Arg = N->getOperand(0);
   26706   SDLoc DL(N);
   26707 
   26708   // Let legalize expand this if it isn't a legal type yet.
   26709   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   26710     return SDValue();
   26711 
   26712   // If we're negating a FMUL node on a target with FMA, then we can avoid the
   26713   // use of a constant by performing (-0 - A*B) instead.
   26714   // FIXME: Check rounding control flags as well once it becomes available.
   26715   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
   26716       Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
   26717     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
   26718     return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
   26719                        Arg.getOperand(1), Zero);
   26720   }
   26721 
   26722   // If we're negating a FMA node, then we can adjust the
   26723   // instruction to include the extra negation.
   26724   if (Arg.hasOneUse()) {
   26725     switch (Arg.getOpcode()) {
   26726     case X86ISD::FMADD:
   26727       return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
   26728                          Arg.getOperand(1), Arg.getOperand(2));
   26729     case X86ISD::FMSUB:
   26730       return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
   26731                          Arg.getOperand(1), Arg.getOperand(2));
   26732     case X86ISD::FNMADD:
   26733       return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
   26734                          Arg.getOperand(1), Arg.getOperand(2));
   26735     case X86ISD::FNMSUB:
   26736       return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
   26737                          Arg.getOperand(1), Arg.getOperand(2));
   26738     }
   26739   }
   26740   return SDValue();
   26741 }
   26742 
   26743 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
   26744                               const X86Subtarget *Subtarget) {
   26745   EVT VT = N->getValueType(0);
   26746   if (VT.is512BitVector() && !Subtarget->hasDQI()) {
   26747     // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention.
   26748     // These logic operations may be executed in the integer domain.
   26749     SDLoc dl(N);
   26750     MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
   26751     MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
   26752 
   26753     SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
   26754     SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
   26755     unsigned IntOpcode = 0;
   26756     switch (N->getOpcode()) {
   26757       default: llvm_unreachable("Unexpected FP logic op");
   26758       case X86ISD::FOR: IntOpcode = ISD::OR; break;
   26759       case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
   26760       case X86ISD::FAND: IntOpcode = ISD::AND; break;
   26761       case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
   26762     }
   26763     SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
   26764     return  DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
   26765   }
   26766   return SDValue();
   26767 }
   26768 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
   26769 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
   26770                                  const X86Subtarget *Subtarget) {
   26771   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
   26772 
   26773   // F[X]OR(0.0, x) -> x
   26774   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   26775     if (C->getValueAPF().isPosZero())
   26776       return N->getOperand(1);
   26777 
   26778   // F[X]OR(x, 0.0) -> x
   26779   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   26780     if (C->getValueAPF().isPosZero())
   26781       return N->getOperand(0);
   26782 
   26783   return lowerX86FPLogicOp(N, DAG, Subtarget);
   26784 }
   26785 
   26786 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
   26787 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
   26788   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
   26789 
   26790   // Only perform optimizations if UnsafeMath is used.
   26791   if (!DAG.getTarget().Options.UnsafeFPMath)
   26792     return SDValue();
   26793 
   26794   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
   26795   // into FMINC and FMAXC, which are Commutative operations.
   26796   unsigned NewOp = 0;
   26797   switch (N->getOpcode()) {
   26798     default: llvm_unreachable("unknown opcode");
   26799     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
   26800     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
   26801   }
   26802 
   26803   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
   26804                      N->getOperand(0), N->getOperand(1));
   26805 }
   26806 
   26807 static SDValue performFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
   26808                                      const X86Subtarget *Subtarget) {
   26809   // This takes at least 3 instructions, so favor a library call when
   26810   // minimizing code size.
   26811   if (DAG.getMachineFunction().getFunction()->optForMinSize())
   26812     return SDValue();
   26813 
   26814   EVT VT = N->getValueType(0);
   26815 
   26816   // TODO: Check for global or instruction-level "nnan". In that case, we
   26817   //       should be able to lower to FMAX/FMIN alone.
   26818   // TODO: If an operand is already known to be a NaN or not a NaN, this
   26819   //       should be an optional swap and FMAX/FMIN.
   26820   // TODO: Allow f64, vectors, and fminnum.
   26821 
   26822   if (VT != MVT::f32 || !Subtarget->hasSSE1() || Subtarget->useSoftFloat())
   26823     return SDValue();
   26824 
   26825   SDValue Op0 = N->getOperand(0);
   26826   SDValue Op1 = N->getOperand(1);
   26827   SDLoc DL(N);
   26828   EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
   26829       DAG.getDataLayout(), *DAG.getContext(), VT);
   26830 
   26831   // There are 4 possibilities involving NaN inputs, and these are the required
   26832   // outputs:
   26833   //                   Op1
   26834   //               Num     NaN
   26835   //            ----------------
   26836   //       Num  |  Max  |  Op0 |
   26837   // Op0        ----------------
   26838   //       NaN  |  Op1  |  NaN |
   26839   //            ----------------
   26840   //
   26841   // The SSE FP max/min instructions were not designed for this case, but rather
   26842   // to implement:
   26843   //   Max = Op1 > Op0 ? Op1 : Op0
   26844   //
   26845   // So they always return Op0 if either input is a NaN. However, we can still
   26846   // use those instructions for fmaxnum by selecting away a NaN input.
   26847 
   26848   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
   26849   SDValue Max = DAG.getNode(X86ISD::FMAX, DL, VT, Op1, Op0);
   26850   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
   26851 
   26852   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
   26853   // are NaN, the NaN value of Op1 is the result.
   26854   return DAG.getNode(ISD::SELECT, DL, VT, IsOp0Nan, Op1, Max);
   26855 }
   26856 
   26857 /// Do target-specific dag combines on X86ISD::FAND nodes.
   26858 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG,
   26859                                   const X86Subtarget *Subtarget) {
   26860   // FAND(0.0, x) -> 0.0
   26861   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   26862     if (C->getValueAPF().isPosZero())
   26863       return N->getOperand(0);
   26864 
   26865   // FAND(x, 0.0) -> 0.0
   26866   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   26867     if (C->getValueAPF().isPosZero())
   26868       return N->getOperand(1);
   26869 
   26870   return lowerX86FPLogicOp(N, DAG, Subtarget);
   26871 }
   26872 
   26873 /// Do target-specific dag combines on X86ISD::FANDN nodes
   26874 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG,
   26875                                    const X86Subtarget *Subtarget) {
   26876   // FANDN(0.0, x) -> x
   26877   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   26878     if (C->getValueAPF().isPosZero())
   26879       return N->getOperand(1);
   26880 
   26881   // FANDN(x, 0.0) -> 0.0
   26882   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   26883     if (C->getValueAPF().isPosZero())
   26884       return N->getOperand(1);
   26885 
   26886   return lowerX86FPLogicOp(N, DAG, Subtarget);
   26887 }
   26888 
   26889 static SDValue PerformBTCombine(SDNode *N,
   26890                                 SelectionDAG &DAG,
   26891                                 TargetLowering::DAGCombinerInfo &DCI) {
   26892   // BT ignores high bits in the bit index operand.
   26893   SDValue Op1 = N->getOperand(1);
   26894   if (Op1.hasOneUse()) {
   26895     unsigned BitWidth = Op1.getValueSizeInBits();
   26896     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
   26897     APInt KnownZero, KnownOne;
   26898     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   26899                                           !DCI.isBeforeLegalizeOps());
   26900     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   26901     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
   26902         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
   26903       DCI.CommitTargetLoweringOpt(TLO);
   26904   }
   26905   return SDValue();
   26906 }
   26907 
   26908 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
   26909   SDValue Op = N->getOperand(0);
   26910   if (Op.getOpcode() == ISD::BITCAST)
   26911     Op = Op.getOperand(0);
   26912   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
   26913   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
   26914       VT.getVectorElementType().getSizeInBits() ==
   26915       OpVT.getVectorElementType().getSizeInBits()) {
   26916     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
   26917   }
   26918   return SDValue();
   26919 }
   26920 
   26921 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
   26922                                                const X86Subtarget *Subtarget) {
   26923   EVT VT = N->getValueType(0);
   26924   if (!VT.isVector())
   26925     return SDValue();
   26926 
   26927   SDValue N0 = N->getOperand(0);
   26928   SDValue N1 = N->getOperand(1);
   26929   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
   26930   SDLoc dl(N);
   26931 
   26932   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
   26933   // both SSE and AVX2 since there is no sign-extended shift right
   26934   // operation on a vector with 64-bit elements.
   26935   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
   26936   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
   26937   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
   26938       N0.getOpcode() == ISD::SIGN_EXTEND)) {
   26939     SDValue N00 = N0.getOperand(0);
   26940 
   26941     // EXTLOAD has a better solution on AVX2,
   26942     // it may be replaced with X86ISD::VSEXT node.
   26943     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
   26944       if (!ISD::isNormalLoad(N00.getNode()))
   26945         return SDValue();
   26946 
   26947     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
   26948       SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
   26949                                 N00, N1);
   26950       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
   26951     }
   26952   }
   26953   return SDValue();
   26954 }
   26955 
   26956 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
   26957 /// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
   26958 /// to combine math ops, use an LEA, or use a complex addressing mode. This can
   26959 /// eliminate extend, add, and shift instructions.
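          /// For example, (i64 sext (add nsw (i32 x), 42)) becomes
          /// (i64 add nsw (sext x), 42), which can then fold into an LEA or a
          /// complex addressing mode together with a later add or shift.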
   26960 static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
   26961                                        const X86Subtarget *Subtarget) {
   26962   // TODO: This should be valid for other integer types.
   26963   EVT VT = Sext->getValueType(0);
   26964   if (VT != MVT::i64)
   26965     return SDValue();
   26966 
   26967   // We need an 'add nsw' feeding into the 'sext'.
   26968   SDValue Add = Sext->getOperand(0);
   26969   if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
   26970     return SDValue();
   26971 
   26972   // Having a constant operand to the 'add' ensures that we are not increasing
   26973   // the instruction count because the constant is extended for free below.
   26974   // A constant operand can also become the displacement field of an LEA.
   26975   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
   26976   if (!AddOp1)
   26977     return SDValue();
   26978 
   26979   // Don't make the 'add' bigger if there's no hope of combining it with some
   26980   // other 'add' or 'shl' instruction.
   26981   // TODO: It may be profitable to generate simpler LEA instructions in place
   26982   // of single 'add' instructions, but the cost model for selecting an LEA
   26983   // currently has a high threshold.
   26984   bool HasLEAPotential = false;
   26985   for (auto *User : Sext->uses()) {
   26986     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
   26987       HasLEAPotential = true;
   26988       break;
   26989     }
   26990   }
   26991   if (!HasLEAPotential)
   26992     return SDValue();
   26993 
   26994   // Everything looks good, so pull the 'sext' ahead of the 'add'.
   26995   int64_t AddConstant = AddOp1->getSExtValue();
   26996   SDValue AddOp0 = Add.getOperand(0);
   26997   SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
   26998   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
   26999 
   27000   // The wider add is guaranteed to not wrap because both operands are
   27001   // sign-extended.
   27002   SDNodeFlags Flags;
   27003   Flags.setNoSignedWrap(true);
   27004   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
   27005 }
   27006 
   27007 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
   27008                                   TargetLowering::DAGCombinerInfo &DCI,
   27009                                   const X86Subtarget *Subtarget) {
   27010   SDValue N0 = N->getOperand(0);
   27011   EVT VT = N->getValueType(0);
   27012   EVT SVT = VT.getScalarType();
   27013   EVT InVT = N0.getValueType();
   27014   EVT InSVT = InVT.getScalarType();
   27015   SDLoc DL(N);
   27016 
   27017   // (i8,i32 sext (sdivrem (i8 x, i8 y))) ->
   27018   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)))
   27019   // This exposes the sext to the sdivrem lowering, so that it directly extends
   27020   // from AH (which we otherwise need to do contortions to access).
   27021   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
   27022       InVT == MVT::i8 && VT == MVT::i32) {
   27023     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
   27024     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys,
   27025                             N0.getOperand(0), N0.getOperand(1));
   27026     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
   27027     return R.getValue(1);
   27028   }
   27029 
   27030   if (!DCI.isBeforeLegalizeOps()) {
   27031     if (InVT == MVT::i1) {
   27032       SDValue Zero = DAG.getConstant(0, DL, VT);
   27033       SDValue AllOnes =
   27034         DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
   27035       return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
   27036     }
   27037     return SDValue();
   27038   }
   27039 
   27040   if (VT.isVector() && Subtarget->hasSSE2()) {
   27041     auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) {
   27042       EVT InVT = N.getValueType();
   27043       EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
   27044                                    Size / InVT.getScalarSizeInBits());
   27045       SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
   27046                                     DAG.getUNDEF(InVT));
   27047       Opnds[0] = N;
   27048       return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
   27049     };
   27050 
   27051     // If target-size is less than 128-bits, extend to a type that would extend
   27052     // to 128 bits, extend that and extract the original target vector.
   27053     if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) &&
   27054         (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
   27055         (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
   27056       unsigned Scale = 128 / VT.getSizeInBits();
   27057       EVT ExVT =
   27058           EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
   27059       SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
   27060       SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex);
   27061       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
   27062                          DAG.getIntPtrConstant(0, DL));
   27063     }
   27064 
   27065     // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG
   27066     // which ensures lowering to X86ISD::VSEXT (pmovsx*).
   27067     if (VT.getSizeInBits() == 128 &&
   27068         (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
   27069         (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
   27070       SDValue ExOp = ExtendVecSize(DL, N0, 128);
   27071       return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
   27072     }
   27073 
   27074     // On pre-AVX2 targets, split into 128-bit nodes of
   27075     // ISD::SIGN_EXTEND_VECTOR_INREG.
   27076     if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) &&
   27077         (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
   27078         (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
   27079       unsigned NumVecs = VT.getSizeInBits() / 128;
   27080       unsigned NumSubElts = 128 / SVT.getSizeInBits();
   27081       EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
   27082       EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
   27083 
   27084       SmallVector<SDValue, 8> Opnds;
   27085       for (unsigned i = 0, Offset = 0; i != NumVecs;
   27086            ++i, Offset += NumSubElts) {
   27087         SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
   27088                                      DAG.getIntPtrConstant(Offset, DL));
   27089         SrcVec = ExtendVecSize(DL, SrcVec, 128);
   27090         SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
   27091         Opnds.push_back(SrcVec);
   27092       }
   27093       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
   27094     }
   27095   }
   27096 
   27097   if (Subtarget->hasAVX() && VT.is256BitVector())
   27098     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
   27099       return R;
   27100 
   27101   if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
   27102     return NewAdd;
   27103 
   27104   return SDValue();
   27105 }
   27106 
   27107 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
   27108                                  const X86Subtarget* Subtarget) {
   27109   SDLoc dl(N);
   27110   EVT VT = N->getValueType(0);
   27111 
   27112   // Let legalize expand this if it isn't a legal type yet.
   27113   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   27114     return SDValue();
   27115 
   27116   EVT ScalarVT = VT.getScalarType();
   27117   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA())
   27118     return SDValue();
   27119 
   27120   SDValue A = N->getOperand(0);
   27121   SDValue B = N->getOperand(1);
   27122   SDValue C = N->getOperand(2);
   27123 
   27124   bool NegA = (A.getOpcode() == ISD::FNEG);
   27125   bool NegB = (B.getOpcode() == ISD::FNEG);
   27126   bool NegC = (C.getOpcode() == ISD::FNEG);
   27127 
   27128   // The multiplication is negated when exactly one of NegA/NegB is set.
   27129   bool NegMul = (NegA != NegB);
   27130   if (NegA)
   27131     A = A.getOperand(0);
   27132   if (NegB)
   27133     B = B.getOperand(0);
   27134   if (NegC)
   27135     C = C.getOperand(0);
   27136 
   27137   unsigned Opcode;
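           // Pick among FMADD/FMSUB/FNMADD/FNMSUB: a negated product selects the
           // "N" forms, and a negated addend turns ADD into SUB.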
   27138   if (!NegMul)
   27139     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
   27140   else
   27141     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
   27142 
   27143   return DAG.getNode(Opcode, dl, VT, A, B, C);
   27144 }
   27145 
   27146 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
   27147                                   TargetLowering::DAGCombinerInfo &DCI,
   27148                                   const X86Subtarget *Subtarget) {
   27149   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
   27150   //           (and (i32 x86isd::setcc_carry), 1)
   27151   // This eliminates the zext. This transformation is necessary because
   27152   // ISD::SETCC is always legalized to i8.
   27153   SDLoc dl(N);
   27154   SDValue N0 = N->getOperand(0);
   27155   EVT VT = N->getValueType(0);
   27156 
   27157   if (N0.getOpcode() == ISD::AND &&
   27158       N0.hasOneUse() &&
   27159       N0.getOperand(0).hasOneUse()) {
   27160     SDValue N00 = N0.getOperand(0);
   27161     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   27162       if (!isOneConstant(N0.getOperand(1)))
   27163         return SDValue();
   27164       return DAG.getNode(ISD::AND, dl, VT,
   27165                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   27166                                      N00.getOperand(0), N00.getOperand(1)),
   27167                          DAG.getConstant(1, dl, VT));
   27168     }
   27169   }
   27170 
   27171   if (N0.getOpcode() == ISD::TRUNCATE &&
   27172       N0.hasOneUse() &&
   27173       N0.getOperand(0).hasOneUse()) {
   27174     SDValue N00 = N0.getOperand(0);
   27175     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   27176       return DAG.getNode(ISD::AND, dl, VT,
   27177                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   27178                                      N00.getOperand(0), N00.getOperand(1)),
   27179                          DAG.getConstant(1, dl, VT));
   27180     }
   27181   }
   27182 
   27183   if (VT.is256BitVector())
   27184     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
   27185       return R;
   27186 
   27187   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
   27188   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
   27189   // This exposes the zext to the udivrem lowering, so that it directly extends
   27190   // from AH (which we otherwise need to do contortions to access).
   27191   if (N0.getOpcode() == ISD::UDIVREM &&
   27192       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
   27193       (VT == MVT::i32 || VT == MVT::i64)) {
   27194     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
   27195     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
   27196                             N0.getOperand(0), N0.getOperand(1));
   27197     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
   27198     return R.getValue(1);
   27199   }
   27200 
   27201   return SDValue();
   27202 }
   27203 
   27204 // Optimize x == -y --> x+y == 0
   27205 //          x != -y --> x+y != 0
   27206 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   27207                                       const X86Subtarget* Subtarget) {
   27208   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
   27209   SDValue LHS = N->getOperand(0);
   27210   SDValue RHS = N->getOperand(1);
   27211   EVT VT = N->getValueType(0);
   27212   SDLoc DL(N);
   27213 
   27214   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
   27215     if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
   27216       SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
   27217                                  LHS.getOperand(1));
   27218       return DAG.getSetCC(DL, N->getValueType(0), addV,
   27219                           DAG.getConstant(0, DL, addV.getValueType()), CC);
   27220     }
   27221   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
   27222     if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
   27223       SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
   27224                                  RHS.getOperand(1));
   27225       return DAG.getSetCC(DL, N->getValueType(0), addV,
   27226                           DAG.getConstant(0, DL, addV.getValueType()), CC);
   27227     }
   27228 
   27229   if (VT.getScalarType() == MVT::i1 &&
   27230       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
   27231     bool IsSEXT0 =
   27232         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
   27233         (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
   27234     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
   27235 
   27236     if (!IsSEXT0 || !IsVZero1) {
   27237       // Swap the operands and update the condition code.
   27238       std::swap(LHS, RHS);
   27239       CC = ISD::getSetCCSwappedOperands(CC);
   27240 
   27241       IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
   27242                 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
   27243       IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
   27244     }
   27245 
   27246     if (IsSEXT0 && IsVZero1) {
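             // With LHS = sext(x) from an i1 vector (each lane is 0 or -1) and
             // RHS = 0, the signed comparisons below fold to a constant, to x
             // itself, or to its complement.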
   27247       assert(VT == LHS.getOperand(0).getValueType() &&
   27248              "Unexpected operand type");
   27249       if (CC == ISD::SETGT)
   27250         return DAG.getConstant(0, DL, VT);
   27251       if (CC == ISD::SETLE)
   27252         return DAG.getConstant(1, DL, VT);
   27253       if (CC == ISD::SETEQ || CC == ISD::SETGE)
   27254         return DAG.getNOT(DL, LHS.getOperand(0), VT);
   27255 
   27256       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
   27257              "Unexpected condition code!");
   27258       return LHS.getOperand(0);
   27259     }
   27260   }
   27261 
   27262   return SDValue();
   27263 }
   27264 
   27265 static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
   27266   SDValue V0 = N->getOperand(0);
   27267   SDValue V1 = N->getOperand(1);
   27268   SDLoc DL(N);
   27269   EVT VT = N->getValueType(0);
   27270 
   27271   // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
   27272   // operands and changing the mask to 1. This saves us a bunch of
   27273   // pattern-matching possibilities related to scalar math ops in SSE/AVX.
   27274   // x86InstrInfo knows how to commute this back after instruction selection
   27275   // if it would help register allocation.
   27276 
   27277   // TODO: If optimizing for size or a processor that doesn't suffer from
   27278   // partial register update stalls, this should be transformed into a MOVSD
   27279   // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
   27280 
   27281   if (VT == MVT::v2f64)
   27282     if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
   27283       if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
   27284         SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
   27285         return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
   27286       }
   27287 
   27288   return SDValue();
   27289 }
   27290 
   27291 static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
   27292   SDLoc DL(N);
   27293   // Gather and Scatter instructions use k-registers for masks. The type of
   27294   // the masks is v*i1. So the mask will be truncated anyway.
   27295   // The SIGN_EXTEND_INREG my be dropped.
   27296   SDValue Mask = N->getOperand(2);
   27297   if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
   27298     SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
   27299     NewOps[2] = Mask.getOperand(0);
   27300     DAG.UpdateNodeOperands(N, NewOps);
   27301   }
   27302   return SDValue();
   27303 }
   27304 
   27305 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
   27306 // as "sbb reg,reg", since it can be extended without zext and produces
   27307 // an all-ones bit which is more useful than 0/1 in some cases.
   27308 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
   27309                                MVT VT) {
   27310   if (VT == MVT::i8)
   27311     return DAG.getNode(ISD::AND, DL, VT,
   27312                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
   27313                                    DAG.getConstant(X86::COND_B, DL, MVT::i8),
   27314                                    EFLAGS),
   27315                        DAG.getConstant(1, DL, VT));
   27316   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
   27317   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
   27318                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
   27319                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
   27320                                  EFLAGS));
   27321 }
   27322 
   27323 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
   27324 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
   27325                                    TargetLowering::DAGCombinerInfo &DCI,
   27326                                    const X86Subtarget *Subtarget) {
   27327   SDLoc DL(N);
   27328   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
   27329   SDValue EFLAGS = N->getOperand(1);
   27330 
   27331   if (CC == X86::COND_A) {
   27332     // Try to convert COND_A into COND_B in an attempt to facilitate
   27333     // materializing "setb reg".
   27334     //
   27335     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
   27336     // cannot take an immediate as its first operand.
   27337     //
   27338     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
   27339         EFLAGS.getValueType().isInteger() &&
   27340         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
   27341       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
   27342                                    EFLAGS.getNode()->getVTList(),
   27343                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
   27344       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
   27345       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
   27346     }
   27347   }
   27348 
   27349   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
   27350   // a zext and produces an all-ones bit which is more useful than 0/1 in some
   27351   // cases.
   27352   if (CC == X86::COND_B)
   27353     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
   27354 
   27355   if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
   27356     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
   27357     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
   27358   }
   27359 
   27360   return SDValue();
   27361 }
   27362 
   27363 // Optimize branch condition evaluation.
   27364 //
   27365 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
   27366                                     TargetLowering::DAGCombinerInfo &DCI,
   27367                                     const X86Subtarget *Subtarget) {
   27368   SDLoc DL(N);
   27369   SDValue Chain = N->getOperand(0);
   27370   SDValue Dest = N->getOperand(1);
   27371   SDValue EFLAGS = N->getOperand(3);
   27372   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
   27373 
   27374   if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
   27375     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
   27376     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
   27377                        Flags);
   27378   }
   27379 
   27380   return SDValue();
   27381 }
   27382 
   27383 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   27384                                                          SelectionDAG &DAG) {
   27385   // Take advantage of vector comparisons producing 0 or -1 in each lane to
   27386   // optimize away operation when it's from a constant.
   27387   //
   27388   // The general transformation is:
   27389   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
   27390   //       AND(VECTOR_CMP(x,y), constant2)
   27391   //    constant2 = UNARYOP(constant)
   27392 
   27393   // Early exit if this isn't a vector operation, the operand of the
   27394   // unary operation isn't a bitwise AND, or if the sizes of the operations
   27395   // aren't the same.
   27396   EVT VT = N->getValueType(0);
   27397   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
   27398       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
   27399       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
   27400     return SDValue();
   27401 
   27402   // Now check that the other operand of the AND is a constant. We could
   27403   // make the transformation for non-constant splats as well, but it's unclear
   27404   // that would be a benefit as it would not eliminate any operations, just
   27405   // perform one more step in scalar code before moving to the vector unit.
   27406   if (BuildVectorSDNode *BV =
   27407           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
   27408     // Bail out if the vector isn't a constant.
   27409     if (!BV->isConstant())
   27410       return SDValue();
   27411 
   27412     // Everything checks out. Build up the new and improved node.
   27413     SDLoc DL(N);
   27414     EVT IntVT = BV->getValueType(0);
   27415     // Create a new constant of the appropriate type for the transformed
   27416     // DAG.
   27417     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
   27418     // The AND node needs bitcasts to/from an integer vector type around it.
   27419     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
   27420     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
   27421                                  N->getOperand(0)->getOperand(0), MaskConst);
   27422     SDValue Res = DAG.getBitcast(VT, NewAnd);
   27423     return Res;
   27424   }
   27425 
   27426   return SDValue();
   27427 }
   27428 
   27429 static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
   27430                                         const X86Subtarget *Subtarget) {
   27431   SDValue Op0 = N->getOperand(0);
   27432   EVT VT = N->getValueType(0);
   27433   EVT InVT = Op0.getValueType();
   27434   EVT InSVT = InVT.getScalarType();
   27435   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   27436 
   27437   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
   27438   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
   27439   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
   27440     SDLoc dl(N);
   27441     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   27442                                  InVT.getVectorNumElements());
   27443     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
   27444 
   27445     if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
   27446       return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
   27447 
   27448     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
   27449   }
   27450 
   27451   return SDValue();
   27452 }
   27453 
   27454 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
   27455                                         const X86Subtarget *Subtarget) {
   27456   // First try to optimize away the conversion entirely when it's
   27457   // conditionally from a constant. Vectors only.
   27458   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
   27459     return Res;
   27460 
   27461   // Now move on to more general possibilities.
   27462   SDValue Op0 = N->getOperand(0);
   27463   EVT VT = N->getValueType(0);
   27464   EVT InVT = Op0.getValueType();
   27465   EVT InSVT = InVT.getScalarType();
   27466 
   27467   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
   27468   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
   27469   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
   27470     SDLoc dl(N);
   27471     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   27472                                  InVT.getVectorNumElements());
   27473     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
   27474     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
   27475   }
   27476 
   27477   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
   27478   // a 32-bit target where SSE doesn't support i64->FP operations.
   27479   if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
   27480     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
   27481     EVT LdVT = Ld->getValueType(0);
   27482 
   27483     // This transformation is not supported if the result type is f16
   27484     if (VT == MVT::f16)
   27485       return SDValue();
   27486 
   27487     if (!Ld->isVolatile() && !VT.isVector() &&
   27488         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
   27489         !Subtarget->is64Bit() && LdVT == MVT::i64) {
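                // The x87 FILD instruction loads the i64 directly from memory, so reuse
                // the load's address and splice the new node into the load's chain.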
   27490       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
   27491           SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
   27492       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
   27493       return FILDChain;
   27494     }
   27495   }
   27496   return SDValue();
   27497 }
   27498 
   27499 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
   27500 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
   27501                                  X86TargetLowering::DAGCombinerInfo &DCI) {
   27502   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
   27503   // the result is either zero or one (depending on the input carry bit).
   27504   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
   27505   if (X86::isZeroNode(N->getOperand(0)) &&
   27506       X86::isZeroNode(N->getOperand(1)) &&
    27507       // We don't have a good way to replace an EFLAGS use, so only do this
    27508       // when the EFLAGS result is dead, for now.
   27509       SDValue(N, 1).use_empty()) {
   27510     SDLoc DL(N);
   27511     EVT VT = N->getValueType(0);
   27512     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
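              // SETCC_CARRY materializes all-ones or zero from the carry flag (an
              // SBB-style idiom), so mask with 1 to get the 0/1 result.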
   27513     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
   27514                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   27515                                            DAG.getConstant(X86::COND_B, DL,
   27516                                                            MVT::i8),
   27517                                            N->getOperand(2)),
   27518                                DAG.getConstant(1, DL, VT));
   27519     return DCI.CombineTo(N, Res1, CarryOut);
   27520   }
   27521 
   27522   return SDValue();
   27523 }
   27524 
   27525 // fold (add Y, (sete  X, 0)) -> adc  0, Y
   27526 //      (add Y, (setne X, 0)) -> sbb -1, Y
   27527 //      (sub (sete  X, 0), Y) -> sbb  0, Y
   27528 //      (sub (setne X, 0), Y) -> adc -1, Y
   27529 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
   27530   SDLoc DL(N);
   27531 
   27532   // Look through ZExts.
   27533   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
   27534   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
   27535     return SDValue();
   27536 
   27537   SDValue SetCC = Ext.getOperand(0);
   27538   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
   27539     return SDValue();
   27540 
   27541   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
   27542   if (CC != X86::COND_E && CC != X86::COND_NE)
   27543     return SDValue();
   27544 
   27545   SDValue Cmp = SetCC.getOperand(1);
   27546   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
   27547       !X86::isZeroNode(Cmp.getOperand(1)) ||
   27548       !Cmp.getOperand(0).getValueType().isInteger())
   27549     return SDValue();
   27550 
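            // Compare the operand against 1: the subtraction borrows (sets CF) exactly
            // when the operand is zero, so the carry flag now encodes the original
            // sete/setne condition for the ADC/SBB below.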
   27551   SDValue CmpOp0 = Cmp.getOperand(0);
   27552   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
   27553                                DAG.getConstant(1, DL, CmpOp0.getValueType()));
   27554 
   27555   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
   27556   if (CC == X86::COND_NE)
   27557     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
   27558                        DL, OtherVal.getValueType(), OtherVal,
   27559                        DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
   27560                        NewCmp);
   27561   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
   27562                      DL, OtherVal.getValueType(), OtherVal,
   27563                      DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
   27564 }
   27565 
    27566 /// PerformAddCombine - Do target-specific DAG combines on integer adds.
   27567 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
   27568                                  const X86Subtarget *Subtarget) {
   27569   EVT VT = N->getValueType(0);
   27570   SDValue Op0 = N->getOperand(0);
   27571   SDValue Op1 = N->getOperand(1);
   27572 
   27573   // Try to synthesize horizontal adds from adds of shuffles.
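            // e.g. (add (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)) is the
            // kind of pattern isHorizontalBinOp recognizes and maps to (HADD A, B).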
   27574   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
   27575        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
   27576       isHorizontalBinOp(Op0, Op1, true))
   27577     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
   27578 
   27579   return OptimizeConditionalInDecrement(N, DAG);
   27580 }
   27581 
   27582 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
   27583                                  const X86Subtarget *Subtarget) {
   27584   SDValue Op0 = N->getOperand(0);
   27585   SDValue Op1 = N->getOperand(1);
   27586 
   27587   // X86 can't encode an immediate LHS of a sub. See if we can push the
   27588   // negation into a preceding instruction.
   27589   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    27590     // If the RHS of the sub is an XOR with one use and a constant, invert the
   27591     // immediate. Then add one to the LHS of the sub so we can turn
   27592     // X-Y -> X+~Y+1, saving one register.
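              // e.g. (sub 5, (xor X, 3)) becomes (add (xor X, ~3), 6), using the
              // identity C - Y == C + ~Y + 1.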
   27593     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
   27594         isa<ConstantSDNode>(Op1.getOperand(1))) {
   27595       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
   27596       EVT VT = Op0.getValueType();
   27597       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
   27598                                    Op1.getOperand(0),
   27599                                    DAG.getConstant(~XorC, SDLoc(Op1), VT));
   27600       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
   27601                          DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
   27602     }
   27603   }
   27604 
    27605   // Try to synthesize horizontal subs from subs of shuffles.
   27606   EVT VT = N->getValueType(0);
   27607   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
   27608        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
   27609       isHorizontalBinOp(Op0, Op1, true))
   27610     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
   27611 
   27612   return OptimizeConditionalInDecrement(N, DAG);
   27613 }
   27614 
    27615 /// performVZEXTCombine - Do target-specific combines on X86ISD::VZEXT nodes.
   27616 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
   27617                                    TargetLowering::DAGCombinerInfo &DCI,
   27618                                    const X86Subtarget *Subtarget) {
   27619   SDLoc DL(N);
   27620   MVT VT = N->getSimpleValueType(0);
   27621   SDValue Op = N->getOperand(0);
   27622   MVT OpVT = Op.getSimpleValueType();
   27623   MVT OpEltVT = OpVT.getVectorElementType();
   27624   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
   27625 
    27626   // (vzext (bitcast (vzext x))) -> (vzext x)
   27627   SDValue V = Op;
   27628   while (V.getOpcode() == ISD::BITCAST)
   27629     V = V.getOperand(0);
   27630 
   27631   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
   27632     MVT InnerVT = V.getSimpleValueType();
   27633     MVT InnerEltVT = InnerVT.getVectorElementType();
   27634 
   27635     // If the element sizes match exactly, we can just do one larger vzext. This
   27636     // is always an exact type match as vzext operates on integer types.
   27637     if (OpEltVT == InnerEltVT) {
   27638       assert(OpVT == InnerVT && "Types must match for vzext!");
   27639       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
   27640     }
   27641 
   27642     // The only other way we can combine them is if only a single element of the
   27643     // inner vzext is used in the input to the outer vzext.
   27644     if (InnerEltVT.getSizeInBits() < InputBits)
   27645       return SDValue();
   27646 
   27647     // In this case, the inner vzext is completely dead because we're going to
   27648     // only look at bits inside of the low element. Just do the outer vzext on
   27649     // a bitcast of the input to the inner.
   27650     return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
   27651   }
   27652 
   27653   // Check if we can bypass extracting and re-inserting an element of an input
   27654   // vector. Essentially:
    27655   // (bitcast (scalar_to_vector (extract_vector_elt x))) -> (bitcast x)
   27656   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   27657       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   27658       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
   27659     SDValue ExtractedV = V.getOperand(0);
   27660     SDValue OrigV = ExtractedV.getOperand(0);
   27661     if (isNullConstant(ExtractedV.getOperand(1))) {
    27662       MVT OrigVT = OrigV.getSimpleValueType();
    27663       // Extract a subvector if necessary...
    27664       if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
    27665         int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
    27666         OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
    27667                                   OrigVT.getVectorNumElements() / Ratio);
    27668         OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
    27669                             DAG.getIntPtrConstant(0, DL));
    27670       }
    27671       Op = DAG.getBitcast(OpVT, OrigV);
    27672       return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    27673     }
   27674   }
   27675 
   27676   return SDValue();
   27677 }
   27678 
   27679 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   27680                                              DAGCombinerInfo &DCI) const {
   27681   SelectionDAG &DAG = DCI.DAG;
   27682   switch (N->getOpcode()) {
   27683   default: break;
   27684   case ISD::EXTRACT_VECTOR_ELT:
   27685     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
   27686   case ISD::VSELECT:
   27687   case ISD::SELECT:
   27688   case X86ISD::SHRUNKBLEND:
   27689     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
   27690   case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG, Subtarget);
   27691   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   27692   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   27693   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
   27694   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
   27695   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
   27696   case ISD::SHL:
   27697   case ISD::SRA:
   27698   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
   27699   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   27700   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   27701   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
   27702   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
   27703   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
   27704   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   27705   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
   27706   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
   27707   case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
   27708   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   27709   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
   27710   case ISD::FNEG:           return PerformFNEGCombine(N, DAG, Subtarget);
   27711   case ISD::TRUNCATE:       return PerformTRUNCATECombine(N, DAG, Subtarget);
   27712   case X86ISD::FXOR:
   27713   case X86ISD::FOR:         return PerformFORCombine(N, DAG, Subtarget);
   27714   case X86ISD::FMIN:
   27715   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
   27716   case ISD::FMAXNUM:        return performFMaxNumCombine(N, DAG, Subtarget);
   27717   case X86ISD::FAND:        return PerformFANDCombine(N, DAG, Subtarget);
   27718   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG, Subtarget);
   27719   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   27720   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   27721   case ISD::ANY_EXTEND:
   27722   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
   27723   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
   27724   case ISD::SIGN_EXTEND_INREG:
   27725     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
   27726   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
   27727   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
   27728   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
   27729   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   27730   case X86ISD::SHUFP:       // Handle all target specific shuffles
   27731   case X86ISD::PALIGNR:
   27732   case X86ISD::UNPCKH:
   27733   case X86ISD::UNPCKL:
   27734   case X86ISD::MOVHLPS:
   27735   case X86ISD::MOVLHPS:
   27736   case X86ISD::PSHUFB:
   27737   case X86ISD::PSHUFD:
   27738   case X86ISD::PSHUFHW:
   27739   case X86ISD::PSHUFLW:
   27740   case X86ISD::MOVSS:
   27741   case X86ISD::MOVSD:
   27742   case X86ISD::VPERMILPI:
   27743   case X86ISD::VPERM2X128:
   27744   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   27745   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
    27746   case X86ISD::BLENDI:      return PerformBLENDICombine(N, DAG);
   27747   case ISD::MGATHER:
   27748   case ISD::MSCATTER:       return PerformGatherScatterCombine(N, DAG);
   27749   }
   27750 
   27751   return SDValue();
   27752 }
   27753 
   27754 /// isTypeDesirableForOp - Return true if the target has native support for
   27755 /// the specified value type and it is 'desirable' to use the type for the
   27756 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
   27757 /// instruction encodings are longer and some i16 instructions are slow.
   27758 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   27759   if (!isTypeLegal(VT))
   27760     return false;
   27761   if (VT != MVT::i16)
   27762     return true;
   27763 
   27764   switch (Opc) {
   27765   default:
   27766     return true;
   27767   case ISD::LOAD:
   27768   case ISD::SIGN_EXTEND:
   27769   case ISD::ZERO_EXTEND:
   27770   case ISD::ANY_EXTEND:
   27771   case ISD::SHL:
   27772   case ISD::SRL:
   27773   case ISD::SUB:
   27774   case ISD::ADD:
   27775   case ISD::MUL:
   27776   case ISD::AND:
   27777   case ISD::OR:
   27778   case ISD::XOR:
   27779     return false;
   27780   }
   27781 }
   27782 
    27783 /// IsDesirableToPromoteOp - This method queries the target whether it is
   27784 /// beneficial for dag combiner to promote the specified node. If true, it
   27785 /// should return the desired promotion type by reference.
   27786 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
   27787   EVT VT = Op.getValueType();
   27788   if (VT != MVT::i16)
   27789     return false;
   27790 
   27791   bool Promote = false;
   27792   bool Commute = false;
   27793   switch (Op.getOpcode()) {
   27794   default: break;
   27795   case ISD::LOAD: {
   27796     LoadSDNode *LD = cast<LoadSDNode>(Op);
   27797     // If the non-extending load has a single use and it's not live out, then it
   27798     // might be folded.
   27799     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
   27800                                                      Op.hasOneUse()*/) {
   27801       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   27802              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
    27803         // The only case where we'd want to promote LOAD (rather than it being
    27804         // promoted as an operand) is when its only use is live out.
   27805         if (UI->getOpcode() != ISD::CopyToReg)
   27806           return false;
   27807       }
   27808     }
   27809     Promote = true;
   27810     break;
   27811   }
   27812   case ISD::SIGN_EXTEND:
   27813   case ISD::ZERO_EXTEND:
   27814   case ISD::ANY_EXTEND:
   27815     Promote = true;
   27816     break;
   27817   case ISD::SHL:
   27818   case ISD::SRL: {
   27819     SDValue N0 = Op.getOperand(0);
   27820     // Look out for (store (shl (load), x)).
   27821     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
   27822       return false;
   27823     Promote = true;
   27824     break;
   27825   }
   27826   case ISD::ADD:
   27827   case ISD::MUL:
   27828   case ISD::AND:
   27829   case ISD::OR:
   27830   case ISD::XOR:
   27831     Commute = true;
   27832     // fallthrough
   27833   case ISD::SUB: {
   27834     SDValue N0 = Op.getOperand(0);
   27835     SDValue N1 = Op.getOperand(1);
   27836     if (!Commute && MayFoldLoad(N1))
   27837       return false;
   27838     // Avoid disabling potential load folding opportunities.
   27839     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
   27840       return false;
   27841     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
   27842       return false;
   27843     Promote = true;
   27844   }
   27845   }
   27846 
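            // Promote to i32: the 32-bit forms avoid the 0x66 operand-size prefix and
            // the partial-register updates that make many i16 instructions slow.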
   27847   PVT = MVT::i32;
   27848   return Promote;
   27849 }
   27850 
   27851 //===----------------------------------------------------------------------===//
   27852 //                           X86 Inline Assembly Support
   27853 //===----------------------------------------------------------------------===//
   27854 
    27855 // Helper to match an asm string against pieces separated by whitespace.
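          // e.g. matchAsm("bswap $0", {"bswap", "$0"}) succeeds, while a piece that
          // only matches a prefix of a longer token does not.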
   27856 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
   27857   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
   27858 
   27859   for (StringRef Piece : Pieces) {
   27860     if (!S.startswith(Piece)) // Check if the piece matches.
   27861       return false;
   27862 
   27863     S = S.substr(Piece.size());
   27864     StringRef::size_type Pos = S.find_first_not_of(" \t");
    27865     if (Pos == 0) // Piece only matched a prefix of a longer token.
   27866       return false;
   27867 
   27868     S = S.substr(Pos);
   27869   }
   27870 
   27871   return S.empty();
   27872 }
   27873 
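          // Return true if the split constraint string clobbers the condition-code and
          // FPU status registers ("~{cc}", "~{flags}", "~{fpsr}") and, when four pieces
          // are present, "~{dirflag}" as well.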
   27874 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
   27875 
   27876   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
   27877     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
   27878         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
   27879         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
   27880 
   27881       if (AsmPieces.size() == 3)
   27882         return true;
   27883       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
   27884         return true;
   27885     }
   27886   }
   27887   return false;
   27888 }
   27889 
   27890 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   27891   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
   27892 
   27893   std::string AsmStr = IA->getAsmString();
   27894 
   27895   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
   27896   if (!Ty || Ty->getBitWidth() % 16 != 0)
   27897     return false;
   27898 
   27899   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
   27900   SmallVector<StringRef, 4> AsmPieces;
   27901   SplitString(AsmStr, AsmPieces, ";\n");
   27902 
   27903   switch (AsmPieces.size()) {
   27904   default: return false;
   27905   case 1:
   27906     // FIXME: this should verify that we are targeting a 486 or better.  If not,
   27907     // we will turn this bswap into something that will be lowered to logical
   27908     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
   27909     // lower so don't worry about this.
   27910     // bswap $0
   27911     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
   27912         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
   27913         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
   27914         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
   27915         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
   27916         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
   27917       // No need to check constraints, nothing other than the equivalent of
   27918       // "=r,0" would be valid here.
   27919       return IntrinsicLowering::LowerToByteSwap(CI);
   27920     }
   27921 
   27922     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
   27923     if (CI->getType()->isIntegerTy(16) &&
   27924         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   27925         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
   27926          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
   27927       AsmPieces.clear();
   27928       StringRef ConstraintsStr = IA->getConstraintString();
   27929       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   27930       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   27931       if (clobbersFlagRegisters(AsmPieces))
   27932         return IntrinsicLowering::LowerToByteSwap(CI);
   27933     }
   27934     break;
   27935   case 3:
   27936     if (CI->getType()->isIntegerTy(32) &&
   27937         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   27938         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
   27939         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
   27940         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
   27941       AsmPieces.clear();
   27942       StringRef ConstraintsStr = IA->getConstraintString();
   27943       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   27944       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   27945       if (clobbersFlagRegisters(AsmPieces))
   27946         return IntrinsicLowering::LowerToByteSwap(CI);
   27947     }
   27948 
   27949     if (CI->getType()->isIntegerTy(64)) {
   27950       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
   27951       if (Constraints.size() >= 2 &&
   27952           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
   27953           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
   27954         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
   27955         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
   27956             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
   27957             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
   27958           return IntrinsicLowering::LowerToByteSwap(CI);
   27959       }
   27960     }
   27961     break;
   27962   }
   27963   return false;
   27964 }
   27965 
   27966 /// getConstraintType - Given a constraint letter, return the type of
   27967 /// constraint it is for this target.
   27968 X86TargetLowering::ConstraintType
   27969 X86TargetLowering::getConstraintType(StringRef Constraint) const {
   27970   if (Constraint.size() == 1) {
   27971     switch (Constraint[0]) {
   27972     case 'R':
   27973     case 'q':
   27974     case 'Q':
   27975     case 'f':
   27976     case 't':
   27977     case 'u':
   27978     case 'y':
   27979     case 'x':
   27980     case 'Y':
   27981     case 'l':
   27982       return C_RegisterClass;
   27983     case 'a':
   27984     case 'b':
   27985     case 'c':
   27986     case 'd':
   27987     case 'S':
   27988     case 'D':
   27989     case 'A':
   27990       return C_Register;
   27991     case 'I':
   27992     case 'J':
   27993     case 'K':
   27994     case 'L':
   27995     case 'M':
   27996     case 'N':
   27997     case 'G':
   27998     case 'C':
   27999     case 'e':
   28000     case 'Z':
   28001       return C_Other;
   28002     default:
   28003       break;
   28004     }
   28005   }
   28006   return TargetLowering::getConstraintType(Constraint);
   28007 }
   28008 
   28009 /// Examine constraint type and operand type and determine a weight value.
   28010 /// This object must already have been set up with the operand type
   28011 /// and the current alternative constraint selected.
   28012 TargetLowering::ConstraintWeight
   28013   X86TargetLowering::getSingleConstraintMatchWeight(
   28014     AsmOperandInfo &info, const char *constraint) const {
   28015   ConstraintWeight weight = CW_Invalid;
   28016   Value *CallOperandVal = info.CallOperandVal;
    28017   // If we don't have a value, we can't do a match,
    28018   // but allow it at the lowest weight.
   28019   if (!CallOperandVal)
   28020     return CW_Default;
   28021   Type *type = CallOperandVal->getType();
   28022   // Look at the constraint type.
   28023   switch (*constraint) {
   28024   default:
   28025     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
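              // fallthrough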
   28026   case 'R':
   28027   case 'q':
   28028   case 'Q':
   28029   case 'a':
   28030   case 'b':
   28031   case 'c':
   28032   case 'd':
   28033   case 'S':
   28034   case 'D':
   28035   case 'A':
   28036     if (CallOperandVal->getType()->isIntegerTy())
   28037       weight = CW_SpecificReg;
   28038     break;
   28039   case 'f':
   28040   case 't':
   28041   case 'u':
   28042     if (type->isFloatingPointTy())
   28043       weight = CW_SpecificReg;
   28044     break;
   28045   case 'y':
   28046     if (type->isX86_MMXTy() && Subtarget->hasMMX())
   28047       weight = CW_SpecificReg;
   28048     break;
   28049   case 'x':
   28050   case 'Y':
   28051     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
   28052         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
   28053       weight = CW_Register;
   28054     break;
   28055   case 'I':
   28056     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
   28057       if (C->getZExtValue() <= 31)
   28058         weight = CW_Constant;
   28059     }
   28060     break;
   28061   case 'J':
   28062     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   28063       if (C->getZExtValue() <= 63)
   28064         weight = CW_Constant;
   28065     }
   28066     break;
   28067   case 'K':
   28068     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   28069       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
   28070         weight = CW_Constant;
   28071     }
   28072     break;
   28073   case 'L':
   28074     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   28075       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
   28076         weight = CW_Constant;
   28077     }
   28078     break;
   28079   case 'M':
   28080     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   28081       if (C->getZExtValue() <= 3)
   28082         weight = CW_Constant;
   28083     }
   28084     break;
   28085   case 'N':
   28086     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   28087       if (C->getZExtValue() <= 0xff)
   28088         weight = CW_Constant;
   28089     }
   28090     break;
   28091   case 'G':
   28092   case 'C':
   28093     if (isa<ConstantFP>(CallOperandVal)) {
   28094       weight = CW_Constant;
   28095     }
   28096     break;
   28097   case 'e':
   28098     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   28099       if ((C->getSExtValue() >= -0x80000000LL) &&
   28100           (C->getSExtValue() <= 0x7fffffffLL))
   28101         weight = CW_Constant;
   28102     }
   28103     break;
   28104   case 'Z':
   28105     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   28106       if (C->getZExtValue() <= 0xffffffff)
   28107         weight = CW_Constant;
   28108     }
   28109     break;
   28110   }
   28111   return weight;
   28112 }
   28113 
   28114 /// LowerXConstraint - try to replace an X constraint, which matches anything,
   28115 /// with another that has more specific requirements based on the type of the
   28116 /// corresponding operand.
   28117 const char *X86TargetLowering::
   28118 LowerXConstraint(EVT ConstraintVT) const {
   28119   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   28120   // 'f' like normal targets.
   28121   if (ConstraintVT.isFloatingPoint()) {
   28122     if (Subtarget->hasSSE2())
   28123       return "Y";
   28124     if (Subtarget->hasSSE1())
   28125       return "x";
   28126   }
   28127 
   28128   return TargetLowering::LowerXConstraint(ConstraintVT);
   28129 }
   28130 
   28131 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
   28132 /// vector.  If it is invalid, don't add anything to Ops.
   28133 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   28134                                                      std::string &Constraint,
   28135                                                      std::vector<SDValue>&Ops,
   28136                                                      SelectionDAG &DAG) const {
   28137   SDValue Result;
   28138 
   28139   // Only support length 1 constraints for now.
   28140   if (Constraint.length() > 1) return;
   28141 
   28142   char ConstraintLetter = Constraint[0];
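            // The immediate constraints handled below mirror the GCC x86 ranges:
            // I=[0,31], J=[0,63], K=signed 8-bit, L=0xff/0xffff (plus 0xffffffff on
            // 64-bit), M=[0,3], N=[0,255], O=[0,127], e=signed 32-bit, Z=unsigned
            // 32-bit, and i=any literal immediate or non-PIC global address.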
   28143   switch (ConstraintLetter) {
   28144   default: break;
   28145   case 'I':
   28146     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   28147       if (C->getZExtValue() <= 31) {
   28148         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   28149                                        Op.getValueType());
   28150         break;
   28151       }
   28152     }
   28153     return;
   28154   case 'J':
   28155     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   28156       if (C->getZExtValue() <= 63) {
   28157         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   28158                                        Op.getValueType());
   28159         break;
   28160       }
   28161     }
   28162     return;
   28163   case 'K':
   28164     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   28165       if (isInt<8>(C->getSExtValue())) {
   28166         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   28167                                        Op.getValueType());
   28168         break;
   28169       }
   28170     }
   28171     return;
   28172   case 'L':
   28173     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   28174       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
   28175           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
   28176         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
   28177                                        Op.getValueType());
   28178         break;
   28179       }
   28180     }
   28181     return;
   28182   case 'M':
   28183     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   28184       if (C->getZExtValue() <= 3) {
   28185         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   28186                                        Op.getValueType());
   28187         break;
   28188       }
   28189     }
   28190     return;
   28191   case 'N':
   28192     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   28193       if (C->getZExtValue() <= 255) {
   28194         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   28195                                        Op.getValueType());
   28196         break;
   28197       }
   28198     }
   28199     return;
   28200   case 'O':
   28201     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   28202       if (C->getZExtValue() <= 127) {
   28203         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   28204                                        Op.getValueType());
   28205         break;
   28206       }
   28207     }
   28208     return;
   28209   case 'e': {
   28210     // 32-bit signed value
   28211     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   28212       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   28213                                            C->getSExtValue())) {
   28214         // Widen to 64 bits here to get it sign extended.
   28215         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
   28216         break;
   28217       }
   28218     // FIXME gcc accepts some relocatable values here too, but only in certain
   28219     // memory models; it's complicated.
   28220     }
   28221     return;
   28222   }
   28223   case 'Z': {
   28224     // 32-bit unsigned value
   28225     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   28226       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   28227                                            C->getZExtValue())) {
   28228         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   28229                                        Op.getValueType());
   28230         break;
   28231       }
   28232     }
   28233     // FIXME gcc accepts some relocatable values here too, but only in certain
   28234     // memory models; it's complicated.
   28235     return;
   28236   }
   28237   case 'i': {
   28238     // Literal immediates are always ok.
   28239     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
   28240       // Widen to 64 bits here to get it sign extended.
   28241       Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
   28242       break;
   28243     }
   28244 
   28245     // In any sort of PIC mode addresses need to be computed at runtime by
   28246     // adding in a register or some sort of table lookup.  These can't
   28247     // be used as immediates.
   28248     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
   28249       return;
   28250 
   28251     // If we are in non-pic codegen mode, we allow the address of a global (with
   28252     // an optional displacement) to be used with 'i'.
   28253     GlobalAddressSDNode *GA = nullptr;
   28254     int64_t Offset = 0;
   28255 
   28256     // Match either (GA), (GA+C), (GA+C1+C2), etc.
   28257     while (1) {
   28258       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
   28259         Offset += GA->getOffset();
   28260         break;
   28261       } else if (Op.getOpcode() == ISD::ADD) {
   28262         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   28263           Offset += C->getZExtValue();
   28264           Op = Op.getOperand(0);
   28265           continue;
   28266         }
   28267       } else if (Op.getOpcode() == ISD::SUB) {
   28268         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   28269           Offset += -C->getZExtValue();
   28270           Op = Op.getOperand(0);
   28271           continue;
   28272         }
   28273       }
   28274 
   28275       // Otherwise, this isn't something we can handle, reject it.
   28276       return;
   28277     }
   28278 
   28279     const GlobalValue *GV = GA->getGlobal();
   28280     // If we require an extra load to get this address, as in PIC mode, we
   28281     // can't accept it.
   28282     if (isGlobalStubReference(
   28283             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
   28284       return;
   28285 
   28286     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
   28287                                         GA->getValueType(0), Offset);
   28288     break;
   28289   }
   28290   }
   28291 
   28292   if (Result.getNode()) {
   28293     Ops.push_back(Result);
   28294     return;
   28295   }
   28296   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   28297 }
   28298 
   28299 std::pair<unsigned, const TargetRegisterClass *>
   28300 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   28301                                                 StringRef Constraint,
   28302                                                 MVT VT) const {
   28303   // First, see if this is a constraint that directly corresponds to an LLVM
   28304   // register class.
   28305   if (Constraint.size() == 1) {
   28306     // GCC Constraint Letters
   28307     switch (Constraint[0]) {
   28308     default: break;
   28309       // TODO: Slight differences here in allocation order and leaving
   28310       // RIP in the class. Do they matter any more here than they do
   28311       // in the normal allocation?
   28312     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
   28313       if (Subtarget->is64Bit()) {
   28314         if (VT == MVT::i32 || VT == MVT::f32)
   28315           return std::make_pair(0U, &X86::GR32RegClass);
   28316         if (VT == MVT::i16)
   28317           return std::make_pair(0U, &X86::GR16RegClass);
   28318         if (VT == MVT::i8 || VT == MVT::i1)
   28319           return std::make_pair(0U, &X86::GR8RegClass);
   28320         if (VT == MVT::i64 || VT == MVT::f64)
   28321           return std::make_pair(0U, &X86::GR64RegClass);
   28322         break;
   28323       }
   28324       // 32-bit fallthrough
   28325     case 'Q':   // Q_REGS
   28326       if (VT == MVT::i32 || VT == MVT::f32)
   28327         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
   28328       if (VT == MVT::i16)
   28329         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
   28330       if (VT == MVT::i8 || VT == MVT::i1)
   28331         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
   28332       if (VT == MVT::i64)
   28333         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
   28334       break;
   28335     case 'r':   // GENERAL_REGS
   28336     case 'l':   // INDEX_REGS
   28337       if (VT == MVT::i8 || VT == MVT::i1)
   28338         return std::make_pair(0U, &X86::GR8RegClass);
   28339       if (VT == MVT::i16)
   28340         return std::make_pair(0U, &X86::GR16RegClass);
   28341       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
   28342         return std::make_pair(0U, &X86::GR32RegClass);
   28343       return std::make_pair(0U, &X86::GR64RegClass);
   28344     case 'R':   // LEGACY_REGS
   28345       if (VT == MVT::i8 || VT == MVT::i1)
   28346         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
   28347       if (VT == MVT::i16)
   28348         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
   28349       if (VT == MVT::i32 || !Subtarget->is64Bit())
   28350         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
   28351       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
   28352     case 'f':  // FP Stack registers.
   28353       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
   28354       // value to the correct fpstack register class.
   28355       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
   28356         return std::make_pair(0U, &X86::RFP32RegClass);
   28357       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
   28358         return std::make_pair(0U, &X86::RFP64RegClass);
   28359       return std::make_pair(0U, &X86::RFP80RegClass);
   28360     case 'y':   // MMX_REGS if MMX allowed.
   28361       if (!Subtarget->hasMMX()) break;
   28362       return std::make_pair(0U, &X86::VR64RegClass);
   28363     case 'Y':   // SSE_REGS if SSE2 allowed
   28364       if (!Subtarget->hasSSE2()) break;
   28365       // FALL THROUGH.
   28366     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
   28367       if (!Subtarget->hasSSE1()) break;
   28368 
   28369       switch (VT.SimpleTy) {
   28370       default: break;
   28371       // Scalar SSE types.
   28372       case MVT::f32:
   28373       case MVT::i32:
   28374         return std::make_pair(0U, &X86::FR32RegClass);
   28375       case MVT::f64:
   28376       case MVT::i64:
   28377         return std::make_pair(0U, &X86::FR64RegClass);
   28378       // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
   28379       // Vector types.
   28380       case MVT::v16i8:
   28381       case MVT::v8i16:
   28382       case MVT::v4i32:
   28383       case MVT::v2i64:
   28384       case MVT::v4f32:
   28385       case MVT::v2f64:
   28386         return std::make_pair(0U, &X86::VR128RegClass);
   28387       // AVX types.
   28388       case MVT::v32i8:
   28389       case MVT::v16i16:
   28390       case MVT::v8i32:
   28391       case MVT::v4i64:
   28392       case MVT::v8f32:
   28393       case MVT::v4f64:
   28394         return std::make_pair(0U, &X86::VR256RegClass);
   28395       case MVT::v8f64:
   28396       case MVT::v16f32:
   28397       case MVT::v16i32:
   28398       case MVT::v8i64:
   28399         return std::make_pair(0U, &X86::VR512RegClass);
   28400       }
   28401       break;
   28402     }
   28403   }
   28404 
   28405   // Use the default implementation in TargetLowering to convert the register
   28406   // constraint into a member of a register class.
   28407   std::pair<unsigned, const TargetRegisterClass*> Res;
   28408   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
   28409 
   28410   // Not found as a standard register?
   28411   if (!Res.second) {
    28412     // Map the {st(N)} constraints, st(0) .. st(7), onto FP0 .. FP7.
   28413     if (Constraint.size() == 7 && Constraint[0] == '{' &&
   28414         tolower(Constraint[1]) == 's' &&
   28415         tolower(Constraint[2]) == 't' &&
   28416         Constraint[3] == '(' &&
   28417         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
   28418         Constraint[5] == ')' &&
   28419         Constraint[6] == '}') {
   28420 
    28421       Res.first = X86::FP0 + Constraint[4] - '0';
   28422       Res.second = &X86::RFP80RegClass;
   28423       return Res;
   28424     }
   28425 
   28426     // GCC allows "st(0)" to be called just plain "st".
   28427     if (StringRef("{st}").equals_lower(Constraint)) {
   28428       Res.first = X86::FP0;
   28429       Res.second = &X86::RFP80RegClass;
   28430       return Res;
   28431     }
   28432 
   28433     // flags -> EFLAGS
   28434     if (StringRef("{flags}").equals_lower(Constraint)) {
   28435       Res.first = X86::EFLAGS;
   28436       Res.second = &X86::CCRRegClass;
   28437       return Res;
   28438     }
   28439 
   28440     // 'A' means EAX + EDX.
   28441     if (Constraint == "A") {
   28442       Res.first = X86::EAX;
   28443       Res.second = &X86::GR32_ADRegClass;
   28444       return Res;
   28445     }
   28446     return Res;
   28447   }
   28448 
   28449   // Otherwise, check to see if this is a register class of the wrong value
   28450   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
   28451   // turn into {ax},{dx}.
   28452   // MVT::Other is used to specify clobber names.
   28453   if (Res.second->hasType(VT) || VT == MVT::Other)
   28454     return Res;   // Correct type already, nothing to do.
   28455 
    28456   // Get a matching integer of the correct size. e.g. "ax" with MVT::i32 should
    28457   // return "eax". This should even work for things like getting 64-bit integer
   28458   // registers when given an f64 type.
   28459   const TargetRegisterClass *Class = Res.second;
   28460   if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass ||
   28461       Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) {
   28462     unsigned Size = VT.getSizeInBits();
   28463     MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8
   28464                                   : Size == 16 ? MVT::i16
   28465                                   : Size == 32 ? MVT::i32
   28466                                   : Size == 64 ? MVT::i64
   28467                                   : MVT::Other;
   28468     unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy);
   28469     if (DestReg > 0) {
   28470       Res.first = DestReg;
   28471       Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass
   28472                  : SimpleTy == MVT::i16 ? &X86::GR16RegClass
   28473                  : SimpleTy == MVT::i32 ? &X86::GR32RegClass
   28474                  : &X86::GR64RegClass;
   28475       assert(Res.second->contains(Res.first) && "Register in register class");
   28476     } else {
   28477       // No register found/type mismatch.
   28478       Res.first = 0;
   28479       Res.second = nullptr;
   28480     }
   28481   } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass ||
   28482              Class == &X86::VR128RegClass || Class == &X86::VR256RegClass ||
   28483              Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass ||
   28484              Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass ||
   28485              Class == &X86::VR512RegClass) {
   28486     // Handle references to XMM physical registers that got mapped into the
   28487     // wrong class.  This can happen with constraints like {xmm0} where the
   28488     // target independent register mapper will just pick the first match it can
   28489     // find, ignoring the required type.
   28490 
   28491     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
   28492     if (VT == MVT::f32 || VT == MVT::i32)
   28493       Res.second = &X86::FR32RegClass;
   28494     else if (VT == MVT::f64 || VT == MVT::i64)
   28495       Res.second = &X86::FR64RegClass;
   28496     else if (X86::VR128RegClass.hasType(VT))
   28497       Res.second = &X86::VR128RegClass;
   28498     else if (X86::VR256RegClass.hasType(VT))
   28499       Res.second = &X86::VR256RegClass;
   28500     else if (X86::VR512RegClass.hasType(VT))
   28501       Res.second = &X86::VR512RegClass;
   28502     else {
    28503       // Type mismatch and not a clobber: return an error.
   28504       Res.first = 0;
   28505       Res.second = nullptr;
   28506     }
   28507   }
   28508 
   28509   return Res;
   28510 }
   28511 
   28512 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
   28513                                             const AddrMode &AM, Type *Ty,
   28514                                             unsigned AS) const {
   28515   // Scaling factors are not free at all.
   28516   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
   28517   // will take 2 allocations in the out of order engine instead of 1
   28518   // for plain addressing mode, i.e. inst (reg1).
   28519   // E.g.,
    28520   // vaddps (%rsi,%rdx), %ymm0, %ymm1
   28521   // Requires two allocations (one for the load, one for the computation)
   28522   // whereas:
   28523   // vaddps (%rsi), %ymm0, %ymm1
   28524   // Requires just 1 allocation, i.e., freeing allocations for other operations
   28525   // and having less micro operations to execute.
   28526   //
   28527   // For some X86 architectures, this is even worse because for instance for
   28528   // stores, the complex addressing mode forces the instruction to use the
   28529   // "load" ports instead of the dedicated "store" port.
   28530   // E.g., on Haswell:
   28531   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
   28532   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
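            // Report unsupported modes as a negative cost; legal modes are free when
            // only a base register is used and cost 1 once a scaled index is involved.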
   28533   if (isLegalAddressingMode(DL, AM, Ty, AS))
   28534     // Scale represents reg2 * scale, thus account for 1
   28535     // as soon as we use a second register.
   28536     return AM.Scale != 0;
   28537   return -1;
   28538 }
   28539 
   28540 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
   28541   // Integer division on x86 is expensive. However, when aggressively optimizing
   28542   // for code size, we prefer to use a div instruction, as it is usually smaller
   28543   // than the alternative sequence.
   28544   // The exception to this is vector division. Since x86 doesn't have vector
   28545   // integer division, leaving the division as-is is a loss even in terms of
   28546   // size, because it will have to be scalarized, while the alternative code
   28547   // sequence can be performed in vector form.
   28548   bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
   28549                                    Attribute::MinSize);
   28550   return OptSize && !VT.isVector();
   28551 }
   28552 
   28553 void X86TargetLowering::markInRegArguments(SelectionDAG &DAG,
   28554        TargetLowering::ArgListTy& Args) const {
   28555   // The MCU psABI requires some arguments to be passed in-register.
   28556   // For regular calls, the inreg arguments are marked by the front-end.
   28557   // However, for compiler generated library calls, we have to patch this
   28558   // up here.
   28559   if (!Subtarget->isTargetMCU() || !Args.size())
   28560     return;
   28561 
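            // Three 32-bit registers are available here; arguments wider than two
            // registers are left on the stack.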
   28562   unsigned FreeRegs = 3;
   28563   for (auto &Arg : Args) {
   28564     // For library functions, we do not expect any fancy types.
   28565     unsigned Size = DAG.getDataLayout().getTypeSizeInBits(Arg.Ty);
   28566     unsigned SizeInRegs = (Size + 31) / 32;
   28567     if (SizeInRegs > 2 || SizeInRegs > FreeRegs)
   28568       continue;
   28569 
   28570     Arg.isInReg = true;
   28571     FreeRegs -= SizeInRegs;
   28572     if (!FreeRegs)
   28573       break;
   28574   }
   28575 }
   28576