//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include "X86IntrinsicsInfo.h"
#include <bitset>
#include <numeric>
#include <cctype>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ReciprocalEstimateRefinementSteps(
    "x86-recip-refinement-steps", cl::init(1),
    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
             "result of the hardware reciprocal estimate instruction."),
    cl::NotHidden);

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  TD = getDataLayout();

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
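  // Illustrative note (not from the original source): this means a scalar
  // SETCC yields 0 or 1 in an i8, while a vector compare such as PCMPEQD
  // yields an all-zeros or all-ones element, i.e. 0 or -1 per lane.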

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget->hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
      addBypassSlowDiv(64, 16);
  }
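
  // Sketch of what the bypass does (not from the original source): for a
  // 32-bit divide, codegen may emit a run-time check and fall back to the
  // much cheaper 8-bit DIV when both operands happen to fit in 8 bits.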

  if (Subtarget->isTargetKnownWindowsMSVC()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
  }
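
  // Illustrative example (not from the original source): with the setup
  // above, a 64-bit sdiv on a 32-bit MSVC target is lowered to a call to
  // _alldiv using the stdcall calling convention.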

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetWindowsGNU()) {
    // The MS runtime is weird: it exports _setjmp but only plain longjmp.
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
  }
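
  // Worked example (not from the original source): a u16 -> f32 conversion
  // is promoted to an i32 SINT_TO_FP; zero-extending the u16 into i32 keeps
  // the value in the non-negative signed range, so the signed convert is
  // exact.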

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
  }
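
  // Worked example (not from the original source): f64 -> u16 is promoted to
  // an f64 -> i32 FP_TO_SINT followed by a truncate; every u16 value is
  // representable in the signed i32 range, so no precision is lost.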

  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }
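
  // Illustrative sketch (not from the original source): with SDIV/SREM
  // expanded to the two-result SDIVREM form, IR like
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // is CSE'd into one node and selected as a single IDIV, which leaves the
  // quotient in EAX and the remainder in EDX.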

  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
  setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
  }

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
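
  // Illustrative note (not from the original source): without F16C the
  // expanded FP16_TO_FP / FP_TO_FP16 nodes above typically end up as calls
  // to the compiler-rt half-float helpers (e.g. __gnu_h2f_ieee and
  // __gnu_f2h_ieee).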

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
  } else {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);

  if (!Subtarget->hasMOVBE())
    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
  }
  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not intended to support SjLj exception
  // handling; they are a light-weight setjmp/longjmp replacement used for
  // continuations, user-level threading, and the like. As a result, no other
  // SjLj exception interfaces are implemented, so please don't build your own
  // exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);

  // Expand certain atomics
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
    // TargetInfo::X86_64ABIBuiltinVaList
    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
  } else {
    // TargetInfo::CharPtrBuiltinVaList
    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);

  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
      setOperationAction(ISD::FSIN   , MVT::f32, Expand);
      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
      setOperationAction(ISD::FCOS   , MVT::f32, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ADD , VT, Expand);
    setOperationAction(ISD::SUB , VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL , VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA,  VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
    setOperationAction(ISD::MULHS,              MMXTy,      Expand);
    setOperationAction(ISD::AND,                MMXTy,      Expand);
    setOperationAction(ISD::OR,                 MMXTy,      Expand);
    setOperationAction(ISD::XOR,                MMXTy,      Expand);
    setOperationAction(ISD::SCALAR_TO_VECTOR,   MMXTy,      Expand);
    setOperationAction(ISD::SELECT,             MMXTy,      Expand);
    setOperationAction(ISD::BITCAST,            MMXTy,      Expand);
  }
  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    // Only provide customized ctpop vector bit twiddling for vector types we
    // know to perform better than using the popcnt instructions on each vector
    // element. If popcnt isn't supported, always provide the custom version.
    if (!Subtarget->hasPOPCNT()) {
      setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
      setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
    }

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND,    VT, Promote);
      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
      setOperationAction(ISD::OR,     VT, Promote);
      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
      setOperationAction(ISD::XOR,    VT, Promote);
      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   VT, Promote);
      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
      setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
      setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
      setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
      setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
    }

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
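
    // Illustrative example (not from the original source): the SEXTLOAD of
    // v4i8 into v4i32 marked Legal above is selected directly as PMOVSXBD,
    // and the matching ZEXTLOAD as PMOVZXBD.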

    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width.  f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    983     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    984     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    985     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    986     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    987 
    988     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    989     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    990     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    991     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    992 
    993     // FIXME: these should be Legal, but that's only for the case where
    994     // the index is constant.  For now custom expand to deal with that.
    995     if (Subtarget->is64Bit()) {
    996       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
    997       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    998     }
    999   }
   1000 
   1001   if (Subtarget->hasSSE2()) {
   1002     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
   1003     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
   1004 
   1005     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
   1006     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
   1007 
   1008     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
   1009     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
   1010 
   1011     // In the customized shift lowering, the legal cases in AVX2 will be
   1012     // recognized.
   1013     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
   1014     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
   1015 
   1016     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
   1017     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
   1018 
   1019     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
   1020   }
   1021 
   1022   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
   1023     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
   1024     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
   1025     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
   1026     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
   1027     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
   1028     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
   1029 
   1030     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
   1031     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
   1032     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
   1033 
   1034     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
   1035     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
   1036     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
   1037     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
   1038     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
   1039     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
   1040     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
   1041     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
   1042     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
   1043     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
   1044     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
   1045     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
   1046 
   1047     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
   1048     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
   1049     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
   1050     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
   1051     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
   1052     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
   1053     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
   1054     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
   1055     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
   1056     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
   1057     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
   1058     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
   1059 
   1060     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
   1061     // even though v8i16 is a legal type.
   1062     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
   1063     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
   1064     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
   1065 
   1066     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
   1067     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
   1068     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
   1069 
   1070     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
   1071     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
   1072 
   1073     for (MVT VT : MVT::fp_vector_valuetypes())
   1074       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
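            // For illustration: this makes an extending load such as
            //   (v4f64 (extload (v4f32 from memory)))
            // legal, so it can be matched as a single convert-from-memory
            // (e.g. VCVTPS2PD with a memory operand) rather than a load plus a
            // separate extend.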
   1075 
   1076     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
   1077     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
   1078 
   1079     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
   1080     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
   1081 
   1082     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
   1083     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
   1084 
   1085     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
   1086     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
   1087     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
   1088     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
   1089 
   1090     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
   1091     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
   1092     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
   1093 
   1094     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
   1095     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
   1096     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
   1097     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
   1098     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
   1099     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
   1100     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
   1101     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
   1102     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
   1103     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
   1104     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
   1105     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
   1106 
   1107     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
   1108       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
   1109       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
   1110       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
   1111       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
   1112       setOperationAction(ISD::FMA,             MVT::f32, Legal);
   1113       setOperationAction(ISD::FMA,             MVT::f64, Legal);
   1114     }
   1115 
   1116     if (Subtarget->hasInt256()) {
   1117       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
   1118       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
   1119       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
   1120       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
   1121 
   1122       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
   1123       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
   1124       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
   1125       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
   1126 
   1127       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1128       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
   1129       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
   1130       // Don't lower v32i8 because there is no 128-bit byte mul
   1131 
   1132       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
   1133       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
   1134       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
   1135       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
   1136 
   1137       // The custom lowering of UINT_TO_FP for v8i32 becomes interesting
   1138       // when we have a 256-bit-wide blend with immediate.
   1139       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
   1140 
   1141       // Only provide custom bit-twiddling CTPOP lowering for vector types
   1142       // where we know it performs better than using the popcnt instruction on
   1143       // each vector element. If popcnt isn't supported, always provide the
   1144       // custom version.
   1145       if (!Subtarget->hasPOPCNT())
   1146         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
   1147 
   1148       // Custom CTPOP always performs better on natively supported v8i32
   1149       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
   1150 
   1151       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
   1152       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
   1153       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
   1154       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
   1155       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
   1156       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
   1157       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
   1158 
   1159       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
   1160       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
   1161       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
   1162       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
   1163       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
   1164       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
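              // For illustration: a sign-extending load such as
              //   (v8i32 (sextload (v8i8 from memory)))
              // can now be selected as a single VPMOVSXBD from memory instead of a
              // load followed by a separate extend (similarly VPMOVZX* for the
              // zero-extending cases above).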
   1165     } else {
   1166       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
   1167       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
   1168       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
   1169       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
   1170 
   1171       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
   1172       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
   1173       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
   1174       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
   1175 
   1176       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
   1177       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
   1178       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
   1179       // Don't lower v32i8 because there is no 128-bit byte mul
   1180     }
   1181 
   1182     // In the custom shift lowering, the cases that are legal with AVX2 will
   1183     // be recognized.
   1184     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
   1185     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
   1186 
   1187     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
   1188     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
   1189 
   1190     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
   1191 
   1192     // Custom lower several nodes for 256-bit types.
   1193     for (MVT VT : MVT::vector_valuetypes()) {
   1194       if (VT.getScalarSizeInBits() >= 32) {
   1195         setOperationAction(ISD::MLOAD,  VT, Legal);
   1196         setOperationAction(ISD::MSTORE, VT, Legal);
   1197       }
   1198       // Extract subvector is special because the value type
   1199       // (result) is 128-bit but the source is 256-bit wide.
   1200       if (VT.is128BitVector()) {
   1201         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1202       }
   1203       // Do not attempt to custom lower other non-256-bit vectors
   1204       if (!VT.is256BitVector())
   1205         continue;
   1206 
   1207       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
   1208       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
   1209       setOperationAction(ISD::VSELECT,            VT, Custom);
   1210       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
   1211       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1212       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
   1213       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
   1214       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
   1215     }
   1216 
   1217     if (Subtarget->hasInt256())
   1218       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1219 
   1220 
   1221     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
   1222     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
   1223       MVT VT = (MVT::SimpleValueType)i;
   1224 
   1225       // Do not attempt to promote non-256-bit vectors
   1226       if (!VT.is256BitVector())
   1227         continue;
   1228 
   1229       setOperationAction(ISD::AND,    VT, Promote);
   1230       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
   1231       setOperationAction(ISD::OR,     VT, Promote);
   1232       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
   1233       setOperationAction(ISD::XOR,    VT, Promote);
   1234       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
   1235       setOperationAction(ISD::LOAD,   VT, Promote);
   1236       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
   1237       setOperationAction(ISD::SELECT, VT, Promote);
   1238       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
   1239     }
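            // For illustration: with the promotions above, a logical op such as
            //   (v32i8 (and %a, %b))
            // is bitcast to v4i64, done there as one 256-bit bitwise instruction
            // (e.g. VPAND on AVX2), and bitcast back; bitwise ops are lane-agnostic,
            // so the round trip is lossless.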
   1240   }
   1241 
   1242   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
   1243     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
   1244     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
   1245     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
   1246     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
   1247 
   1248     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
   1249     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
   1250     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
   1251 
   1252     for (MVT VT : MVT::fp_vector_valuetypes())
   1253       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
   1254 
   1255     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
   1256     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
   1257     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
   1258     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
   1259     setOperationAction(ISD::AND,                MVT::i1,    Legal);
   1260     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
   1261     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
   1262     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
   1263     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
   1264     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
   1265 
   1266     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
   1267     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
   1268     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
   1269     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
   1270     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
   1271     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
   1272 
   1273     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
   1274     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
   1275     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
   1276     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
   1277     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
   1278     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
   1279     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
   1280     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
   1281 
   1282     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
   1283     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
   1284     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
   1285     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
   1286     if (Subtarget->is64Bit()) {
   1287       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
   1288       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
   1289       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
   1290       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
   1291     }
   1292     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
   1293     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
   1294     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
   1295     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
   1296     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
   1297     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
   1298     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
   1299     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
   1300     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
   1301     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
   1302     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
   1303     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
   1304     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
   1305     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
   1306 
   1307     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
   1308     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
   1309     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
   1310     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
   1311     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
   1312     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
   1313     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
   1314     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
   1315     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
   1316     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
   1317     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
   1318     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
   1319     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
   1320 
   1321     setOperationAction(ISD::FFLOOR,             MVT::v16f32, Legal);
   1322     setOperationAction(ISD::FFLOOR,             MVT::v8f64, Legal);
   1323     setOperationAction(ISD::FCEIL,              MVT::v16f32, Legal);
   1324     setOperationAction(ISD::FCEIL,              MVT::v8f64, Legal);
   1325     setOperationAction(ISD::FTRUNC,             MVT::v16f32, Legal);
   1326     setOperationAction(ISD::FTRUNC,             MVT::v8f64, Legal);
   1327     setOperationAction(ISD::FRINT,              MVT::v16f32, Legal);
   1328     setOperationAction(ISD::FRINT,              MVT::v8f64, Legal);
   1329     setOperationAction(ISD::FNEARBYINT,         MVT::v16f32, Legal);
   1330     setOperationAction(ISD::FNEARBYINT,         MVT::v8f64, Legal);
   1331 
   1332     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
   1333     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
   1334     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
   1335     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
   1336     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
   1337 
   1338     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
   1339     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
   1340 
   1341     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
   1342 
   1343     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
   1344     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
   1345     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
   1346     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
   1347     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
   1348     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
   1349     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
   1350     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
   1351     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
   1352 
   1353     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
   1354     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
   1355 
   1356     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
   1357     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
   1358 
   1359     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
   1360 
   1361     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
   1362     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
   1363 
   1364     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
   1365     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
   1366 
   1367     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
   1368     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
   1369 
   1370     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
   1371     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
   1372     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
   1373     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
   1374     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
   1375     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
   1376 
   1377     if (Subtarget->hasCDI()) {
   1378       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
   1379       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
   1380     }
   1381 
   1382     // Custom lower several nodes.
   1383     for (MVT VT : MVT::vector_valuetypes()) {
   1384       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   1385       // Extract subvector is special because the value type
   1386       // (result) is 256/128-bit but the source is 512-bit wide.
   1387       if (VT.is128BitVector() || VT.is256BitVector()) {
   1388         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1389       }
   1390       if (VT.getVectorElementType() == MVT::i1)
   1391         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
   1392 
   1393       // Do not attempt to custom lower other non-512-bit vectors
   1394       if (!VT.is512BitVector())
   1395         continue;
   1396 
   1397       if (EltSize >= 32) {
   1398         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
   1399         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
   1400         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
   1401         setOperationAction(ISD::VSELECT,             VT, Legal);
   1402         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
   1403         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
   1404         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
   1405         setOperationAction(ISD::MLOAD,               VT, Legal);
   1406         setOperationAction(ISD::MSTORE,              VT, Legal);
   1407       }
   1408     }
   1409     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
   1410       MVT VT = (MVT::SimpleValueType)i;
   1411 
   1412       // Do not attempt to promote non-512-bit vectors.
   1413       if (!VT.is512BitVector())
   1414         continue;
   1415 
   1416       setOperationAction(ISD::SELECT, VT, Promote);
   1417       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
   1418     }
   1419   } // has AVX-512
   1420 
   1421   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
   1422     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
   1423     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
   1424 
   1425     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
   1426     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
   1427 
   1428     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
   1429     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
   1430     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
   1431     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
   1432     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
   1433     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
   1434     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
   1435     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
   1436     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
   1437     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
   1438     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
   1439     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
   1440     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
   1441 
   1442     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
   1443       const MVT VT = (MVT::SimpleValueType)i;
   1444 
   1445       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   1446 
   1447       // Do not attempt to promote non-512-bit vectors.
   1448       if (!VT.is512BitVector())
   1449         continue;
   1450 
   1451       if (EltSize < 32) {
   1452         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
   1453         setOperationAction(ISD::VSELECT,             VT, Legal);
   1454       }
   1455     }
   1456   }
   1457 
   1458   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
   1459     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
   1460     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
   1461 
   1462     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
   1463     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
   1464     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
   1465     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
   1466     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
   1467     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
   1468 
   1469     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
   1470     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
   1471     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
   1472     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
   1473     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
   1474     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
   1475   }
   1476 
   1477   // We want to custom lower some of our intrinsics.
   1478   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1479   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   1480   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   1481   if (!Subtarget->is64Bit())
   1482     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
   1483 
   1484   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1485   // handle type legalization for these operations here.
   1486   //
   1487   // FIXME: We really should do custom legalization for addition and
   1488   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1489   // than generic legalization for 64-bit multiplication-with-overflow, though.
   1490   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
   1491     // Add/Sub/Mul with overflow operations are custom lowered.
   1492     MVT VT = IntVTs[i];
   1493     setOperationAction(ISD::SADDO, VT, Custom);
   1494     setOperationAction(ISD::UADDO, VT, Custom);
   1495     setOperationAction(ISD::SSUBO, VT, Custom);
   1496     setOperationAction(ISD::USUBO, VT, Custom);
   1497     setOperationAction(ISD::SMULO, VT, Custom);
   1498     setOperationAction(ISD::UMULO, VT, Custom);
   1499   }
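          // Illustrative sketch: an overflow intrinsic such as
          //   %r = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
          // is custom lowered to an ADD that also produces EFLAGS, with the
          // overflow bit read back via a SETO-style flag materialization.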
   1500 
   1501 
   1502   if (!Subtarget->is64Bit()) {
   1503     // These libcalls are not available in 32-bit mode.
   1504     setLibcallName(RTLIB::SHL_I128, nullptr);
   1505     setLibcallName(RTLIB::SRL_I128, nullptr);
   1506     setLibcallName(RTLIB::SRA_I128, nullptr);
   1507   }
   1508 
   1509   // Combine sin / cos into one node or libcall if possible.
   1510   if (Subtarget->hasSinCos()) {
   1511     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
   1512     setLibcallName(RTLIB::SINCOS_F64, "sincos");
   1513     if (Subtarget->isTargetDarwin()) {
   1514       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
   1515       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
   1516       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
   1517       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   1518     }
   1519   }
   1520 
   1521   if (Subtarget->isTargetWin64()) {
   1522     setOperationAction(ISD::SDIV, MVT::i128, Custom);
   1523     setOperationAction(ISD::UDIV, MVT::i128, Custom);
   1524     setOperationAction(ISD::SREM, MVT::i128, Custom);
   1525     setOperationAction(ISD::UREM, MVT::i128, Custom);
   1526     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
   1527     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
   1528   }
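          // Illustrative note (helper names are an assumption, not checked here):
          // i128 division/remainder cannot be passed directly under the Win64
          // calling convention, so these are custom lowered to runtime library
          // calls (__divti3-style helpers) with the i128 operands passed
          // indirectly.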
   1529 
   1530   // We have target-specific dag combine patterns for the following nodes:
   1531   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1532   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1533   setTargetDAGCombine(ISD::BITCAST);
   1534   setTargetDAGCombine(ISD::VSELECT);
   1535   setTargetDAGCombine(ISD::SELECT);
   1536   setTargetDAGCombine(ISD::SHL);
   1537   setTargetDAGCombine(ISD::SRA);
   1538   setTargetDAGCombine(ISD::SRL);
   1539   setTargetDAGCombine(ISD::OR);
   1540   setTargetDAGCombine(ISD::AND);
   1541   setTargetDAGCombine(ISD::ADD);
   1542   setTargetDAGCombine(ISD::FADD);
   1543   setTargetDAGCombine(ISD::FSUB);
   1544   setTargetDAGCombine(ISD::FMA);
   1545   setTargetDAGCombine(ISD::SUB);
   1546   setTargetDAGCombine(ISD::LOAD);
   1547   setTargetDAGCombine(ISD::MLOAD);
   1548   setTargetDAGCombine(ISD::STORE);
   1549   setTargetDAGCombine(ISD::MSTORE);
   1550   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1551   setTargetDAGCombine(ISD::ANY_EXTEND);
   1552   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1553   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   1554   setTargetDAGCombine(ISD::TRUNCATE);
   1555   setTargetDAGCombine(ISD::SINT_TO_FP);
   1556   setTargetDAGCombine(ISD::SETCC);
   1557   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
   1558   setTargetDAGCombine(ISD::BUILD_VECTOR);
   1559   setTargetDAGCombine(ISD::MUL);
   1560   setTargetDAGCombine(ISD::XOR);
   1561 
   1562   computeRegisterProperties(Subtarget->getRegisterInfo());
   1563 
   1564   // On Darwin, -Os means optimize for size without hurting performance, so
   1565   // do not reduce the limit.
   1566   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1567   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
   1568   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1569   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1570   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1571   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   1572   setPrefLoopAlignment(4); // 2^4 bytes.
   1573 
   1574   // Predictable cmovs don't hurt on Atom because it's in-order.
   1575   PredictableSelectIsExpensive = !Subtarget->isAtom();
   1576   EnableExtLdPromotion = true;
   1577   setPrefFunctionAlignment(4); // 2^4 bytes.
   1578 
   1579   verifyIntrinsicTables();
   1580 }
   1581 
   1582 // This has so far only been implemented for 64-bit MachO.
   1583 bool X86TargetLowering::useLoadStackGuardNode() const {
   1584   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
   1585 }
   1586 
   1587 TargetLoweringBase::LegalizeTypeAction
   1588 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
   1589   if (ExperimentalVectorWideningLegalization &&
   1590       VT.getVectorNumElements() != 1 &&
   1591       VT.getVectorElementType().getSimpleVT() != MVT::i1)
   1592     return TypeWidenVector;
   1593 
   1594   return TargetLoweringBase::getPreferredVectorAction(VT);
   1595 }
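        // For illustration: with the experimental flag set, any multi-element
        // vector whose element type is not i1 is legalized by widening (adding
        // lanes of the same element type) rather than by the generic default,
        // which for small integer vectors is typically element promotion.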
   1596 
   1597 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   1598   if (!VT.isVector())
   1599     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
   1600 
   1601   const unsigned NumElts = VT.getVectorNumElements();
   1602   const EVT EltVT = VT.getVectorElementType();
   1603   if (VT.is512BitVector()) {
   1604     if (Subtarget->hasAVX512())
   1605       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
   1606           EltVT == MVT::f32 || EltVT == MVT::f64)
   1607         switch(NumElts) {
   1608         case  8: return MVT::v8i1;
   1609         case 16: return MVT::v16i1;
   1610       }
   1611     if (Subtarget->hasBWI())
   1612       if (EltVT == MVT::i8 || EltVT == MVT::i16)
   1613         switch(NumElts) {
   1614         case 32: return MVT::v32i1;
   1615         case 64: return MVT::v64i1;
   1616       }
   1617   }
   1618 
   1619   if (VT.is256BitVector() || VT.is128BitVector()) {
   1620     if (Subtarget->hasVLX())
   1621       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
   1622           EltVT == MVT::f32 || EltVT == MVT::f64)
   1623         switch(NumElts) {
   1624         case 2: return MVT::v2i1;
   1625         case 4: return MVT::v4i1;
   1626         case 8: return MVT::v8i1;
   1627       }
   1628     if (Subtarget->hasBWI() && Subtarget->hasVLX())
   1629       if (EltVT == MVT::i8 || EltVT == MVT::i16)
   1630         switch(NumElts) {
   1631         case  8: return MVT::v8i1;
   1632         case 16: return MVT::v16i1;
   1633         case 32: return MVT::v32i1;
   1634       }
   1635   }
   1636 
   1637   return VT.changeVectorElementTypeToInteger();
   1638 }
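        // For illustration: with AVX-512, a compare on v16f32 produces a v16i1
        // mask (held in a k-register); without mask registers, vector compares
        // fall back to an integer vector of the same width (e.g. v4i32 for a
        // v4f32 compare), i.e. an all-ones/all-zeros lane per element.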
   1639 
   1640 /// Helper for getByValTypeAlignment to determine
   1641 /// the desired ByVal argument alignment.
   1642 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1643   if (MaxAlign == 16)
   1644     return;
   1645   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1646     if (VTy->getBitWidth() == 128)
   1647       MaxAlign = 16;
   1648   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1649     unsigned EltAlign = 0;
   1650     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1651     if (EltAlign > MaxAlign)
   1652       MaxAlign = EltAlign;
   1653   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1654     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
   1655       unsigned EltAlign = 0;
   1656       getMaxByValAlign(STy->getElementType(i), EltAlign);
   1657       if (EltAlign > MaxAlign)
   1658         MaxAlign = EltAlign;
   1659       if (MaxAlign == 16)
   1660         break;
   1661     }
   1662   }
   1663 }
   1664 
   1665 /// Return the desired alignment for ByVal aggregate
   1666 /// function arguments in the caller parameter area. For X86, aggregates
   1667 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1668 /// are at 4-byte boundaries.
   1669 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   1670   if (Subtarget->is64Bit()) {
   1671     // Max of 8 and alignment of type.
   1672     unsigned TyAlign = TD->getABITypeAlignment(Ty);
   1673     if (TyAlign > 8)
   1674       return TyAlign;
   1675     return 8;
   1676   }
   1677 
   1678   unsigned Align = 4;
   1679   if (Subtarget->hasSSE1())
   1680     getMaxByValAlign(Ty, Align);
   1681   return Align;
   1682 }
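        // Illustrative example: on 32-bit x86 with SSE, a byval struct containing
        // a <4 x float> member is placed on a 16-byte boundary, while a plain
        // { i32, i32 } byval argument keeps the default 4-byte alignment. On
        // 64-bit targets the result is simply max(8, ABI alignment of the type).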
   1683 
   1684 /// Returns the target specific optimal type for load
   1685 /// and store operations as a result of memset, memcpy, and memmove
   1686 /// lowering. If DstAlign is zero, the destination can satisfy any alignment
   1687 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
   1688 /// against an alignment requirement,
   1689 /// probably because the source does not need to be loaded. If 'IsMemset' is
   1690 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
   1691 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
   1692 /// source is constant so it does not need to be loaded.
   1693 /// It returns EVT::Other if the type should be determined using generic
   1694 /// target-independent logic.
   1695 EVT
   1696 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1697                                        unsigned DstAlign, unsigned SrcAlign,
   1698                                        bool IsMemset, bool ZeroMemset,
   1699                                        bool MemcpyStrSrc,
   1700                                        MachineFunction &MF) const {
   1701   const Function *F = MF.getFunction();
   1702   if ((!IsMemset || ZeroMemset) &&
   1703       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
   1704     if (Size >= 16 &&
   1705         (Subtarget->isUnalignedMemAccessFast() ||
   1706          ((DstAlign == 0 || DstAlign >= 16) &&
   1707           (SrcAlign == 0 || SrcAlign >= 16)))) {
   1708       if (Size >= 32) {
   1709         if (Subtarget->hasInt256())
   1710           return MVT::v8i32;
   1711         if (Subtarget->hasFp256())
   1712           return MVT::v8f32;
   1713       }
   1714       if (Subtarget->hasSSE2())
   1715         return MVT::v4i32;
   1716       if (Subtarget->hasSSE1())
   1717         return MVT::v4f32;
   1718     } else if (!MemcpyStrSrc && Size >= 8 &&
   1719                !Subtarget->is64Bit() &&
   1720                Subtarget->hasSSE2()) {
   1721       // Do not use f64 to lower memcpy if the source is a string constant.
   1722       // It's better to use i32 to avoid the loads.
   1723       return MVT::f64;
   1724     }
   1725   }
   1726   if (Subtarget->is64Bit() && Size >= 8)
   1727     return MVT::i64;
   1728   return MVT::i32;
   1729 }
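        // Illustrative example: a 64-byte memcpy on an AVX2 target with suitably
        // aligned (or fast-unaligned) operands is expanded with v8i32 moves; an
        // SSE2-only target uses v4i32, and without vector registers the choice
        // falls back to i64 (64-bit) or i32 scalar moves.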
   1730 
   1731 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   1732   if (VT == MVT::f32)
   1733     return X86ScalarSSEf32;
   1734   else if (VT == MVT::f64)
   1735     return X86ScalarSSEf64;
   1736   return true;
   1737 }
   1738 
   1739 bool
   1740 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   1741                                                   unsigned,
   1742                                                   unsigned,
   1743                                                   bool *Fast) const {
   1744   if (Fast)
   1745     *Fast = Subtarget->isUnalignedMemAccessFast();
   1746   return true;
   1747 }
   1748 
   1749 /// Return the entry encoding for a jump table in the
   1750 /// current function.  The returned value is a member of the
   1751 /// MachineJumpTableInfo::JTEntryKind enum.
   1752 unsigned X86TargetLowering::getJumpTableEncoding() const {
   1753   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   1754   // symbol.
   1755   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
   1756       Subtarget->isPICStyleGOT())
   1757     return MachineJumpTableInfo::EK_Custom32;
   1758 
   1759   // Otherwise, use the normal jump table encoding heuristics.
   1760   return TargetLowering::getJumpTableEncoding();
   1761 }
   1762 
   1763 const MCExpr *
   1764 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   1765                                              const MachineBasicBlock *MBB,
   1766                                              unsigned uid,MCContext &Ctx) const{
   1767   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
   1768          Subtarget->isPICStyleGOT());
   1769   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   1770   // entries.
   1771   return MCSymbolRefExpr::Create(MBB->getSymbol(),
   1772                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   1773 }
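        // For illustration: with the EK_Custom32 encoding chosen above, each jump
        // table entry is emitted roughly as
        //   .long  .LBB0_n@GOTOFF
        // i.e. a 32-bit offset of the target block from the GOT base, which the
        // indirect-branch sequence adds back to the PIC base register.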
   1774 
   1775 /// Returns relocation base for the given PIC jumptable.
   1776 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   1777                                                     SelectionDAG &DAG) const {
   1778   if (!Subtarget->is64Bit())
   1779     // This doesn't have an SDLoc associated with it, but it is not really
   1780     // the same as a Register.
   1781     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
   1782   return Table;
   1783 }
   1784 
   1785 /// This returns the relocation base for the given PIC jumptable,
   1786 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
   1787 const MCExpr *X86TargetLowering::
   1788 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   1789                              MCContext &Ctx) const {
   1790   // X86-64 uses RIP relative addressing based on the jump table label.
   1791   if (Subtarget->isPICStyleRIPRel())
   1792     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   1793 
   1794   // Otherwise, the reference is relative to the PIC base.
   1795   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
   1796 }
   1797 
   1798 std::pair<const TargetRegisterClass *, uint8_t>
   1799 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
   1800                                            MVT VT) const {
   1801   const TargetRegisterClass *RRC = nullptr;
   1802   uint8_t Cost = 1;
   1803   switch (VT.SimpleTy) {
   1804   default:
   1805     return TargetLowering::findRepresentativeClass(TRI, VT);
   1806   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   1807     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
   1808     break;
   1809   case MVT::x86mmx:
   1810     RRC = &X86::VR64RegClass;
   1811     break;
   1812   case MVT::f32: case MVT::f64:
   1813   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   1814   case MVT::v4f32: case MVT::v2f64:
   1815   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
   1816   case MVT::v4f64:
   1817     RRC = &X86::VR128RegClass;
   1818     break;
   1819   }
   1820   return std::make_pair(RRC, Cost);
   1821 }
   1822 
   1823 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   1824                                                unsigned &Offset) const {
   1825   if (!Subtarget->isTargetLinux())
   1826     return false;
   1827 
   1828   if (Subtarget->is64Bit()) {
   1829     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
   1830     Offset = 0x28;
   1831     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
   1832       AddressSpace = 256;
   1833     else
   1834       AddressSpace = 257;
   1835   } else {
   1836     // %gs:0x14 on i386
   1837     Offset = 0x14;
   1838     AddressSpace = 256;
   1839   }
   1840   return true;
   1841 }
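        // For illustration: these (address space, offset) pairs correspond to
        // loads such as "mov %fs:0x28, %rax" for the 64-bit Linux stack cookie
        // (%gs:0x28 under the kernel code model) and "mov %gs:0x14, %eax" on
        // i386; address spaces 256 and 257 denote the GS and FS segments.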
   1842 
   1843 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
   1844                                             unsigned DestAS) const {
   1845   assert(SrcAS != DestAS && "Expected different address spaces!");
   1846 
   1847   return SrcAS < 256 && DestAS < 256;
   1848 }
   1849 
   1850 //===----------------------------------------------------------------------===//
   1851 //               Return Value Calling Convention Implementation
   1852 //===----------------------------------------------------------------------===//
   1853 
   1854 #include "X86GenCallingConv.inc"
   1855 
   1856 bool
   1857 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   1858                                   MachineFunction &MF, bool isVarArg,
   1859                         const SmallVectorImpl<ISD::OutputArg> &Outs,
   1860                         LLVMContext &Context) const {
   1861   SmallVector<CCValAssign, 16> RVLocs;
   1862   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   1863   return CCInfo.CheckReturn(Outs, RetCC_X86);
   1864 }
   1865 
   1866 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
   1867   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
   1868   return ScratchRegs;
   1869 }
   1870 
   1871 SDValue
   1872 X86TargetLowering::LowerReturn(SDValue Chain,
   1873                                CallingConv::ID CallConv, bool isVarArg,
   1874                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   1875                                const SmallVectorImpl<SDValue> &OutVals,
   1876                                SDLoc dl, SelectionDAG &DAG) const {
   1877   MachineFunction &MF = DAG.getMachineFunction();
   1878   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   1879 
   1880   SmallVector<CCValAssign, 16> RVLocs;
   1881   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
   1882   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   1883 
   1884   SDValue Flag;
   1885   SmallVector<SDValue, 6> RetOps;
   1886   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   1887   // Operand #1 = Bytes To Pop
   1888   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
   1889                    MVT::i16));
   1890 
   1891   // Copy the result values into the output registers.
   1892   for (unsigned i = 0; i != RVLocs.size(); ++i) {
   1893     CCValAssign &VA = RVLocs[i];
   1894     assert(VA.isRegLoc() && "Can only return in registers!");
   1895     SDValue ValToCopy = OutVals[i];
   1896     EVT ValVT = ValToCopy.getValueType();
   1897 
   1898     // Promote values to the appropriate types.
   1899     if (VA.getLocInfo() == CCValAssign::SExt)
   1900       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1901     else if (VA.getLocInfo() == CCValAssign::ZExt)
   1902       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1903     else if (VA.getLocInfo() == CCValAssign::AExt)
   1904       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
   1905     else if (VA.getLocInfo() == CCValAssign::BCvt)
   1906       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
   1907 
   1908     assert(VA.getLocInfo() != CCValAssign::FPExt &&
   1909            "Unexpected FP-extend for return value.");
   1910 
   1911     // If this is x86-64, and we disabled SSE, we can't return FP values,
   1912     // or SSE or MMX vectors.
   1913     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   1914          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   1915           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
   1916       report_fatal_error("SSE register return with SSE disabled");
   1917     }
   1918     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   1919     // llvm-gcc has never done it right and no one has noticed, so this
   1920     // should be OK for now.
   1921     if (ValVT == MVT::f64 &&
   1922         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
   1923       report_fatal_error("SSE2 register return with SSE2 disabled");
   1924 
   1925     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   1926     // the RET instruction and handled by the FP Stackifier.
   1927     if (VA.getLocReg() == X86::FP0 ||
   1928         VA.getLocReg() == X86::FP1) {
   1929       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   1930       // change the value to the FP stack register class.
   1931       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   1932         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   1933       RetOps.push_back(ValToCopy);
   1934       // Don't emit a copytoreg.
   1935       continue;
   1936     }
   1937 
   1938     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   1939     // which is returned in RAX / RDX.
   1940     if (Subtarget->is64Bit()) {
   1941       if (ValVT == MVT::x86mmx) {
   1942         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   1943           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
   1944           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   1945                                   ValToCopy);
   1946           // If we don't have SSE2 available, convert to v4f32 so the generated
   1947           // register is legal.
   1948           if (!Subtarget->hasSSE2())
   1949             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
   1950         }
   1951       }
   1952     }
   1953 
   1954     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
   1955     Flag = Chain.getValue(1);
   1956     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   1957   }
   1958 
   1959   // The x86-64 ABIs require that for returning structs by value we copy
   1960   // the sret argument into %rax/%eax (depending on ABI) for the return.
   1961   // Win32 requires us to put the sret argument to %eax as well.
   1962   // We saved the argument into a virtual register in the entry block,
   1963   // so now we copy the value out and into %rax/%eax.
   1964   //
   1965   // Checking Function.hasStructRetAttr() here is insufficient because the IR
   1966   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
   1967   // false, then an sret argument may be implicitly inserted in the SelDAG. In
   1968   // either case FuncInfo->setSRetReturnReg() will have been called.
   1969   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
   1970     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
   1971            "No need for an sret register");
   1972     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
   1973 
   1974     unsigned RetValReg
   1975         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
   1976           X86::RAX : X86::EAX;
   1977     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
   1978     Flag = Chain.getValue(1);
   1979 
   1980     // RAX/EAX now acts like a return value.
   1981     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
   1982   }
   1983 
   1984   RetOps[0] = Chain;  // Update chain.
   1985 
   1986   // Add the flag if we have it.
   1987   if (Flag.getNode())
   1988     RetOps.push_back(Flag);
   1989 
   1990   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
   1991 }
   1992 
   1993 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   1994   if (N->getNumValues() != 1)
   1995     return false;
   1996   if (!N->hasNUsesOfValue(1, 0))
   1997     return false;
   1998 
   1999   SDValue TCChain = Chain;
   2000   SDNode *Copy = *N->use_begin();
   2001   if (Copy->getOpcode() == ISD::CopyToReg) {
   2002     // If the copy has a glue operand, we conservatively assume it isn't safe to
   2003     // perform a tail call.
   2004     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   2005       return false;
   2006     TCChain = Copy->getOperand(0);
   2007   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   2008     return false;
   2009 
   2010   bool HasRet = false;
   2011   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   2012        UI != UE; ++UI) {
   2013     if (UI->getOpcode() != X86ISD::RET_FLAG)
   2014       return false;
   2015     // If we are returning more than one value, we can definitely not make
   2016     // a tail call; see PR19530.
   2017     if (UI->getNumOperands() > 4)
   2018       return false;
   2019     if (UI->getNumOperands() == 4 &&
   2020         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
   2021       return false;
   2022     HasRet = true;
   2023   }
   2024 
   2025   if (!HasRet)
   2026     return false;
   2027 
   2028   Chain = TCChain;
   2029   return true;
   2030 }
   2031 
   2032 EVT
   2033 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
   2034                                             ISD::NodeType ExtendKind) const {
   2035   MVT ReturnMVT;
   2036   // TODO: Is this also valid on 32-bit?
   2037   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
   2038     ReturnMVT = MVT::i8;
   2039   else
   2040     ReturnMVT = MVT::i32;
   2041 
   2042   EVT MinVT = getRegisterType(Context, ReturnMVT);
   2043   return VT.bitsLT(MinVT) ? MinVT : VT;
   2044 }
   2045 
   2046 /// Lower the result values of a call into the
   2047 /// appropriate copies out of appropriate physical registers.
   2048 ///
   2049 SDValue
   2050 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   2051                                    CallingConv::ID CallConv, bool isVarArg,
   2052                                    const SmallVectorImpl<ISD::InputArg> &Ins,
   2053                                    SDLoc dl, SelectionDAG &DAG,
   2054                                    SmallVectorImpl<SDValue> &InVals) const {
   2055 
   2056   // Assign locations to each value returned by this call.
   2057   SmallVector<CCValAssign, 16> RVLocs;
   2058   bool Is64Bit = Subtarget->is64Bit();
   2059   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
   2060                  *DAG.getContext());
   2061   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2062 
   2063   // Copy all of the result registers out of their specified physreg.
   2064   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   2065     CCValAssign &VA = RVLocs[i];
   2066     EVT CopyVT = VA.getValVT();
   2067 
   2068     // If this is x86-64, and we disabled SSE, we can't return FP values
   2069     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
   2070         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
   2071       report_fatal_error("SSE register return with SSE disabled");
   2072     }
   2073 
   2074     // If we prefer to use the value in xmm registers, copy it out as f80 and
   2075     // use a truncate to move it from fp stack reg to xmm reg.
   2076     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
   2077         isScalarFPTypeInSSEReg(VA.getValVT()))
   2078       CopyVT = MVT::f80;
   2079 
   2080     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
   2081                                CopyVT, InFlag).getValue(1);
   2082     SDValue Val = Chain.getValue(0);
   2083 
   2084     if (CopyVT != VA.getValVT())
   2085       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   2086                         // This truncation won't change the value.
   2087                         DAG.getIntPtrConstant(1));
   2088 
   2089     InFlag = Chain.getValue(2);
   2090     InVals.push_back(Val);
   2091   }
   2092 
   2093   return Chain;
   2094 }
   2095 
   2096 //===----------------------------------------------------------------------===//
   2097 //                C & StdCall & Fast Calling Convention implementation
   2098 //===----------------------------------------------------------------------===//
   2099 //  The StdCall calling convention is the standard for many Windows API
   2100 //  routines. It differs from the C calling convention only slightly: the
   2101 //  callee cleans up the stack rather than the caller, and symbols are also
   2102 //  decorated in some fancy way :) It doesn't support any vector arguments.
   2103 //  For info on fast calling convention see Fast Calling Convention (tail call)
   2104 //  implementation LowerX86_32FastCCCallTo.
   2105 
   2106 /// Determines whether a call uses struct return
   2107 /// semantics.
   2108 enum StructReturnType {
   2109   NotStructReturn,
   2110   RegStructReturn,
   2111   StackStructReturn
   2112 };
   2113 static StructReturnType
   2114 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
   2115   if (Outs.empty())
   2116     return NotStructReturn;
   2117 
   2118   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
   2119   if (!Flags.isSRet())
   2120     return NotStructReturn;
   2121   if (Flags.isInReg())
   2122     return RegStructReturn;
   2123   return StackStructReturn;
   2124 }
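        // Illustrative example: a call whose first outgoing argument carries the
        // sret attribute, e.g.
        //   call void @f(%struct.S* sret %tmp, i32 %x)
        // classifies as StackStructReturn, or as RegStructReturn when that
        // argument is additionally marked inreg.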
   2125 
   2126 /// Determines whether a function uses struct return semantics.
   2127 static StructReturnType
   2128 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
   2129   if (Ins.empty())
   2130     return NotStructReturn;
   2131 
   2132   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
   2133   if (!Flags.isSRet())
   2134     return NotStructReturn;
   2135   if (Flags.isInReg())
   2136     return RegStructReturn;
   2137   return StackStructReturn;
   2138 }
   2139 
   2140 /// Make a copy of an aggregate at the address specified by "Src" to the
   2141 /// address "Dst", with size and alignment information specified by the
   2142 /// parameter attribute. The copy will be passed as a byval function parameter.
   2143 static SDValue
   2144 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
   2145                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
   2146                           SDLoc dl) {
   2147   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   2148 
   2149   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   2150                        /*isVolatile*/false, /*AlwaysInline=*/true,
   2151                        /*isTailCall*/false,
   2152                        MachinePointerInfo(), MachinePointerInfo());
   2153 }
   2154 
   2155 /// Return true if the calling convention is one that
   2156 /// supports tail call optimization.
   2157 static bool IsTailCallConvention(CallingConv::ID CC) {
   2158   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
   2159           CC == CallingConv::HiPE);
   2160 }
   2161 
   2162 /// \brief Return true if the calling convention is a C calling convention.
   2163 static bool IsCCallConvention(CallingConv::ID CC) {
   2164   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
   2165           CC == CallingConv::X86_64_SysV);
   2166 }
   2167 
   2168 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   2169   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
   2170     return false;
   2171 
   2172   CallSite CS(CI);
   2173   CallingConv::ID CalleeCC = CS.getCallingConv();
   2174   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
   2175     return false;
   2176 
   2177   return true;
   2178 }
   2179 
   2180 /// Return true if the function is being made into
   2181 /// a tailcall target by changing its ABI.
   2182 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
   2183                                    bool GuaranteedTailCallOpt) {
   2184   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
   2185 }
   2186 
   2187 SDValue
   2188 X86TargetLowering::LowerMemArgument(SDValue Chain,
   2189                                     CallingConv::ID CallConv,
   2190                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2191                                     SDLoc dl, SelectionDAG &DAG,
   2192                                     const CCValAssign &VA,
   2193                                     MachineFrameInfo *MFI,
   2194                                     unsigned i) const {
   2195   // Create the nodes corresponding to a load from this parameter slot.
   2196   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   2197   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
   2198       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
   2199   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   2200   EVT ValVT;
   2201 
    2202   // If the value is passed by pointer, the location holds the address rather
    2203   // than the value itself.
   2204   if (VA.getLocInfo() == CCValAssign::Indirect)
   2205     ValVT = VA.getLocVT();
   2206   else
   2207     ValVT = VA.getValVT();
   2208 
   2209   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   2210   // changed with more analysis.
    2211   // In case of tail call optimization, mark all arguments mutable, since they
    2212   // could be overwritten by the lowering of arguments in a tail call.
   2213   if (Flags.isByVal()) {
   2214     unsigned Bytes = Flags.getByValSize();
   2215     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   2216     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
   2217     return DAG.getFrameIndex(FI, getPointerTy());
   2218   } else {
   2219     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
   2220                                     VA.getLocMemOffset(), isImmutable);
   2221     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
   2222     return DAG.getLoad(ValVT, dl, Chain, FIN,
   2223                        MachinePointerInfo::getFixedStack(FI),
   2224                        false, false, false, 0);
   2225   }
   2226 }
   2227 
   2228 // FIXME: Get this from tablegen.
   2229 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
   2230                                                 const X86Subtarget *Subtarget) {
   2231   assert(Subtarget->is64Bit());
   2232 
   2233   if (Subtarget->isCallingConvWin64(CallConv)) {
   2234     static const MCPhysReg GPR64ArgRegsWin64[] = {
   2235       X86::RCX, X86::RDX, X86::R8,  X86::R9
   2236     };
   2237     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
   2238   }
   2239 
   2240   static const MCPhysReg GPR64ArgRegs64Bit[] = {
   2241     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   2242   };
   2243   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
   2244 }
   2245 
   2246 // FIXME: Get this from tablegen.
   2247 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
   2248                                                 CallingConv::ID CallConv,
   2249                                                 const X86Subtarget *Subtarget) {
   2250   assert(Subtarget->is64Bit());
   2251   if (Subtarget->isCallingConvWin64(CallConv)) {
   2252     // The XMM registers which might contain var arg parameters are shadowed
    2253     // in their paired GPR.  So we only need to save the GPRs to their home
   2254     // slots.
   2255     // TODO: __vectorcall will change this.
   2256     return None;
   2257   }
   2258 
   2259   const Function *Fn = MF.getFunction();
   2260   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
   2261   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
   2262          "SSE register cannot be used when SSE is disabled!");
   2263   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
   2264       !Subtarget->hasSSE1())
   2265     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
   2266     // registers.
   2267     return None;
   2268 
   2269   static const MCPhysReg XMMArgRegs64Bit[] = {
   2270     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2271     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2272   };
   2273   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
   2274 }
   2275 
   2276 SDValue
   2277 X86TargetLowering::LowerFormalArguments(SDValue Chain,
   2278                                         CallingConv::ID CallConv,
   2279                                         bool isVarArg,
   2280                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   2281                                         SDLoc dl,
   2282                                         SelectionDAG &DAG,
   2283                                         SmallVectorImpl<SDValue> &InVals)
   2284                                           const {
   2285   MachineFunction &MF = DAG.getMachineFunction();
   2286   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2287   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   2288 
   2289   const Function* Fn = MF.getFunction();
   2290   if (Fn->hasExternalLinkage() &&
   2291       Subtarget->isTargetCygMing() &&
   2292       Fn->getName() == "main")
   2293     FuncInfo->setForceFramePointer(true);
   2294 
   2295   MachineFrameInfo *MFI = MF.getFrameInfo();
   2296   bool Is64Bit = Subtarget->is64Bit();
   2297   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
   2298 
   2299   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2300          "Var args not supported with calling convention fastcc, ghc or hipe");
   2301 
   2302   // Assign locations to all of the incoming arguments.
   2303   SmallVector<CCValAssign, 16> ArgLocs;
   2304   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
   2305 
   2306   // Allocate shadow area for Win64
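           // (the 32-byte home space that callers must reserve for RCX, RDX, R8 and R9).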
   2307   if (IsWin64)
   2308     CCInfo.AllocateStack(32, 8);
   2309 
   2310   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
   2311 
   2312   unsigned LastVal = ~0U;
   2313   SDValue ArgValue;
   2314   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2315     CCValAssign &VA = ArgLocs[i];
   2316     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
   2317     // places.
   2318     assert(VA.getValNo() != LastVal &&
   2319            "Don't support value assigned to multiple locs yet");
   2320     (void)LastVal;
   2321     LastVal = VA.getValNo();
   2322 
   2323     if (VA.isRegLoc()) {
   2324       EVT RegVT = VA.getLocVT();
   2325       const TargetRegisterClass *RC;
   2326       if (RegVT == MVT::i32)
   2327         RC = &X86::GR32RegClass;
   2328       else if (Is64Bit && RegVT == MVT::i64)
   2329         RC = &X86::GR64RegClass;
   2330       else if (RegVT == MVT::f32)
   2331         RC = &X86::FR32RegClass;
   2332       else if (RegVT == MVT::f64)
   2333         RC = &X86::FR64RegClass;
   2334       else if (RegVT.is512BitVector())
   2335         RC = &X86::VR512RegClass;
   2336       else if (RegVT.is256BitVector())
   2337         RC = &X86::VR256RegClass;
   2338       else if (RegVT.is128BitVector())
   2339         RC = &X86::VR128RegClass;
   2340       else if (RegVT == MVT::x86mmx)
   2341         RC = &X86::VR64RegClass;
   2342       else if (RegVT == MVT::i1)
   2343         RC = &X86::VK1RegClass;
   2344       else if (RegVT == MVT::v8i1)
   2345         RC = &X86::VK8RegClass;
   2346       else if (RegVT == MVT::v16i1)
   2347         RC = &X86::VK16RegClass;
   2348       else if (RegVT == MVT::v32i1)
   2349         RC = &X86::VK32RegClass;
   2350       else if (RegVT == MVT::v64i1)
   2351         RC = &X86::VK64RegClass;
   2352       else
   2353         llvm_unreachable("Unknown argument type!");
   2354 
   2355       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   2356       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   2357 
   2358       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   2359       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   2360       // right size.
   2361       if (VA.getLocInfo() == CCValAssign::SExt)
   2362         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   2363                                DAG.getValueType(VA.getValVT()));
   2364       else if (VA.getLocInfo() == CCValAssign::ZExt)
   2365         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   2366                                DAG.getValueType(VA.getValVT()));
   2367       else if (VA.getLocInfo() == CCValAssign::BCvt)
   2368         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
   2369 
   2370       if (VA.isExtInLoc()) {
   2371         // Handle MMX values passed in XMM regs.
   2372         if (RegVT.isVector())
   2373           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
   2374         else
   2375           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   2376       }
   2377     } else {
   2378       assert(VA.isMemLoc());
   2379       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
   2380     }
   2381 
    2382     // If the value is passed via a pointer, do a load.
   2383     if (VA.getLocInfo() == CCValAssign::Indirect)
   2384       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
   2385                              MachinePointerInfo(), false, false, false, 0);
   2386 
   2387     InVals.push_back(ArgValue);
   2388   }
   2389 
   2390   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
   2391     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2392       // The x86-64 ABIs require that for returning structs by value we copy
   2393       // the sret argument into %rax/%eax (depending on ABI) for the return.
    2394       // Win32 requires us to put the sret argument in %eax as well.
   2395       // Save the argument into a virtual register so that we can access it
   2396       // from the return points.
   2397       if (Ins[i].Flags.isSRet()) {
   2398         unsigned Reg = FuncInfo->getSRetReturnReg();
   2399         if (!Reg) {
   2400           MVT PtrTy = getPointerTy();
   2401           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
   2402           FuncInfo->setSRetReturnReg(Reg);
   2403         }
   2404         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
   2405         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   2406         break;
   2407       }
   2408     }
   2409   }
   2410 
   2411   unsigned StackSize = CCInfo.getNextStackOffset();
   2412   // Align stack specially for tail calls.
   2413   if (FuncIsMadeTailCallSafe(CallConv,
   2414                              MF.getTarget().Options.GuaranteedTailCallOpt))
   2415     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   2416 
    2417   // If the function takes a variable number of arguments, make a frame index for
   2418   // the start of the first vararg value... for expansion of llvm.va_start. We
   2419   // can skip this if there are no va_start calls.
   2420   if (MFI->hasVAStart() &&
   2421       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   2422                    CallConv != CallingConv::X86_ThisCall))) {
   2423     FuncInfo->setVarArgsFrameIndex(
   2424         MFI->CreateFixedObject(1, StackSize, true));
   2425   }
   2426 
   2427   MachineModuleInfo &MMI = MF.getMMI();
   2428   const Function *WinEHParent = nullptr;
   2429   if (IsWin64 && MMI.hasWinEHFuncInfo(Fn))
   2430     WinEHParent = MMI.getWinEHParent(Fn);
   2431   bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
   2432   bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
   2433 
   2434   // Figure out if XMM registers are in use.
   2435   assert(!(MF.getTarget().Options.UseSoftFloat &&
   2436            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
   2437          "SSE register cannot be used when SSE is disabled!");
   2438 
   2439   // 64-bit calling conventions support varargs and register parameters, so we
   2440   // have to do extra work to spill them in the prologue.
   2441   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
    2442     // Find the first unallocated GPR and XMM argument registers.
   2443     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
   2444     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
   2445     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
   2446     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
   2447     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
   2448            "SSE register cannot be used when SSE is disabled!");
   2449 
    2450     // Gather all the live-in physical registers.
   2451     SmallVector<SDValue, 6> LiveGPRs;
   2452     SmallVector<SDValue, 8> LiveXMMRegs;
   2453     SDValue ALVal;
   2454     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
   2455       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
   2456       LiveGPRs.push_back(
   2457           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
   2458     }
   2459     if (!ArgXMMs.empty()) {
   2460       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2461       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
   2462       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
   2463         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
   2464         LiveXMMRegs.push_back(
   2465             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
   2466       }
   2467     }
   2468 
   2469     if (IsWin64) {
   2470       // Get to the caller-allocated home save location.  Add 8 to account
   2471       // for the return address.
   2472       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   2473       FuncInfo->setRegSaveFrameIndex(
   2474           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
    2475       // Fix up the vararg frame index to point into the shadow area (4 x i64).
   2476       if (NumIntRegs < 4)
   2477         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   2478     } else {
   2479       // For X86-64, if there are vararg parameters that are passed via
   2480       // registers, then we must store them to their spots on the stack so
    2481       // they may be loaded by dereferencing the result of va_next.
   2482       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   2483       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
   2484       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
   2485           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
   2486     }
   2487 
   2488     // Store the integer parameter registers.
   2489     SmallVector<SDValue, 8> MemOps;
   2490     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   2491                                       getPointerTy());
   2492     unsigned Offset = FuncInfo->getVarArgsGPOffset();
   2493     for (SDValue Val : LiveGPRs) {
   2494       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
   2495                                 DAG.getIntPtrConstant(Offset));
   2496       SDValue Store =
   2497         DAG.getStore(Val.getValue(1), dl, Val, FIN,
   2498                      MachinePointerInfo::getFixedStack(
   2499                        FuncInfo->getRegSaveFrameIndex(), Offset),
   2500                      false, false, 0);
   2501       MemOps.push_back(Store);
   2502       Offset += 8;
   2503     }
   2504 
   2505     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
   2506       // Now store the XMM (fp + vector) parameter registers.
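               // These are bundled into a single VASTART_SAVE_XMM_REGS pseudo node; its
               // custom inserter later expands it into stores guarded by the value in AL,
               // so the XMM spills are skipped when no SSE registers were actually used.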
   2507       SmallVector<SDValue, 12> SaveXMMOps;
   2508       SaveXMMOps.push_back(Chain);
   2509       SaveXMMOps.push_back(ALVal);
   2510       SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2511                              FuncInfo->getRegSaveFrameIndex()));
   2512       SaveXMMOps.push_back(DAG.getIntPtrConstant(
   2513                              FuncInfo->getVarArgsFPOffset()));
   2514       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
   2515                         LiveXMMRegs.end());
   2516       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   2517                                    MVT::Other, SaveXMMOps));
   2518     }
   2519 
   2520     if (!MemOps.empty())
   2521       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
   2522   } else if (IsWinEHOutlined) {
   2523     // Get to the caller-allocated home save location.  Add 8 to account
   2524     // for the return address.
   2525     int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   2526     FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject(
   2527         /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false));
   2528 
   2529     MMI.getWinEHFuncInfo(Fn)
   2530         .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] =
   2531         FuncInfo->getRegSaveFrameIndex();
   2532 
   2533     // Store the second integer parameter (rdx) into rsp+16 relative to the
   2534     // stack pointer at the entry of the function.
   2535     SDValue RSFIN =
   2536         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy());
   2537     unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
   2538     SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
   2539     Chain = DAG.getStore(
   2540         Val.getValue(1), dl, Val, RSFIN,
   2541         MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()),
   2542         /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0);
   2543   }
   2544 
   2545   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
   2546     // Find the largest legal vector type.
   2547     MVT VecVT = MVT::Other;
   2548     // FIXME: Only some x86_32 calling conventions support AVX512.
   2549     if (Subtarget->hasAVX512() &&
   2550         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
   2551                      CallConv == CallingConv::Intel_OCL_BI)))
   2552       VecVT = MVT::v16f32;
   2553     else if (Subtarget->hasAVX())
   2554       VecVT = MVT::v8f32;
   2555     else if (Subtarget->hasSSE2())
   2556       VecVT = MVT::v4f32;
   2557 
   2558     // We forward some GPRs and some vector types.
   2559     SmallVector<MVT, 2> RegParmTypes;
   2560     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
   2561     RegParmTypes.push_back(IntVT);
   2562     if (VecVT != MVT::Other)
   2563       RegParmTypes.push_back(VecVT);
   2564 
   2565     // Compute the set of forwarded registers. The rest are scratch.
   2566     SmallVectorImpl<ForwardedRegister> &Forwards =
   2567         FuncInfo->getForwardedMustTailRegParms();
   2568     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
   2569 
   2570     // Conservatively forward AL on x86_64, since it might be used for varargs.
   2571     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
   2572       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   2573       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
   2574     }
   2575 
   2576     // Copy all forwards from physical to virtual registers.
   2577     for (ForwardedRegister &F : Forwards) {
   2578       // FIXME: Can we use a less constrained schedule?
   2579       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
   2580       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
   2581       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
   2582     }
   2583   }
   2584 
   2585   // Some CCs need callee pop.
   2586   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   2587                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   2588     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   2589   } else {
   2590     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   2591     // If this is an sret function, the return should pop the hidden pointer.
   2592     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
   2593         !Subtarget->getTargetTriple().isOSMSVCRT() &&
   2594         argsAreStructReturn(Ins) == StackStructReturn)
   2595       FuncInfo->setBytesToPopOnReturn(4);
   2596   }
   2597 
   2598   if (!Is64Bit) {
   2599     // RegSaveFrameIndex is X86-64 only.
   2600     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
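             // (0xAAAAAAA is just a recognizable sentinel value; it should never be read
             // on x86-32.)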
   2601     if (CallConv == CallingConv::X86_FastCall ||
   2602         CallConv == CallingConv::X86_ThisCall)
   2603       // fastcc functions can't have varargs.
   2604       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   2605   }
   2606 
   2607   FuncInfo->setArgumentStackSize(StackSize);
   2608 
   2609   if (IsWinEHParent) {
   2610     int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
   2611     SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
   2612     MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
   2613     SDValue Neg2 = DAG.getConstant(-2, MVT::i64);
   2614     Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
   2615                          MachinePointerInfo::getFixedStack(UnwindHelpFI),
   2616                          /*isVolatile=*/true,
   2617                          /*isNonTemporal=*/false, /*Alignment=*/0);
   2618   }
   2619 
   2620   return Chain;
   2621 }
   2622 
   2623 SDValue
   2624 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
   2625                                     SDValue StackPtr, SDValue Arg,
   2626                                     SDLoc dl, SelectionDAG &DAG,
   2627                                     const CCValAssign &VA,
   2628                                     ISD::ArgFlagsTy Flags) const {
   2629   unsigned LocMemOffset = VA.getLocMemOffset();
   2630   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
   2631   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
   2632   if (Flags.isByVal())
   2633     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   2634 
   2635   return DAG.getStore(Chain, dl, Arg, PtrOff,
   2636                       MachinePointerInfo::getStack(LocMemOffset),
   2637                       false, false, 0);
   2638 }
   2639 
    2640 /// Emit a load of the return address if tail call
   2641 /// optimization is performed and it is required.
   2642 SDValue
   2643 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   2644                                            SDValue &OutRetAddr, SDValue Chain,
   2645                                            bool IsTailCall, bool Is64Bit,
   2646                                            int FPDiff, SDLoc dl) const {
   2647   // Adjust the Return address stack slot.
   2648   EVT VT = getPointerTy();
   2649   OutRetAddr = getReturnAddressFrameIndex(DAG);
   2650 
   2651   // Load the "old" Return address.
   2652   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
   2653                            false, false, false, 0);
   2654   return SDValue(OutRetAddr.getNode(), 1);
   2655 }
   2656 
   2657 /// Emit a store of the return address if tail call
   2658 /// optimization is performed and it is required (FPDiff!=0).
   2659 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
   2660                                         SDValue Chain, SDValue RetAddrFrIdx,
   2661                                         EVT PtrVT, unsigned SlotSize,
   2662                                         int FPDiff, SDLoc dl) {
   2663   // Store the return address to the appropriate stack slot.
   2664   if (!FPDiff) return Chain;
   2665   // Calculate the new stack slot for the return address.
   2666   int NewReturnAddrFI =
   2667     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
   2668                                          false);
   2669   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   2670   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   2671                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
   2672                        false, false, 0);
   2673   return Chain;
   2674 }
   2675 
   2676 SDValue
   2677 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   2678                              SmallVectorImpl<SDValue> &InVals) const {
   2679   SelectionDAG &DAG                     = CLI.DAG;
   2680   SDLoc &dl                             = CLI.DL;
   2681   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   2682   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
   2683   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   2684   SDValue Chain                         = CLI.Chain;
   2685   SDValue Callee                        = CLI.Callee;
   2686   CallingConv::ID CallConv              = CLI.CallConv;
   2687   bool &isTailCall                      = CLI.IsTailCall;
   2688   bool isVarArg                         = CLI.IsVarArg;
   2689 
   2690   MachineFunction &MF = DAG.getMachineFunction();
   2691   bool Is64Bit        = Subtarget->is64Bit();
   2692   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
   2693   StructReturnType SR = callIsStructReturn(Outs);
   2694   bool IsSibcall      = false;
   2695   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   2696 
   2697   if (MF.getTarget().Options.DisableTailCalls)
   2698     isTailCall = false;
   2699 
   2700   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
   2701   if (IsMustTail) {
   2702     // Force this to be a tail call.  The verifier rules are enough to ensure
   2703     // that we can lower this successfully without moving the return address
   2704     // around.
   2705     isTailCall = true;
   2706   } else if (isTailCall) {
   2707     // Check if it's really possible to do a tail call.
   2708     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   2709                     isVarArg, SR != NotStructReturn,
   2710                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
   2711                     Outs, OutVals, Ins, DAG);
   2712 
   2713     // Sibcalls are automatically detected tailcalls which do not require
   2714     // ABI changes.
   2715     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   2716       IsSibcall = true;
   2717 
   2718     if (isTailCall)
   2719       ++NumTailCalls;
   2720   }
   2721 
   2722   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
   2723          "Var args not supported with calling convention fastcc, ghc or hipe");
   2724 
   2725   // Analyze operands of the call, assigning locations to each operand.
   2726   SmallVector<CCValAssign, 16> ArgLocs;
   2727   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
   2728 
   2729   // Allocate shadow area for Win64
   2730   if (IsWin64)
   2731     CCInfo.AllocateStack(32, 8);
   2732 
   2733   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   2734 
   2735   // Get a count of how many bytes are to be pushed on the stack.
   2736   unsigned NumBytes = CCInfo.getNextStackOffset();
   2737   if (IsSibcall)
    2738     // This is a sibcall. The memory operands are already available in the
    2739     // caller's own caller's stack.
   2740     NumBytes = 0;
   2741   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
   2742            IsTailCallConvention(CallConv))
   2743     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   2744 
   2745   int FPDiff = 0;
   2746   if (isTailCall && !IsSibcall && !IsMustTail) {
   2747     // Lower arguments at fp - stackoffset + fpdiff.
   2748     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
   2749 
   2750     FPDiff = NumBytesCallerPushed - NumBytes;
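             // A negative FPDiff means this call needs more argument stack space than
             // the caller's incoming arguments provide, so the return address will be
             // moved down by -FPDiff bytes (see EmitTailCallStoreRetAddr).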
   2751 
    2752     // Record the delta of movement of the return-address stack slot,
    2753     // but only if this call requires a larger movement than recorded so far.
   2754     if (FPDiff < X86Info->getTCReturnAddrDelta())
   2755       X86Info->setTCReturnAddrDelta(FPDiff);
   2756   }
   2757 
   2758   unsigned NumBytesToPush = NumBytes;
   2759   unsigned NumBytesToPop = NumBytes;
   2760 
   2761   // If we have an inalloca argument, all stack space has already been allocated
    2762   // for us and is right at the top of the stack.  We don't support multiple
   2763   // arguments passed in memory when using inalloca.
   2764   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
   2765     NumBytesToPush = 0;
   2766     if (!ArgLocs.back().isMemLoc())
   2767       report_fatal_error("cannot use inalloca attribute on a register "
   2768                          "parameter");
   2769     if (ArgLocs.back().getLocMemOffset() != 0)
   2770       report_fatal_error("any parameter with the inalloca attribute must be "
   2771                          "the only memory argument");
   2772   }
   2773 
   2774   if (!IsSibcall)
   2775     Chain = DAG.getCALLSEQ_START(
   2776         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
   2777 
   2778   SDValue RetAddrFrIdx;
   2779   // Load return address for tail calls.
   2780   if (isTailCall && FPDiff)
   2781     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   2782                                     Is64Bit, FPDiff, dl);
   2783 
   2784   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   2785   SmallVector<SDValue, 8> MemOpChains;
   2786   SDValue StackPtr;
   2787 
   2788   // Walk the register/memloc assignments, inserting copies/loads.  In the case
    2789   // of tail call optimization, arguments are handled later.
   2790   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   2791   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    2792     // Skip inalloca arguments; they have already been written.
   2793     ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2794     if (Flags.isInAlloca())
   2795       continue;
   2796 
   2797     CCValAssign &VA = ArgLocs[i];
   2798     EVT RegVT = VA.getLocVT();
   2799     SDValue Arg = OutVals[i];
   2800     bool isByVal = Flags.isByVal();
   2801 
   2802     // Promote the value if needed.
   2803     switch (VA.getLocInfo()) {
   2804     default: llvm_unreachable("Unknown loc info!");
   2805     case CCValAssign::Full: break;
   2806     case CCValAssign::SExt:
   2807       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   2808       break;
   2809     case CCValAssign::ZExt:
   2810       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   2811       break;
   2812     case CCValAssign::AExt:
   2813       if (RegVT.is128BitVector()) {
   2814         // Special case: passing MMX values in XMM registers.
   2815         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
   2816         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   2817         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   2818       } else
   2819         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   2820       break;
   2821     case CCValAssign::BCvt:
   2822       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
   2823       break;
   2824     case CCValAssign::Indirect: {
   2825       // Store the argument.
   2826       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   2827       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   2828       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
   2829                            MachinePointerInfo::getFixedStack(FI),
   2830                            false, false, 0);
   2831       Arg = SpillSlot;
   2832       break;
   2833     }
   2834     }
   2835 
   2836     if (VA.isRegLoc()) {
   2837       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   2838       if (isVarArg && IsWin64) {
    2839         // The Win64 ABI requires an XMM argument register to be copied to the
    2840         // corresponding shadow GPR if the callee is a varargs function.
   2841         unsigned ShadowReg = 0;
   2842         switch (VA.getLocReg()) {
   2843         case X86::XMM0: ShadowReg = X86::RCX; break;
   2844         case X86::XMM1: ShadowReg = X86::RDX; break;
   2845         case X86::XMM2: ShadowReg = X86::R8; break;
   2846         case X86::XMM3: ShadowReg = X86::R9; break;
   2847         }
   2848         if (ShadowReg)
   2849           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   2850       }
   2851     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   2852       assert(VA.isMemLoc());
   2853       if (!StackPtr.getNode())
   2854         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   2855                                       getPointerTy());
   2856       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   2857                                              dl, DAG, VA, Flags));
   2858     }
   2859   }
   2860 
   2861   if (!MemOpChains.empty())
   2862     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
   2863 
   2864   if (Subtarget->isPICStyleGOT()) {
    2865     // ELF / PIC requires the GOT address to be in the EBX register before
    2866     // function calls made via the PLT.
   2867     if (!isTailCall) {
   2868       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
   2869                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
   2870     } else {
    2871       // If we are tail calling and generating PIC/GOT style code, load the
    2872       // address of the callee into ECX. The value in ECX is used as the target of
   2873       // the tail jump. This is done to circumvent the ebx/callee-saved problem
   2874       // for tail calls on PIC/GOT architectures. Normally we would just put the
   2875       // address of GOT into ebx and then call target@PLT. But for tail calls
   2876       // ebx would be restored (since ebx is callee saved) before jumping to the
   2877       // target@PLT.
   2878 
   2879       // Note: The actual moving to ECX is done further down.
   2880       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   2881       if (G && !G->getGlobal()->hasHiddenVisibility() &&
   2882           !G->getGlobal()->hasProtectedVisibility())
   2883         Callee = LowerGlobalAddress(Callee, DAG);
   2884       else if (isa<ExternalSymbolSDNode>(Callee))
   2885         Callee = LowerExternalSymbol(Callee, DAG);
   2886     }
   2887   }
   2888 
   2889   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
   2890     // From AMD64 ABI document:
   2891     // For calls that may call functions that use varargs or stdargs
   2892     // (prototype-less calls or calls to functions containing ellipsis (...) in
    2893     // the declaration), %al is used as a hidden argument to specify the number
    2894     // of SSE registers used. The contents of %al do not need to match exactly
    2895     // the number of registers, but must be an upper bound on the number of SSE
    2896     // registers used, and must be in the range 0 - 8 inclusive.
   2897 
   2898     // Count the number of XMM registers allocated.
   2899     static const MCPhysReg XMMArgRegs[] = {
   2900       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2901       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2902     };
   2903     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
   2904     assert((Subtarget->hasSSE1() || !NumXMMRegs)
   2905            && "SSE registers cannot be used when SSE is disabled");
   2906 
   2907     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
   2908                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   2909   }
   2910 
   2911   if (isVarArg && IsMustTail) {
   2912     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
   2913     for (const auto &F : Forwards) {
   2914       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
   2915       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
   2916     }
   2917   }
   2918 
   2919   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
   2920   // don't need this because the eligibility check rejects calls that require
   2921   // shuffling arguments passed in memory.
   2922   if (!IsSibcall && isTailCall) {
   2923     // Force all the incoming stack arguments to be loaded from the stack
   2924     // before any new outgoing arguments are stored to the stack, because the
   2925     // outgoing stack slots may alias the incoming argument stack slots, and
   2926     // the alias isn't otherwise explicit. This is slightly more conservative
   2927     // than necessary, because it means that each store effectively depends
   2928     // on every argument instead of just those arguments it would clobber.
   2929     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   2930 
   2931     SmallVector<SDValue, 8> MemOpChains2;
   2932     SDValue FIN;
   2933     int FI = 0;
   2934     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   2935       CCValAssign &VA = ArgLocs[i];
   2936       if (VA.isRegLoc())
   2937         continue;
   2938       assert(VA.isMemLoc());
   2939       SDValue Arg = OutVals[i];
   2940       ISD::ArgFlagsTy Flags = Outs[i].Flags;
   2941       // Skip inalloca arguments.  They don't require any work.
   2942       if (Flags.isInAlloca())
   2943         continue;
   2944       // Create frame index.
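               // The offset is biased by FPDiff so that, together with the relocated
               // return address, the outgoing arguments line up where the callee expects
               // them in the caller's incoming argument area.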
   2945       int32_t Offset = VA.getLocMemOffset()+FPDiff;
   2946       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   2947       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
   2948       FIN = DAG.getFrameIndex(FI, getPointerTy());
   2949 
   2950       if (Flags.isByVal()) {
   2951         // Copy relative to framepointer.
   2952         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
   2953         if (!StackPtr.getNode())
   2954           StackPtr = DAG.getCopyFromReg(Chain, dl,
   2955                                         RegInfo->getStackRegister(),
   2956                                         getPointerTy());
   2957         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
   2958 
   2959         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   2960                                                          ArgChain,
   2961                                                          Flags, DAG, dl));
   2962       } else {
   2963         // Store relative to framepointer.
   2964         MemOpChains2.push_back(
   2965           DAG.getStore(ArgChain, dl, Arg, FIN,
   2966                        MachinePointerInfo::getFixedStack(FI),
   2967                        false, false, 0));
   2968       }
   2969     }
   2970 
   2971     if (!MemOpChains2.empty())
   2972       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
   2973 
   2974     // Store the return address to the appropriate stack slot.
   2975     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
   2976                                      getPointerTy(), RegInfo->getSlotSize(),
   2977                                      FPDiff, dl);
   2978   }
   2979 
   2980   // Build a sequence of copy-to-reg nodes chained together with token chain
   2981   // and flag operands which copy the outgoing args into registers.
   2982   SDValue InFlag;
   2983   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   2984     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   2985                              RegsToPass[i].second, InFlag);
   2986     InFlag = Chain.getValue(1);
   2987   }
   2988 
   2989   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
   2990     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   2991     // In the 64-bit large code model, we have to make all calls
   2992     // through a register, since the call instruction's 32-bit
   2993     // pc-relative offset may not be large enough to hold the whole
   2994     // address.
   2995   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
   2996     // If the callee is a GlobalAddress node (quite common, every direct call
   2997     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   2998     // it.
   2999     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
   3000 
    3001     // We should use an extra load for direct calls to dllimported functions in
   3002     // non-JIT mode.
   3003     const GlobalValue *GV = G->getGlobal();
   3004     if (!GV->hasDLLImportStorageClass()) {
   3005       unsigned char OpFlags = 0;
   3006       bool ExtraLoad = false;
   3007       unsigned WrapperKind = ISD::DELETED_NODE;
   3008 
   3009       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
    3010       // external symbols must go through the PLT in PIC mode.  If the symbol
   3011       // has hidden or protected visibility, or if it is static or local, then
   3012       // we don't need to use the PLT - we can directly call it.
   3013       if (Subtarget->isTargetELF() &&
   3014           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
   3015           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
   3016         OpFlags = X86II::MO_PLT;
   3017       } else if (Subtarget->isPICStyleStubAny() &&
   3018                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
   3019                  (!Subtarget->getTargetTriple().isMacOSX() ||
   3020                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   3021         // PC-relative references to external symbols should go through $stub,
   3022         // unless we're building with the leopard linker or later, which
   3023         // automatically synthesizes these stubs.
   3024         OpFlags = X86II::MO_DARWIN_STUB;
   3025       } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
   3026                  cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
   3027         // If the function is marked as non-lazy, generate an indirect call
   3028         // which loads from the GOT directly. This avoids runtime overhead
   3029         // at the cost of eager binding (and one extra byte of encoding).
   3030         OpFlags = X86II::MO_GOTPCREL;
   3031         WrapperKind = X86ISD::WrapperRIP;
   3032         ExtraLoad = true;
   3033       }
   3034 
   3035       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
   3036                                           G->getOffset(), OpFlags);
   3037 
   3038       // Add a wrapper if needed.
   3039       if (WrapperKind != ISD::DELETED_NODE)
   3040         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
   3041       // Add extra indirection if needed.
   3042       if (ExtraLoad)
   3043         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
   3044                              MachinePointerInfo::getGOT(),
   3045                              false, false, false, 0);
   3046     }
   3047   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   3048     unsigned char OpFlags = 0;
   3049 
   3050     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
   3051     // external symbols should go through the PLT.
   3052     if (Subtarget->isTargetELF() &&
   3053         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
   3054       OpFlags = X86II::MO_PLT;
   3055     } else if (Subtarget->isPICStyleStubAny() &&
   3056                (!Subtarget->getTargetTriple().isMacOSX() ||
   3057                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
   3058       // PC-relative references to external symbols should go through $stub,
   3059       // unless we're building with the leopard linker or later, which
   3060       // automatically synthesizes these stubs.
   3061       OpFlags = X86II::MO_DARWIN_STUB;
   3062     }
   3063 
   3064     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
   3065                                          OpFlags);
   3066   } else if (Subtarget->isTarget64BitILP32() &&
   3067              Callee->getValueType(0) == MVT::i32) {
    3068     // Zero-extend the 32-bit Callee address to 64 bits, as required by the x32 ABI.
   3069     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
   3070   }
   3071 
   3072   // Returns a chain & a flag for retval copy to use.
   3073   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   3074   SmallVector<SDValue, 8> Ops;
   3075 
   3076   if (!IsSibcall && isTailCall) {
   3077     Chain = DAG.getCALLSEQ_END(Chain,
   3078                                DAG.getIntPtrConstant(NumBytesToPop, true),
   3079                                DAG.getIntPtrConstant(0, true), InFlag, dl);
   3080     InFlag = Chain.getValue(1);
   3081   }
   3082 
   3083   Ops.push_back(Chain);
   3084   Ops.push_back(Callee);
   3085 
   3086   if (isTailCall)
   3087     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
   3088 
   3089   // Add argument registers to the end of the list so that they are known live
   3090   // into the call.
   3091   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   3092     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   3093                                   RegsToPass[i].second.getValueType()));
   3094 
   3095   // Add a register mask operand representing the call-preserved registers.
   3096   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   3097   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
   3098   assert(Mask && "Missing call preserved mask for calling convention");
   3099   Ops.push_back(DAG.getRegisterMask(Mask));
   3100 
   3101   if (InFlag.getNode())
   3102     Ops.push_back(InFlag);
   3103 
   3104   if (isTailCall) {
   3105     // We used to do:
   3106     //// If this is the first return lowered for this function, add the regs
   3107     //// to the liveout set for the function.
   3108     // This isn't right, although it's probably harmless on x86; liveouts
   3109     // should be computed from returns not tail calls.  Consider a void
   3110     // function making a tail call to a function returning int.
   3111     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
   3112   }
   3113 
   3114   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   3115   InFlag = Chain.getValue(1);
   3116 
   3117   // Create the CALLSEQ_END node.
   3118   unsigned NumBytesForCalleeToPop;
   3119   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   3120                        DAG.getTarget().Options.GuaranteedTailCallOpt))
   3121     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
   3122   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
   3123            !Subtarget->getTargetTriple().isOSMSVCRT() &&
   3124            SR == StackStructReturn)
   3125     // If this is a call to a struct-return function, the callee
   3126     // pops the hidden struct pointer, so we have to push it back.
   3127     // This is common for Darwin/X86, Linux & Mingw32 targets.
   3128     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   3129     NumBytesForCalleeToPop = 4;
   3130   else
   3131     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
   3132 
   3133   // Returns a flag for retval copy to use.
   3134   if (!IsSibcall) {
   3135     Chain = DAG.getCALLSEQ_END(Chain,
   3136                                DAG.getIntPtrConstant(NumBytesToPop, true),
   3137                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
   3138                                                      true),
   3139                                InFlag, dl);
   3140     InFlag = Chain.getValue(1);
   3141   }
   3142 
   3143   // Handle result values, copying them out of physregs into vregs that we
   3144   // return.
   3145   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
   3146                          Ins, dl, DAG, InVals);
   3147 }
   3148 
   3149 //===----------------------------------------------------------------------===//
   3150 //                Fast Calling Convention (tail call) implementation
   3151 //===----------------------------------------------------------------------===//
   3152 
    3153 //  Like the StdCall convention, the callee cleans up the arguments, except that
    3154 //  ECX is reserved for storing the address of the tail-called function. Only 2
    3155 //  registers are free for argument passing (inreg). Tail call optimization is performed
   3156 //  provided:
   3157 //                * tailcallopt is enabled
   3158 //                * caller/callee are fastcc
    3159 //  On the X86_64 architecture with GOT-style position-independent code, only
    3160 //  local (within-module) calls are supported at the moment.
    3161 //  To keep the stack aligned according to the platform ABI, the function
    3162 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
    3163 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
    3164 //  for example.) If a tail-called callee has more arguments than the caller, the
    3165 //  caller needs to make sure that there is room to move the RETADDR to. This is
    3166 //  achieved by reserving an area the size of the argument delta right after the
    3167 //  original RETADDR, but before the saved frame pointer or the spilled registers,
   3168 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
   3169 //  stack layout:
   3170 //    arg1
   3171 //    arg2
   3172 //    RETADDR
   3173 //    [ new RETADDR
   3174 //      move area ]
   3175 //    (possible EBP)
   3176 //    ESI
   3177 //    EDI
   3178 //    local1 ..
   3179 
    3180 /// GetAlignedArgumentStackSize - Round up the stack size so that the argument
    3181 /// area keeps the required alignment, e.g. 16n + 12 for a 16-byte alignment.
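         /// For example, with StackAlignment = 16 and SlotSize = 4, a StackSize of 20
         /// becomes 28 (16*1 + 12) and a StackSize of 30 becomes 44 (16*2 + 12), so the
         /// stack stays 16-byte aligned once the return-address slot is accounted for.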
   3182 unsigned
   3183 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   3184                                                SelectionDAG& DAG) const {
   3185   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   3186   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   3187   unsigned StackAlignment = TFI.getStackAlignment();
   3188   uint64_t AlignMask = StackAlignment - 1;
   3189   int64_t Offset = StackSize;
   3190   unsigned SlotSize = RegInfo->getSlotSize();
   3191   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    3192     // The low bits are at most StackAlignment - SlotSize (e.g. 12); just add the difference.
   3193     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
   3194   } else {
    3195     // Mask out the low bits, then add one full StackAlignment plus StackAlignment - SlotSize (e.g. 12 bytes).
   3196     Offset = ((~AlignMask) & Offset) + StackAlignment +
   3197       (StackAlignment-SlotSize);
   3198   }
   3199   return Offset;
   3200 }
   3201 
   3202 /// MatchingStackOffset - Return true if the given stack call argument is
    3203 /// already available at the same (relative) position in the caller's
   3204 /// incoming argument stack.
   3205 static
   3206 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   3207                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
   3208                          const X86InstrInfo *TII) {
   3209   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
   3210   int FI = INT_MAX;
   3211   if (Arg.getOpcode() == ISD::CopyFromReg) {
   3212     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   3213     if (!TargetRegisterInfo::isVirtualRegister(VR))
   3214       return false;
   3215     MachineInstr *Def = MRI->getVRegDef(VR);
   3216     if (!Def)
   3217       return false;
   3218     if (!Flags.isByVal()) {
   3219       if (!TII->isLoadFromStackSlot(Def, FI))
   3220         return false;
   3221     } else {
   3222       unsigned Opcode = Def->getOpcode();
   3223       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
   3224            Opcode == X86::LEA64_32r) &&
   3225           Def->getOperand(1).isFI()) {
   3226         FI = Def->getOperand(1).getIndex();
   3227         Bytes = Flags.getByValSize();
   3228       } else
   3229         return false;
   3230     }
   3231   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   3232     if (Flags.isByVal())
   3233       // ByVal argument is passed in as a pointer but it's now being
   3234       // dereferenced. e.g.
   3235       // define @foo(%struct.X* %A) {
   3236       //   tail call @bar(%struct.X* byval %A)
   3237       // }
   3238       return false;
   3239     SDValue Ptr = Ld->getBasePtr();
   3240     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   3241     if (!FINode)
   3242       return false;
   3243     FI = FINode->getIndex();
   3244   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   3245     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   3246     FI = FINode->getIndex();
   3247     Bytes = Flags.getByValSize();
   3248   } else
   3249     return false;
   3250 
   3251   assert(FI != INT_MAX);
   3252   if (!MFI->isFixedObjectIndex(FI))
   3253     return false;
   3254   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
   3255 }
   3256 
   3257 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
   3258 /// for tail call optimization. Targets which want to do tail call
   3259 /// optimization should implement this function.
   3260 bool
   3261 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   3262                                                      CallingConv::ID CalleeCC,
   3263                                                      bool isVarArg,
   3264                                                      bool isCalleeStructRet,
   3265                                                      bool isCallerStructRet,
   3266                                                      Type *RetTy,
   3267                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
   3268                                     const SmallVectorImpl<SDValue> &OutVals,
   3269                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   3270                                                      SelectionDAG &DAG) const {
   3271   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
   3272     return false;
   3273 
   3274   // If -tailcallopt is specified, make fastcc functions tail-callable.
   3275   const MachineFunction &MF = DAG.getMachineFunction();
   3276   const Function *CallerF = MF.getFunction();
   3277 
   3278   // If the function return type is x86_fp80 and the callee return type is not,
   3279   // then the FP_EXTEND of the call result is not a nop. It's not safe to
   3280   // perform a tailcall optimization here.
   3281   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
   3282     return false;
   3283 
   3284   CallingConv::ID CallerCC = CallerF->getCallingConv();
   3285   bool CCMatch = CallerCC == CalleeCC;
   3286   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
   3287   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
   3288 
   3289   // Win64 functions have extra shadow space for argument homing. Don't do the
   3290   // sibcall if the caller and callee have mismatched expectations for this
   3291   // space.
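          // (The Win64 ABI reserves 32 bytes of shadow space, 8 bytes for each of
          // RCX, RDX, R8 and R9, which the callee may use to home its register
          // arguments.)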
   3292   if (IsCalleeWin64 != IsCallerWin64)
   3293     return false;
   3294 
   3295   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
   3296     if (IsTailCallConvention(CalleeCC) && CCMatch)
   3297       return true;
   3298     return false;
   3299   }
   3300 
   3301   // Look for obvious safe cases to perform tail call optimization that do not
   3302   // require ABI changes. This is what gcc calls sibcall.
   3303 
   3304   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   3305   // emit a special epilogue.
   3306   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   3307   if (RegInfo->needsStackRealignment(MF))
   3308     return false;
   3309 
   3310   // Also avoid sibcall optimization if either caller or callee uses struct
   3311   // return semantics.
   3312   if (isCalleeStructRet || isCallerStructRet)
   3313     return false;
   3314 
   3315   // An stdcall/thiscall caller is expected to clean up its arguments; the
   3316   // callee isn't going to do that.
   3317   // FIXME: this is more restrictive than needed. We could produce a tailcall
   3318   // when the stack adjustment matches. For example, with a thiscall that takes
   3319   // only one argument.
   3320   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
   3321                    CallerCC == CallingConv::X86_ThisCall))
   3322     return false;
   3323 
   3324   // Do not sibcall optimize vararg calls unless all arguments are passed via
   3325   // registers.
   3326   if (isVarArg && !Outs.empty()) {
   3327 
   3328     // Optimizing for varargs on Win64 is unlikely to be safe without
   3329     // additional testing.
   3330     if (IsCalleeWin64 || IsCallerWin64)
   3331       return false;
   3332 
   3333     SmallVector<CCValAssign, 16> ArgLocs;
   3334     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
   3335                    *DAG.getContext());
   3336 
   3337     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3338     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   3339       if (!ArgLocs[i].isRegLoc())
   3340         return false;
   3341   }
   3342 
   3343   // If the call result is in ST0 / ST1, it needs to be popped off the x87
   3344   // stack.  Therefore, if it's not used by the caller it is not safe to optimize
   3345   // this into a sibcall.
   3346   bool Unused = false;
   3347   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   3348     if (!Ins[i].Used) {
   3349       Unused = true;
   3350       break;
   3351     }
   3352   }
   3353   if (Unused) {
   3354     SmallVector<CCValAssign, 16> RVLocs;
   3355     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
   3356                    *DAG.getContext());
   3357     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   3358     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   3359       CCValAssign &VA = RVLocs[i];
   3360       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
   3361         return false;
   3362     }
   3363   }
   3364 
   3365   // If the calling conventions do not match, then we'd better make sure the
   3366   // results are returned in the same way as what the caller expects.
   3367   if (!CCMatch) {
   3368     SmallVector<CCValAssign, 16> RVLocs1;
   3369     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
   3370                     *DAG.getContext());
   3371     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
   3372 
   3373     SmallVector<CCValAssign, 16> RVLocs2;
   3374     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
   3375                     *DAG.getContext());
   3376     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
   3377 
   3378     if (RVLocs1.size() != RVLocs2.size())
   3379       return false;
   3380     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
   3381       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
   3382         return false;
   3383       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
   3384         return false;
   3385       if (RVLocs1[i].isRegLoc()) {
   3386         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
   3387           return false;
   3388       } else {
   3389         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
   3390           return false;
   3391       }
   3392     }
   3393   }
   3394 
   3395   // If the callee takes no arguments then go on to check the results of the
   3396   // call.
   3397   if (!Outs.empty()) {
   3398     // Check if stack adjustment is needed. For now, do not do this if any
   3399     // argument is passed on the stack.
   3400     SmallVector<CCValAssign, 16> ArgLocs;
   3401     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
   3402                    *DAG.getContext());
   3403 
   3404     // Allocate shadow area for Win64
   3405     if (IsCalleeWin64)
   3406       CCInfo.AllocateStack(32, 8);
   3407 
   3408     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   3409     if (CCInfo.getNextStackOffset()) {
   3410       MachineFunction &MF = DAG.getMachineFunction();
   3411       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
   3412         return false;
   3413 
   3414       // Check if the arguments are already laid out in the right way as
   3415       // the caller's fixed stack objects.
   3416       MachineFrameInfo *MFI = MF.getFrameInfo();
   3417       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   3418       const X86InstrInfo *TII = Subtarget->getInstrInfo();
   3419       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3420         CCValAssign &VA = ArgLocs[i];
   3421         SDValue Arg = OutVals[i];
   3422         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   3423         if (VA.getLocInfo() == CCValAssign::Indirect)
   3424           return false;
   3425         if (!VA.isRegLoc()) {
   3426           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   3427                                    MFI, MRI, TII))
   3428             return false;
   3429         }
   3430       }
   3431     }
   3432 
   3433     // If the tailcall address may be in a register, then make sure it's
   3434     // possible to register allocate for it. In 32-bit, the call address can
   3435     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   3436     // callee-saved registers are restored. These happen to be the same
   3437     // registers used to pass 'inreg' arguments so watch out for those.
   3438     if (!Subtarget->is64Bit() &&
   3439         ((!isa<GlobalAddressSDNode>(Callee) &&
   3440           !isa<ExternalSymbolSDNode>(Callee)) ||
   3441          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
   3442       unsigned NumInRegs = 0;
   3443       // In PIC we need an extra register to formulate the address computation
   3444       // for the callee.
   3445       unsigned MaxInRegs =
   3446         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
   3447 
   3448       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   3449         CCValAssign &VA = ArgLocs[i];
   3450         if (!VA.isRegLoc())
   3451           continue;
   3452         unsigned Reg = VA.getLocReg();
   3453         switch (Reg) {
   3454         default: break;
   3455         case X86::EAX: case X86::EDX: case X86::ECX:
   3456           if (++NumInRegs == MaxInRegs)
   3457             return false;
   3458           break;
   3459         }
   3460       }
   3461     }
   3462   }
   3463 
   3464   return true;
   3465 }
   3466 
   3467 FastISel *
   3468 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
   3469                                   const TargetLibraryInfo *libInfo) const {
   3470   return X86::createFastISel(funcInfo, libInfo);
   3471 }
   3472 
   3473 //===----------------------------------------------------------------------===//
   3474 //                           Other Lowering Hooks
   3475 //===----------------------------------------------------------------------===//
   3476 
   3477 static bool MayFoldLoad(SDValue Op) {
   3478   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   3479 }
   3480 
   3481 static bool MayFoldIntoStore(SDValue Op) {
   3482   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   3483 }
   3484 
   3485 static bool isTargetShuffle(unsigned Opcode) {
   3486   switch(Opcode) {
   3487   default: return false;
   3488   case X86ISD::BLENDI:
   3489   case X86ISD::PSHUFB:
   3490   case X86ISD::PSHUFD:
   3491   case X86ISD::PSHUFHW:
   3492   case X86ISD::PSHUFLW:
   3493   case X86ISD::SHUFP:
   3494   case X86ISD::PALIGNR:
   3495   case X86ISD::MOVLHPS:
   3496   case X86ISD::MOVLHPD:
   3497   case X86ISD::MOVHLPS:
   3498   case X86ISD::MOVLPS:
   3499   case X86ISD::MOVLPD:
   3500   case X86ISD::MOVSHDUP:
   3501   case X86ISD::MOVSLDUP:
   3502   case X86ISD::MOVDDUP:
   3503   case X86ISD::MOVSS:
   3504   case X86ISD::MOVSD:
   3505   case X86ISD::UNPCKL:
   3506   case X86ISD::UNPCKH:
   3507   case X86ISD::VPERMILPI:
   3508   case X86ISD::VPERM2X128:
   3509   case X86ISD::VPERMI:
   3510     return true;
   3511   }
   3512 }
   3513 
   3514 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3515                                     SDValue V1, unsigned TargetMask,
   3516                                     SelectionDAG &DAG) {
   3517   switch(Opc) {
   3518   default: llvm_unreachable("Unknown x86 shuffle node");
   3519   case X86ISD::PSHUFD:
   3520   case X86ISD::PSHUFHW:
   3521   case X86ISD::PSHUFLW:
   3522   case X86ISD::VPERMILPI:
   3523   case X86ISD::VPERMI:
   3524     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   3525   }
   3526 }
   3527 
   3528 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
   3529                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   3530   switch(Opc) {
   3531   default: llvm_unreachable("Unknown x86 shuffle node");
   3532   case X86ISD::MOVLHPS:
   3533   case X86ISD::MOVLHPD:
   3534   case X86ISD::MOVHLPS:
   3535   case X86ISD::MOVLPS:
   3536   case X86ISD::MOVLPD:
   3537   case X86ISD::MOVSS:
   3538   case X86ISD::MOVSD:
   3539   case X86ISD::UNPCKL:
   3540   case X86ISD::UNPCKH:
   3541     return DAG.getNode(Opc, dl, VT, V1, V2);
   3542   }
   3543 }
   3544 
   3545 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   3546   MachineFunction &MF = DAG.getMachineFunction();
   3547   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   3548   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   3549   int ReturnAddrIndex = FuncInfo->getRAIndex();
   3550 
   3551   if (ReturnAddrIndex == 0) {
   3552     // Set up a frame object for the return address.
   3553     unsigned SlotSize = RegInfo->getSlotSize();
   3554     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
   3555                                                            -(int64_t)SlotSize,
   3556                                                            false);
   3557     FuncInfo->setRAIndex(ReturnAddrIndex);
   3558   }
   3559 
   3560   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
   3561 }
   3562 
   3563 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   3564                                        bool hasSymbolicDisplacement) {
   3565   // The offset should fit into a 32-bit immediate field.
   3566   if (!isInt<32>(Offset))
   3567     return false;
   3568 
   3569   // If we don't have a symbolic displacement, we don't have any extra
   3570   // restrictions.
   3571   if (!hasSymbolicDisplacement)
   3572     return true;
   3573 
   3574   // FIXME: Some tweaks might be needed for medium code model.
   3575   if (M != CodeModel::Small && M != CodeModel::Kernel)
   3576     return false;
   3577 
   3578   // For the small code model we assume that the latest object is 16MB below the
   3579   // end of the 31-bit boundary. We may also accept pretty large negative constants,
   3580   // knowing that all objects are in the positive half of the address space.
   3581   if (M == CodeModel::Small && Offset < 16*1024*1024)
   3582     return true;
   3583 
   3584   // For the kernel code model we know that all objects reside in the negative
   3585   // half of the 32-bit address space. We cannot accept negative offsets, since
   3586   // they may be just out of range, but we may accept pretty large positive ones.
   3587   if (M == CodeModel::Kernel && Offset >= 0)
   3588     return true;
   3589 
   3590   return false;
   3591 }
   3592 
   3593 /// isCalleePop - Determines whether the callee is required to pop its
   3594 /// own arguments. Callee pop is necessary to support tail calls.
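        /// For example, a 32-bit stdcall callee taking 8 bytes of arguments returns
        /// with 'ret 8' and pops them itself, whereas a cdecl callee uses a plain
        /// 'ret' and leaves the cleanup to the caller.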
   3595 bool X86::isCalleePop(CallingConv::ID CallingConv,
   3596                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
   3597   switch (CallingConv) {
   3598   default:
   3599     return false;
   3600   case CallingConv::X86_StdCall:
   3601   case CallingConv::X86_FastCall:
   3602   case CallingConv::X86_ThisCall:
   3603     return !is64Bit;
   3604   case CallingConv::Fast:
   3605   case CallingConv::GHC:
   3606   case CallingConv::HiPE:
   3607     if (IsVarArg)
   3608       return false;
   3609     return TailCallOpt;
   3610   }
   3611 }
   3612 
   3613 /// \brief Return true if the condition is an unsigned comparison operation.
   3614 static bool isX86CCUnsigned(unsigned X86CC) {
   3615   switch (X86CC) {
   3616   default: llvm_unreachable("Invalid integer condition!");
   3617   case X86::COND_E:     return true;
   3618   case X86::COND_G:     return false;
   3619   case X86::COND_GE:    return false;
   3620   case X86::COND_L:     return false;
   3621   case X86::COND_LE:    return false;
   3622   case X86::COND_NE:    return true;
   3623   case X86::COND_B:     return true;
   3624   case X86::COND_A:     return true;
   3625   case X86::COND_BE:    return true;
   3626   case X86::COND_AE:    return true;
   3627   }
   3628   llvm_unreachable("covered switch fell through?!");
   3629 }
   3630 
   3631 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
   3632 /// specific condition code, returning the condition code and the LHS/RHS of the
   3633 /// comparison to make.
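        /// For example, (setult X, Y) yields X86::COND_B, while (setgt X, -1) is
        /// rewritten below as a sign test: RHS is replaced with 0 and X86::COND_NS
        /// is returned.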
   3634 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
   3635                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
   3636   if (!isFP) {
   3637     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   3638       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
   3639         // X > -1   -> X == 0, jump !sign.
   3640         RHS = DAG.getConstant(0, RHS.getValueType());
   3641         return X86::COND_NS;
   3642       }
   3643       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
   3644         // X < 0   -> X == 0, jump on sign.
   3645         return X86::COND_S;
   3646       }
   3647       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   3648         // X < 1   -> X <= 0
   3649         RHS = DAG.getConstant(0, RHS.getValueType());
   3650         return X86::COND_LE;
   3651       }
   3652     }
   3653 
   3654     switch (SetCCOpcode) {
   3655     default: llvm_unreachable("Invalid integer condition!");
   3656     case ISD::SETEQ:  return X86::COND_E;
   3657     case ISD::SETGT:  return X86::COND_G;
   3658     case ISD::SETGE:  return X86::COND_GE;
   3659     case ISD::SETLT:  return X86::COND_L;
   3660     case ISD::SETLE:  return X86::COND_LE;
   3661     case ISD::SETNE:  return X86::COND_NE;
   3662     case ISD::SETULT: return X86::COND_B;
   3663     case ISD::SETUGT: return X86::COND_A;
   3664     case ISD::SETULE: return X86::COND_BE;
   3665     case ISD::SETUGE: return X86::COND_AE;
   3666     }
   3667   }
   3668 
   3669   // First determine if it is required or is profitable to flip the operands.
   3670 
   3671   // If LHS is a foldable load, but RHS is not, flip the condition.
   3672   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   3673       !ISD::isNON_EXTLoad(RHS.getNode())) {
   3674     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   3675     std::swap(LHS, RHS);
   3676   }
   3677 
   3678   switch (SetCCOpcode) {
   3679   default: break;
   3680   case ISD::SETOLT:
   3681   case ISD::SETOLE:
   3682   case ISD::SETUGT:
   3683   case ISD::SETUGE:
   3684     std::swap(LHS, RHS);
   3685     break;
   3686   }
   3687 
   3688   // On a floating point condition, the flags are set as follows:
   3689   // ZF  PF  CF   op
   3690   //  0 | 0 | 0 | X > Y
   3691   //  0 | 0 | 1 | X < Y
   3692   //  1 | 0 | 0 | X == Y
   3693   //  1 | 1 | 1 | unordered
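          // For example, SETOGT maps to COND_A ('above': CF = 0 and ZF = 0), which
          // per the table above holds only in the ordered X > Y case.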
   3694   switch (SetCCOpcode) {
   3695   default: llvm_unreachable("Condcode should be pre-legalized away");
   3696   case ISD::SETUEQ:
   3697   case ISD::SETEQ:   return X86::COND_E;
   3698   case ISD::SETOLT:              // flipped
   3699   case ISD::SETOGT:
   3700   case ISD::SETGT:   return X86::COND_A;
   3701   case ISD::SETOLE:              // flipped
   3702   case ISD::SETOGE:
   3703   case ISD::SETGE:   return X86::COND_AE;
   3704   case ISD::SETUGT:              // flipped
   3705   case ISD::SETULT:
   3706   case ISD::SETLT:   return X86::COND_B;
   3707   case ISD::SETUGE:              // flipped
   3708   case ISD::SETULE:
   3709   case ISD::SETLE:   return X86::COND_BE;
   3710   case ISD::SETONE:
   3711   case ISD::SETNE:   return X86::COND_NE;
   3712   case ISD::SETUO:   return X86::COND_P;
   3713   case ISD::SETO:    return X86::COND_NP;
   3714   case ISD::SETOEQ:
   3715   case ISD::SETUNE:  return X86::COND_INVALID;
   3716   }
   3717 }
   3718 
   3719 /// hasFPCMov - Is there a floating-point cmov for the specific X86 condition
   3720 /// code? The current x86 ISA includes the following FP cmov instructions:
   3721 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   3722 static bool hasFPCMov(unsigned X86CC) {
   3723   switch (X86CC) {
   3724   default:
   3725     return false;
   3726   case X86::COND_B:
   3727   case X86::COND_BE:
   3728   case X86::COND_E:
   3729   case X86::COND_P:
   3730   case X86::COND_A:
   3731   case X86::COND_AE:
   3732   case X86::COND_NE:
   3733   case X86::COND_NP:
   3734     return true;
   3735   }
   3736 }
   3737 
   3738 /// isFPImmLegal - Returns true if the target can instruction select the
   3739 /// specified FP immediate natively. If false, the legalizer will
   3740 /// materialize the FP immediate as a load from a constant pool.
   3741 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   3742   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   3743     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   3744       return true;
   3745   }
   3746   return false;
   3747 }
   3748 
   3749 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
   3750                                               ISD::LoadExtType ExtTy,
   3751                                               EVT NewVT) const {
   3752   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
   3753   // relocation must target a movq or addq instruction: don't let the load shrink.
   3754   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
   3755   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
   3756     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
   3757       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
   3758   return true;
   3759 }
   3760 
   3761 /// \brief Returns true if it is beneficial to convert a load of a constant
   3762 /// to just the constant itself.
   3763 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   3764                                                           Type *Ty) const {
   3765   assert(Ty->isIntegerTy());
   3766 
   3767   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   3768   if (BitSize == 0 || BitSize > 64)
   3769     return false;
   3770   return true;
   3771 }
   3772 
   3773 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
   3774                                                 unsigned Index) const {
   3775   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
   3776     return false;
   3777 
   3778   return (Index == 0 || Index == ResVT.getVectorNumElements());
   3779 }
   3780 
   3781 bool X86TargetLowering::isCheapToSpeculateCttz() const {
   3782   // Speculate cttz only if we can directly use TZCNT.
   3783   return Subtarget->hasBMI();
   3784 }
   3785 
   3786 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
   3787   // Speculate ctlz only if we can directly use LZCNT.
   3788   return Subtarget->hasLZCNT();
   3789 }
   3790 
   3791 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
   3792 /// the specified range [Low, Hi).
   3793 static bool isUndefOrInRange(int Val, int Low, int Hi) {
   3794   return (Val < 0) || (Val >= Low && Val < Hi);
   3795 }
   3796 
   3797 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
   3798 /// specified value.
   3799 static bool isUndefOrEqual(int Val, int CmpVal) {
   3800   return (Val < 0 || Val == CmpVal);
   3801 }
   3802 
   3803 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
   3804 /// from position Pos and ending at Pos+Size-1, falls within the specified
   3805 /// sequential range [Low, Low+Size) or is undef.
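        /// For example, Mask = <4, -1, 6, 7> with Pos = 0, Size = 4 and Low = 4
        /// returns true.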
   3806 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
   3807                                        unsigned Pos, unsigned Size, int Low) {
   3808   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
   3809     if (!isUndefOrEqual(Mask[i], Low))
   3810       return false;
   3811   return true;
   3812 }
   3813 
   3814 /// isVEXTRACTIndex - Return true if the specified
   3815 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
   3816 /// suitable for instructions that extract 128- or 256-bit vectors.
   3817 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
   3818   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   3819   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   3820     return false;
   3821 
   3822   // The index should be aligned on a vecWidth-bit boundary.
   3823   uint64_t Index =
   3824     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   3825 
   3826   MVT VT = N->getSimpleValueType(0);
   3827   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   3828   bool Result = (Index * ElSize) % vecWidth == 0;
   3829 
   3830   return Result;
   3831 }
   3832 
   3833 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
   3834 /// operand specifies a subvector insert that is suitable for input to
   3835 /// insertion of 128- or 256-bit subvectors.
   3836 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
   3837   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   3838   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   3839     return false;
   3840   // The index should be aligned on a vecWidth-bit boundary.
   3841   uint64_t Index =
   3842     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   3843 
   3844   MVT VT = N->getSimpleValueType(0);
   3845   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   3846   bool Result = (Index * ElSize) % vecWidth == 0;
   3847 
   3848   return Result;
   3849 }
   3850 
   3851 bool X86::isVINSERT128Index(SDNode *N) {
   3852   return isVINSERTIndex(N, 128);
   3853 }
   3854 
   3855 bool X86::isVINSERT256Index(SDNode *N) {
   3856   return isVINSERTIndex(N, 256);
   3857 }
   3858 
   3859 bool X86::isVEXTRACT128Index(SDNode *N) {
   3860   return isVEXTRACTIndex(N, 128);
   3861 }
   3862 
   3863 bool X86::isVEXTRACT256Index(SDNode *N) {
   3864   return isVEXTRACTIndex(N, 256);
   3865 }
   3866 
   3867 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   3868   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   3869   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
   3870     llvm_unreachable("Illegal extract subvector for VEXTRACT");
   3871 
   3872   uint64_t Index =
   3873     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
   3874 
   3875   MVT VecVT = N->getOperand(0).getSimpleValueType();
   3876   MVT ElVT = VecVT.getVectorElementType();
   3877 
   3878   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   3879   return Index / NumElemsPerChunk;
   3880 }
   3881 
   3882 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
   3883   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   3884   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
   3885     llvm_unreachable("Illegal insert subvector for VINSERT");
   3886 
   3887   uint64_t Index =
   3888     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
   3889 
   3890   MVT VecVT = N->getSimpleValueType(0);
   3891   MVT ElVT = VecVT.getVectorElementType();
   3892 
   3893   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   3894   return Index / NumElemsPerChunk;
   3895 }
   3896 
   3897 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
   3898 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
   3899 /// and VEXTRACTI128 instructions.
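        /// For example, extracting from a v8i32 source at element index 4: each
        /// 128-bit chunk holds 128/32 = 4 elements, so the immediate is 4/4 = 1
        /// (the upper half).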
   3900 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
   3901   return getExtractVEXTRACTImmediate(N, 128);
   3902 }
   3903 
   3904 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
   3905 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
   3906 /// and VEXTRACTI64x4 instructions.
   3907 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
   3908   return getExtractVEXTRACTImmediate(N, 256);
   3909 }
   3910 
   3911 /// getInsertVINSERT128Immediate - Return the appropriate immediate
   3912 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
   3913 /// and VINSERTI128 instructions.
   3914 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
   3915   return getInsertVINSERTImmediate(N, 128);
   3916 }
   3917 
   3918 /// getInsertVINSERT256Immediate - Return the appropriate immediate
   3919 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
   3920 /// and VINSERTI64x4 instructions.
   3921 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   3922   return getInsertVINSERTImmediate(N, 256);
   3923 }
   3924 
   3925 /// isZero - Returns true if V is a constant integer zero.
   3926 static bool isZero(SDValue V) {
   3927   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   3928   return C && C->isNullValue();
   3929 }
   3930 
   3931 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
   3932 /// constant +0.0.
   3933 bool X86::isZeroNode(SDValue Elt) {
   3934   if (isZero(Elt))
   3935     return true;
   3936   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
   3937     return CFP->getValueAPF().isPosZero();
   3938   return false;
   3939 }
   3940 
   3941 /// getZeroVector - Returns a vector of specified type with all zero elements.
   3942 ///
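        /// For example, a v4f32 zero is built as a v4i32 BUILD_VECTOR of zero
        /// constants (when SSE2 is available) and bitcast back to v4f32, so repeated
        /// requests CSE to a single node.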
   3943 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
   3944                              SelectionDAG &DAG, SDLoc dl) {
   3945   assert(VT.isVector() && "Expected a vector type");
   3946 
   3947   // Always build SSE zero vectors as <4 x i32> bitcasted
   3948   // to their dest type. This ensures they get CSE'd.
   3949   SDValue Vec;
   3950   if (VT.is128BitVector()) {  // SSE
   3951     if (Subtarget->hasSSE2()) {  // SSE2
   3952       SDValue Cst = DAG.getConstant(0, MVT::i32);
   3953       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   3954     } else { // SSE1
   3955       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
   3956       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
   3957     }
   3958   } else if (VT.is256BitVector()) { // AVX
   3959     if (Subtarget->hasInt256()) { // AVX2
   3960       SDValue Cst = DAG.getConstant(0, MVT::i32);
   3961       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   3962       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
   3963     } else {
   3964       // 256-bit logic and arithmetic instructions in AVX are all
   3965       // floating-point; there is no support for integer ops. Emit fp zero vectors.
   3966       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
   3967       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   3968       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
   3969     }
   3970   } else if (VT.is512BitVector()) { // AVX-512
   3971       SDValue Cst = DAG.getConstant(0, MVT::i32);
   3972       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
   3973                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   3974       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
   3975   } else if (VT.getScalarType() == MVT::i1) {
   3976 
   3977     assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
   3978             && "Unexpected vector type");
   3979     assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
   3980             && "Unexpected vector type");
   3981     SDValue Cst = DAG.getConstant(0, MVT::i1);
   3982     SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
   3983     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   3984   } else
   3985     llvm_unreachable("Unexpected vector type");
   3986 
   3987   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   3988 }
   3989 
   3990 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
   3991                                 SelectionDAG &DAG, SDLoc dl,
   3992                                 unsigned vectorWidth) {
   3993   assert((vectorWidth == 128 || vectorWidth == 256) &&
   3994          "Unsupported vector width");
   3995   EVT VT = Vec.getValueType();
   3996   EVT ElVT = VT.getVectorElementType();
   3997   unsigned Factor = VT.getSizeInBits()/vectorWidth;
   3998   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
   3999                                   VT.getVectorNumElements()/Factor);
   4000 
   4001   // Extract from UNDEF is UNDEF.
   4002   if (Vec.getOpcode() == ISD::UNDEF)
   4003     return DAG.getUNDEF(ResultVT);
   4004 
   4005   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
   4006   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
   4007 
   4008   // This is the index of the first element of the vectorWidth-bit chunk
   4009   // we want.
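          // For example, for a v8f32 source with IdxVal = 5 and vectorWidth = 128,
          // ElemsPerChunk = 4 and NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4, i.e.
          // the chunk starting at element 4 (the upper half).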
   4010   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
   4011                                * ElemsPerChunk);
   4012 
   4013   // If the input is a buildvector just emit a smaller one.
   4014   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
   4015     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
   4016                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
   4017                                     ElemsPerChunk));
   4018 
   4019   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   4020   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
   4021 }
   4022 
   4023 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
   4024 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
   4025 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
   4026 /// instructions or a simple subregister reference. Idx is an index in the
   4027 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
   4028 /// lowering EXTRACT_VECTOR_ELT operations easier.
   4029 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
   4030                                    SelectionDAG &DAG, SDLoc dl) {
   4031   assert((Vec.getValueType().is256BitVector() ||
   4032           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
   4033   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
   4034 }
   4035 
   4036 /// Generate a DAG to grab 256-bits from a 512-bit vector.
   4037 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
   4038                                    SelectionDAG &DAG, SDLoc dl) {
   4039   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
   4040   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
   4041 }
   4042 
   4043 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
   4044                                unsigned IdxVal, SelectionDAG &DAG,
   4045                                SDLoc dl, unsigned vectorWidth) {
   4046   assert((vectorWidth == 128 || vectorWidth == 256) &&
   4047          "Unsupported vector width");
   4048   // Inserting UNDEF just returns Result.
   4049   if (Vec.getOpcode() == ISD::UNDEF)
   4050     return Result;
   4051   EVT VT = Vec.getValueType();
   4052   EVT ElVT = VT.getVectorElementType();
   4053   EVT ResultVT = Result.getValueType();
   4054 
   4055   // Insert the relevant vectorWidth bits.
   4056   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
   4057 
   4058   // This is the index of the first element of the vectorWidth-bit chunk
   4059   // we want.
   4060   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
   4061                                * ElemsPerChunk);
   4062 
   4063   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   4064   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
   4065 }
   4066 
   4067 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
   4068 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
   4069 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
   4070 /// simple superregister reference.  Idx is an index in the 128 bits
   4071 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
   4072 /// lowering INSERT_VECTOR_ELT operations easier.
   4073 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   4074                                   SelectionDAG &DAG, SDLoc dl) {
   4075   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
   4076 
   4077   // For insertion into the zero index (low half) of a 256-bit vector, it is
   4078   // more efficient to generate a blend with immediate instead of an insert*128.
   4079   // We are still creating an INSERT_SUBVECTOR below with an undef node to
   4080   // extend the subvector to the size of the result vector. Make sure that
   4081   // we are not recursing on that node by checking for undef here.
   4082   if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
   4083       Result.getOpcode() != ISD::UNDEF) {
   4084     EVT ResultVT = Result.getValueType();
   4085     SDValue ZeroIndex = DAG.getIntPtrConstant(0);
   4086     SDValue Undef = DAG.getUNDEF(ResultVT);
   4087     SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
   4088                                  Vec, ZeroIndex);
   4089 
   4090     // The blend instruction, and therefore its mask, depend on the data type.
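            // For example, inserting into the low half of a v8f32 emits vblendps
            // with mask 0x0f (lanes 0-3 come from the inserted subvector, lanes 4-7
            // from Result); a v4f64 result uses vblendpd with mask 0x03.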
   4091     MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
   4092     if (ScalarType.isFloatingPoint()) {
   4093       // Choose either vblendps (float) or vblendpd (double).
   4094       unsigned ScalarSize = ScalarType.getSizeInBits();
   4095       assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
   4096       unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
   4097       SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
   4098       return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
   4099     }
   4100 
   4101     const X86Subtarget &Subtarget =
   4102         static_cast<const X86Subtarget &>(DAG.getSubtarget());
   4103 
   4104     // AVX2 is needed for 256-bit integer blend support.
   4105     // Integers must be cast to 32-bit because there is only vpblendd;
   4106     // vpblendw can't be used for this because it has a handicapped mask.
   4107 
   4108     // If we don't have AVX2, then cast to float. Using a wrong domain blend
   4109     // is still more efficient than using the wrong domain vinsertf128 that
   4110     // will be created by InsertSubVector().
   4111     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
   4112 
   4113     SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
   4114     Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
   4115     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
   4116     return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
   4117   }
   4118 
   4119   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
   4120 }
   4121 
   4122 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   4123                                   SelectionDAG &DAG, SDLoc dl) {
   4124   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
   4125   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
   4126 }
   4127 
   4128 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
   4129 /// instructions. This is used because creating CONCAT_VECTORS nodes of
   4130 /// BUILD_VECTORs returns a larger BUILD_VECTOR while we're trying to lower
   4131 /// large BUILD_VECTORs.
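        /// For example, two v4i32 vectors become a v8i32 by inserting V1 at index 0
        /// and V2 at index NumElems/2 = 4 of a v8i32 UNDEF.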
   4132 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
   4133                                    unsigned NumElems, SelectionDAG &DAG,
   4134                                    SDLoc dl) {
   4135   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
   4136   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
   4137 }
   4138 
   4139 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
   4140                                    unsigned NumElems, SelectionDAG &DAG,
   4141                                    SDLoc dl) {
   4142   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
   4143   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
   4144 }
   4145 
   4146 /// getOnesVector - Returns a vector of specified type with all bits set.
   4147 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
   4148 /// no AVX2 support, use two <4 x i32>s inserted into an <8 x i32> appropriately.
   4149 /// Then bitcast to their original type, ensuring they get CSE'd.
   4150 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
   4151                              SDLoc dl) {
   4152   assert(VT.isVector() && "Expected a vector type");
   4153 
   4154   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
   4155   SDValue Vec;
   4156   if (VT.is256BitVector()) {
   4157     if (HasInt256) { // AVX2
   4158       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
   4159       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
   4160     } else { // AVX
   4161       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4162       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
   4163     }
   4164   } else if (VT.is128BitVector()) {
   4165     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   4166   } else
   4167     llvm_unreachable("Unexpected vector type");
   4168 
   4169   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
   4170 }
   4171 
   4172 /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
   4173 /// operation of the specified width.
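        /// For example, for v4i32 the mask is <4, 1, 2, 3>: element 0 is taken from
        /// V2 and elements 1-3 from V1, matching MOVSS/MOVSD semantics.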
   4174 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
   4175                        SDValue V2) {
   4176   unsigned NumElems = VT.getVectorNumElements();
   4177   SmallVector<int, 8> Mask;
   4178   Mask.push_back(NumElems);
   4179   for (unsigned i = 1; i != NumElems; ++i)
   4180     Mask.push_back(i);
   4181   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4182 }
   4183 
   4184 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
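        /// For example, for v4i32 the mask is <0, 4, 1, 5>, interleaving the low
        /// halves of V1 and V2 (cf. PUNPCKLDQ/UNPCKLPS).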
   4185 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
   4186                           SDValue V2) {
   4187   unsigned NumElems = VT.getVectorNumElements();
   4188   SmallVector<int, 8> Mask;
   4189   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
   4190     Mask.push_back(i);
   4191     Mask.push_back(i + NumElems);
   4192   }
   4193   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4194 }
   4195 
   4196 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
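        /// For example, for v4i32 the mask is <2, 6, 3, 7>, interleaving the high
        /// halves of V1 and V2 (cf. PUNPCKHDQ/UNPCKHPS).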
   4197 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
   4198                           SDValue V2) {
   4199   unsigned NumElems = VT.getVectorNumElements();
   4200   SmallVector<int, 8> Mask;
   4201   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
   4202     Mask.push_back(i + Half);
   4203     Mask.push_back(i + NumElems + Half);
   4204   }
   4205   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
   4206 }
   4207 
   4208 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
   4209 /// vector and a zero or undef vector.  This produces a shuffle where the low
   4210 /// element of V2 is swizzled into the zero/undef vector, landing at element
   4211 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
   4212 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
   4213                                            bool IsZero,
   4214                                            const X86Subtarget *Subtarget,
   4215                                            SelectionDAG &DAG) {
   4216   MVT VT = V2.getSimpleValueType();
   4217   SDValue V1 = IsZero
   4218     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   4219   unsigned NumElems = VT.getVectorNumElements();
   4220   SmallVector<int, 16> MaskVec;
   4221   for (unsigned i = 0; i != NumElems; ++i)
   4222     // If this is the insertion idx, put the low elt of V2 here.
   4223     MaskVec.push_back(i == Idx ? NumElems : i);
   4224   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
   4225 }
   4226 
   4227 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
   4228 /// target specific opcode. Returns true if the Mask could be calculated. Sets
   4229 /// IsUnary to true if it uses only one source. Note that this will set IsUnary for
   4230 /// shuffles which use a single input multiple times, and in those cases it will
   4231 /// adjust the mask to only have indices within that single input.
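        /// For example, (X86ISD::UNPCKL V, V) on v4i32 decodes to <0, 4, 1, 5> and
        /// is remapped to the unary mask <0, 0, 1, 1>.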
   4232 static bool getTargetShuffleMask(SDNode *N, MVT VT,
   4233                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   4234   unsigned NumElems = VT.getVectorNumElements();
   4235   SDValue ImmN;
   4236 
   4237   IsUnary = false;
   4238   bool IsFakeUnary = false;
   4239   switch(N->getOpcode()) {
   4240   case X86ISD::BLENDI:
   4241     ImmN = N->getOperand(N->getNumOperands()-1);
   4242     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4243     break;
   4244   case X86ISD::SHUFP:
   4245     ImmN = N->getOperand(N->getNumOperands()-1);
   4246     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4247     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4248     break;
   4249   case X86ISD::UNPCKH:
   4250     DecodeUNPCKHMask(VT, Mask);
   4251     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4252     break;
   4253   case X86ISD::UNPCKL:
   4254     DecodeUNPCKLMask(VT, Mask);
   4255     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4256     break;
   4257   case X86ISD::MOVHLPS:
   4258     DecodeMOVHLPSMask(NumElems, Mask);
   4259     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4260     break;
   4261   case X86ISD::MOVLHPS:
   4262     DecodeMOVLHPSMask(NumElems, Mask);
   4263     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   4264     break;
   4265   case X86ISD::PALIGNR:
   4266     ImmN = N->getOperand(N->getNumOperands()-1);
   4267     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4268     break;
   4269   case X86ISD::PSHUFD:
   4270   case X86ISD::VPERMILPI:
   4271     ImmN = N->getOperand(N->getNumOperands()-1);
   4272     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4273     IsUnary = true;
   4274     break;
   4275   case X86ISD::PSHUFHW:
   4276     ImmN = N->getOperand(N->getNumOperands()-1);
   4277     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4278     IsUnary = true;
   4279     break;
   4280   case X86ISD::PSHUFLW:
   4281     ImmN = N->getOperand(N->getNumOperands()-1);
   4282     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4283     IsUnary = true;
   4284     break;
   4285   case X86ISD::PSHUFB: {
   4286     IsUnary = true;
   4287     SDValue MaskNode = N->getOperand(1);
   4288     while (MaskNode->getOpcode() == ISD::BITCAST)
   4289       MaskNode = MaskNode->getOperand(0);
   4290 
   4291     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
   4292       // If we have a build-vector, then things are easy.
   4293       EVT VT = MaskNode.getValueType();
   4294       assert(VT.isVector() &&
   4295              "Can't produce a non-vector with a build_vector!");
   4296       if (!VT.isInteger())
   4297         return false;
   4298 
   4299       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
   4300 
   4301       SmallVector<uint64_t, 32> RawMask;
   4302       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
   4303         SDValue Op = MaskNode->getOperand(i);
   4304         if (Op->getOpcode() == ISD::UNDEF) {
   4305           RawMask.push_back((uint64_t)SM_SentinelUndef);
   4306           continue;
   4307         }
   4308         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
   4309         if (!CN)
   4310           return false;
   4311         APInt MaskElement = CN->getAPIntValue();
   4312 
   4313         // We now have to decode the element which could be any integer size and
   4314         // extract each byte of it.
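                // For example, if the build_vector has i64 elements, the element
                // 0x0706050403020100 expands, low byte first, into the raw bytes
                // 0x00, 0x01, ..., 0x07.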
   4315         for (int j = 0; j < NumBytesPerElement; ++j) {
   4316           // Note that this is x86 and so always little endian: the low byte is
   4317           // the first byte of the mask.
   4318           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
   4319           MaskElement = MaskElement.lshr(8);
   4320         }
   4321       }
   4322       DecodePSHUFBMask(RawMask, Mask);
   4323       break;
   4324     }
   4325 
   4326     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
   4327     if (!MaskLoad)
   4328       return false;
   4329 
   4330     SDValue Ptr = MaskLoad->getBasePtr();
   4331     if (Ptr->getOpcode() == X86ISD::Wrapper)
   4332       Ptr = Ptr->getOperand(0);
   4333 
   4334     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
   4335     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
   4336       return false;
   4337 
   4338     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
   4339       DecodePSHUFBMask(C, Mask);
   4340       if (Mask.empty())
   4341         return false;
   4342       break;
   4343     }
   4344 
   4345     return false;
   4346   }
   4347   case X86ISD::VPERMI:
   4348     ImmN = N->getOperand(N->getNumOperands()-1);
   4349     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4350     IsUnary = true;
   4351     break;
   4352   case X86ISD::MOVSS:
   4353   case X86ISD::MOVSD:
   4354     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
   4355     break;
   4356   case X86ISD::VPERM2X128:
   4357     ImmN = N->getOperand(N->getNumOperands()-1);
   4358     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   4359     if (Mask.empty()) return false;
   4360     break;
   4361   case X86ISD::MOVSLDUP:
   4362     DecodeMOVSLDUPMask(VT, Mask);
   4363     IsUnary = true;
   4364     break;
   4365   case X86ISD::MOVSHDUP:
   4366     DecodeMOVSHDUPMask(VT, Mask);
   4367     IsUnary = true;
   4368     break;
   4369   case X86ISD::MOVDDUP:
   4370     DecodeMOVDDUPMask(VT, Mask);
   4371     IsUnary = true;
   4372     break;
   4373   case X86ISD::MOVLHPD:
   4374   case X86ISD::MOVLPD:
   4375   case X86ISD::MOVLPS:
   4376     // Not yet implemented
   4377     return false;
   4378   default: llvm_unreachable("unknown target shuffle node");
   4379   }
   4380 
   4381   // If we have a fake unary shuffle, the shuffle mask is spread across two
   4382   // inputs that are actually the same node. Re-map the mask to always point
   4383   // into the first input.
   4384   if (IsFakeUnary)
   4385     for (int &M : Mask)
   4386       if (M >= (int)Mask.size())
   4387         M -= Mask.size();
   4388 
   4389   return true;
   4390 }
   4391 
   4392 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
   4393 /// element of the result of the vector shuffle.
   4394 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   4395                                    unsigned Depth) {
   4396   if (Depth == 6)
   4397     return SDValue();  // Limit search depth.
   4398 
   4399   SDValue V = SDValue(N, 0);
   4400   EVT VT = V.getValueType();
   4401   unsigned Opcode = V.getOpcode();
   4402 
   4403   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
   4404   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
   4405     int Elt = SV->getMaskElt(Index);
   4406 
   4407     if (Elt < 0)
   4408       return DAG.getUNDEF(VT.getVectorElementType());
   4409 
   4410     unsigned NumElems = VT.getVectorNumElements();
   4411     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
   4412                                          : SV->getOperand(1);
   4413     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   4414   }
   4415 
   4416   // Recurse into target specific vector shuffles to find scalars.
   4417   if (isTargetShuffle(Opcode)) {
   4418     MVT ShufVT = V.getSimpleValueType();
   4419     unsigned NumElems = ShufVT.getVectorNumElements();
   4420     SmallVector<int, 16> ShuffleMask;
   4421     bool IsUnary;
   4422 
   4423     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
   4424       return SDValue();
   4425 
   4426     int Elt = ShuffleMask[Index];
   4427     if (Elt < 0)
   4428       return DAG.getUNDEF(ShufVT.getVectorElementType());
   4429 
   4430     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
   4431                                          : N->getOperand(1);
   4432     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
   4433                                Depth+1);
   4434   }
   4435 
   4436   // Actual nodes that may contain scalar elements
   4437   if (Opcode == ISD::BITCAST) {
   4438     V = V.getOperand(0);
   4439     EVT SrcVT = V.getValueType();
   4440     unsigned NumElems = VT.getVectorNumElements();
   4441 
   4442     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
   4443       return SDValue();
   4444   }
   4445 
   4446   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   4447     return (Index == 0) ? V.getOperand(0)
   4448                         : DAG.getUNDEF(VT.getVectorElementType());
   4449 
   4450   if (V.getOpcode() == ISD::BUILD_VECTOR)
   4451     return V.getOperand(Index);
   4452 
   4453   return SDValue();
   4454 }
   4455 
   4456 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
   4457 ///
   4458 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   4459                                        unsigned NumNonZero, unsigned NumZero,
   4460                                        SelectionDAG &DAG,
   4461                                        const X86Subtarget* Subtarget,
   4462                                        const TargetLowering &TLI) {
   4463   if (NumNonZero > 8)
   4464     return SDValue();
   4465 
   4466   SDLoc dl(Op);
   4467   SDValue V;
   4468   bool First = true;
   4469 
   4470   // SSE4.1 - use PINSRB to insert each byte directly.
   4471   if (Subtarget->hasSSE41()) {
   4472     for (unsigned i = 0; i < 16; ++i) {
   4473       bool isNonZero = (NonZeros & (1 << i)) != 0;
   4474       if (isNonZero) {
   4475         if (First) {
   4476           if (NumZero)
   4477             V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
   4478           else
   4479             V = DAG.getUNDEF(MVT::v16i8);
   4480           First = false;
   4481         }
   4482         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   4483                         MVT::v16i8, V, Op.getOperand(i),
   4484                         DAG.getIntPtrConstant(i));
   4485       }
   4486     }
   4487 
   4488     return V;
   4489   }
   4490 
   4491   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
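          // For example, if bytes 2 and 3 are both non-zero, byte 3 is zero-extended
          // to i16, shifted left by 8 and ORed with the zero-extended byte 2; the
          // combined value is inserted as element 1 of the v8i16, which is bitcast
          // back to v16i8 at the end.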
   4492   for (unsigned i = 0; i < 16; ++i) {
   4493     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
   4494     if (ThisIsNonZero && First) {
   4495       if (NumZero)
   4496         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   4497       else
   4498         V = DAG.getUNDEF(MVT::v8i16);
   4499       First = false;
   4500     }
   4501 
   4502     if ((i & 1) != 0) {
   4503       SDValue ThisElt, LastElt;
   4504       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
   4505       if (LastIsNonZero) {
   4506         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
   4507                               MVT::i16, Op.getOperand(i-1));
   4508       }
   4509       if (ThisIsNonZero) {
   4510         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
   4511         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
   4512                               ThisElt, DAG.getConstant(8, MVT::i8));
   4513         if (LastIsNonZero)
   4514           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
   4515       } else
   4516         ThisElt = LastElt;
   4517 
   4518       if (ThisElt.getNode())
   4519         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
   4520                         DAG.getIntPtrConstant(i/2));
   4521     }
   4522   }
   4523 
   4524   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
   4525 }
   4526 
   4527 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
   4528 ///
   4529 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   4530                                      unsigned NumNonZero, unsigned NumZero,
   4531                                      SelectionDAG &DAG,
   4532                                      const X86Subtarget* Subtarget,
   4533                                      const TargetLowering &TLI) {
   4534   if (NumNonZero > 4)
   4535     return SDValue();
   4536 
   4537   SDLoc dl(Op);
   4538   SDValue V;
   4539   bool First = true;
   4540   for (unsigned i = 0; i < 8; ++i) {
   4541     bool isNonZero = (NonZeros & (1 << i)) != 0;
   4542     if (isNonZero) {
   4543       if (First) {
   4544         if (NumZero)
   4545           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   4546         else
   4547           V = DAG.getUNDEF(MVT::v8i16);
   4548         First = false;
   4549       }
   4550       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
   4551                       MVT::v8i16, V, Op.getOperand(i),
   4552                       DAG.getIntPtrConstant(i));
   4553     }
   4554   }
   4555 
   4556   return V;
   4557 }
   4558 
   4559 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
   4560 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
   4561                                      const X86Subtarget *Subtarget,
   4562                                      const TargetLowering &TLI) {
   4563   // Find all zeroable elements.
   4564   std::bitset<4> Zeroable;
   4565   for (int i=0; i < 4; ++i) {
   4566     SDValue Elt = Op->getOperand(i);
   4567     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
   4568   }
   4569   assert(Zeroable.size() - Zeroable.count() > 1 &&
   4570          "We expect at least two non-zero elements!");
   4571 
   4572   // We only know how to deal with build_vector nodes where elements are either
   4573   // zeroable or extract_vector_elt with constant index.
   4574   SDValue FirstNonZero;
   4575   unsigned FirstNonZeroIdx;
   4576   for (unsigned i=0; i < 4; ++i) {
   4577     if (Zeroable[i])
   4578       continue;
   4579     SDValue Elt = Op->getOperand(i);
   4580     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   4581         !isa<ConstantSDNode>(Elt.getOperand(1)))
   4582       return SDValue();
   4583     // Make sure that this node is extracting from a 128-bit vector.
   4584     MVT VT = Elt.getOperand(0).getSimpleValueType();
   4585     if (!VT.is128BitVector())
   4586       return SDValue();
   4587     if (!FirstNonZero.getNode()) {
   4588       FirstNonZero = Elt;
   4589       FirstNonZeroIdx = i;
   4590     }
   4591   }
   4592 
   4593   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
   4594   SDValue V1 = FirstNonZero.getOperand(0);
   4595   MVT VT = V1.getSimpleValueType();
   4596 
   4597   // See if this build_vector can be lowered as a blend with zero.
   4598   SDValue Elt;
   4599   unsigned EltMaskIdx, EltIdx;
   4600   int Mask[4];
   4601   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
   4602     if (Zeroable[EltIdx]) {
   4603       // The zero vector will be on the right hand side.
   4604       Mask[EltIdx] = EltIdx+4;
   4605       continue;
   4606     }
   4607 
   4608     Elt = Op->getOperand(EltIdx);
   4609     // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
   4610     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
   4611     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
   4612       break;
   4613     Mask[EltIdx] = EltIdx;
   4614   }
   4615 
   4616   if (EltIdx == 4) {
   4617     // Let the shuffle legalizer deal with blend operations.
   4618     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
   4619     if (V1.getSimpleValueType() != VT)
   4620       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
   4621     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
   4622   }
   4623 
   4624   // See if we can lower this build_vector to a INSERTPS.
   4625   if (!Subtarget->hasSSE41())
   4626     return SDValue();
   4627 
   4628   SDValue V2 = Elt.getOperand(0);
   4629   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
   4630     V1 = SDValue();
   4631 
   4632   bool CanFold = true;
   4633   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
   4634     if (Zeroable[i])
   4635       continue;
   4636 
   4637     SDValue Current = Op->getOperand(i);
   4638     SDValue SrcVector = Current->getOperand(0);
   4639     if (!V1.getNode())
   4640       V1 = SrcVector;
   4641     CanFold = SrcVector == V1 &&
   4642       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
   4643   }
   4644 
   4645   if (!CanFold)
   4646     return SDValue();
   4647 
   4648   assert(V1.getNode() && "Expected at least two non-zero elements!");
   4649   if (V1.getSimpleValueType() != MVT::v4f32)
   4650     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
   4651   if (V2.getSimpleValueType() != MVT::v4f32)
   4652     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
   4653 
   4654   // Ok, we can emit an INSERTPS instruction.
   4655   unsigned ZMask = Zeroable.to_ulong();
   4656 
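          // The INSERTPS immediate encodes the source element index in bits [7:6],
          // the destination element index in bits [5:4], and the set of destination
          // lanes to zero out in bits [3:0].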
   4657   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
   4658   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
   4659   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
   4660                                DAG.getIntPtrConstant(InsertPSMask));
   4661   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
   4662 }
   4663 
   4664 /// Return a vector logical shift node.
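        /// The shift amount is given in bits but must be a multiple of 8, since the
        /// node is lowered to a whole-vector byte shift (VSHLDQ/VSRLDQ, i.e.
        /// pslldq/psrldq).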
   4665 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
   4666                          unsigned NumBits, SelectionDAG &DAG,
   4667                          const TargetLowering &TLI, SDLoc dl) {
   4668   assert(VT.is128BitVector() && "Unknown type for VShift");
   4669   MVT ShVT = MVT::v2i64;
   4670   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   4671   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
   4672   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
   4673   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
   4674   SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
   4675   return DAG.getNode(ISD::BITCAST, dl, VT,
   4676                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
   4677 }
   4678 
   4679 static SDValue
   4680 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
   4681 
   4682   // Check if the scalar load can be widened into a vector load, and if the
   4683   // address is "base + cst", see if the cst can be "absorbed" into the
   4684   // shuffle mask.
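          // For example, an f32 load at stack offset base + 20 (with the slot aligned
          // to 16 bytes) can be rewritten as a v4f32 load from base + 16 followed by
          // a splat of element 1.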
   4685   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
   4686     SDValue Ptr = LD->getBasePtr();
   4687     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
   4688       return SDValue();
   4689     EVT PVT = LD->getValueType(0);
   4690     if (PVT != MVT::i32 && PVT != MVT::f32)
   4691       return SDValue();
   4692 
   4693     int FI = -1;
   4694     int64_t Offset = 0;
   4695     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
   4696       FI = FINode->getIndex();
   4697       Offset = 0;
   4698     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
   4699                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
   4700       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
   4701       Offset = Ptr.getConstantOperandVal(1);
   4702       Ptr = Ptr.getOperand(0);
   4703     } else {
   4704       return SDValue();
   4705     }
   4706 
   4707     // FIXME: 256-bit vector instructions don't require strict alignment;
   4708     // improve this code to support them better.
   4709     unsigned RequiredAlign = VT.getSizeInBits()/8;
   4710     SDValue Chain = LD->getChain();
   4711     // Make sure the stack object alignment is at least 16 or 32.
   4712     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   4713     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
   4714       if (MFI->isFixedObjectIndex(FI)) {
   4715         // Can't change the alignment. FIXME: It's possible to compute the exact
   4716         // stack offset and reference FI + adjusted offset instead; that's the way
   4717         // to implement it if someone *really* cares about this.
   4718         return SDValue();
   4719       } else {
   4720         MFI->setObjectAlignment(FI, RequiredAlign);
   4721       }
   4722     }
   4723 
   4724     // (Offset % RequiredAlign) must be a multiple of 4. The address is then
   4725     // Ptr + (Offset & ~(RequiredAlign - 1)).
   4726     if (Offset < 0)
   4727       return SDValue();
   4728     if ((Offset % RequiredAlign) & 3)
   4729       return SDValue();
   4730     int64_t StartOffset = Offset & ~(RequiredAlign-1);
   4731     if (StartOffset)
   4732       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
   4733                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
   4734 
   4735     int EltNo = (Offset - StartOffset) >> 2;
   4736     unsigned NumElems = VT.getVectorNumElements();
   4737 
   4738     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
   4739     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
   4740                              LD->getPointerInfo().getWithOffset(StartOffset),
   4741                              false, false, false, 0);
   4742 
   4743     SmallVector<int, 8> Mask(NumElems, EltNo);
   4744 
   4745     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
   4746   }
   4747 
   4748   return SDValue();
   4749 }
   4750 
   4751 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
   4752 /// elements can be replaced by a single large load which has the same value as
   4753 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
   4754 ///
   4755 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
   4756 ///
   4757 /// FIXME: we'd also like to handle the case where the last elements are zero
   4758 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
   4759 /// There's even a handy isZeroNode for that purpose.
   4760 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   4761                                         SDLoc &DL, SelectionDAG &DAG,
   4762                                         bool isAfterLegalize) {
   4763   unsigned NumElems = Elts.size();
   4764 
   4765   LoadSDNode *LDBase = nullptr;
   4766   unsigned LastLoadedElt = -1U;
   4767 
   4768   // For each element in the initializer, see if we've found a load or an undef.
   4769   // If we don't find an initial load element, or later load elements are
   4770   // non-consecutive, bail out.
   4771   for (unsigned i = 0; i < NumElems; ++i) {
   4772     SDValue Elt = Elts[i];
   4773     // Look through a bitcast.
   4774     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
   4775       Elt = Elt.getOperand(0);
   4776     if (!Elt.getNode() ||
   4777         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
   4778       return SDValue();
   4779     if (!LDBase) {
   4780       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
   4781         return SDValue();
   4782       LDBase = cast<LoadSDNode>(Elt.getNode());
   4783       LastLoadedElt = i;
   4784       continue;
   4785     }
   4786     if (Elt.getOpcode() == ISD::UNDEF)
   4787       continue;
   4788 
   4789     LoadSDNode *LD = cast<LoadSDNode>(Elt);
   4790     EVT LdVT = Elt.getValueType();
   4791     // Each loaded element must be the correct fractional portion of the
   4792     // requested vector load.
   4793     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
   4794       return SDValue();
   4795     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
   4796       return SDValue();
   4797     LastLoadedElt = i;
   4798   }
   4799 
   4800   // If we have found an entire vector of loads and undefs, then return a large
   4801   // load of the entire vector width starting at the base pointer.  If we found
   4802   // consecutive loads for the low half, generate a vzext_load node.
   4803   if (LastLoadedElt == NumElems - 1) {
   4804     assert(LDBase && "Did not find base load for merging consecutive loads");
   4805     EVT EltVT = LDBase->getValueType(0);
   4806     // Ensure that the input vector size for the merged loads matches the
   4807     // cumulative size of the input elements.
   4808     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
   4809       return SDValue();
   4810 
   4811     if (isAfterLegalize &&
   4812         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
   4813       return SDValue();
   4814 
   4815     SDValue NewLd = SDValue();
   4816 
   4817     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   4818                         LDBase->getPointerInfo(), LDBase->isVolatile(),
   4819                         LDBase->isNonTemporal(), LDBase->isInvariant(),
   4820                         LDBase->getAlignment());
   4821 
   4822     if (LDBase->hasAnyUseOfValue(1)) {
   4823       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   4824                                      SDValue(LDBase, 1),
   4825                                      SDValue(NewLd.getNode(), 1));
   4826       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
   4827       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
   4828                              SDValue(NewLd.getNode(), 1));
   4829     }
   4830 
   4831     return NewLd;
   4832   }
   4833 
   4834   // TODO: The code below fires only for loading the low v2i32 / v2f32
   4835   // of a v4i32 / v4f32. It's probably worth generalizing.
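          // e.g. (v4f32 <load float* a, load float* a+4, undef, undef>) becomes a
          // VZEXT_LOAD of i64 from a, bitcast back to v4f32.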
   4836   EVT EltVT = VT.getVectorElementType();
   4837   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
   4838       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
   4839     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
   4840     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
   4841     SDValue ResNode =
   4842         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
   4843                                 LDBase->getPointerInfo(),
   4844                                 LDBase->getAlignment(),
   4845                                 false/*isVolatile*/, true/*ReadMem*/,
   4846                                 false/*WriteMem*/);
   4847 
   4848     // Make sure the newly-created LOAD is in the same position as LDBase in
   4849     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
   4850     // update uses of LDBase's output chain to use the TokenFactor.
   4851     if (LDBase->hasAnyUseOfValue(1)) {
   4852       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   4853                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
   4854       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
   4855       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
   4856                              SDValue(ResNode.getNode(), 1));
   4857     }
   4858 
   4859     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
   4860   }
   4861   return SDValue();
   4862 }
   4863 
   4864 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
   4865 /// to generate a splat value for the following cases:
   4866 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
   4867 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
   4868 /// a scalar load, or a constant.
   4869 /// The VBROADCAST node is returned when a pattern is found,
   4870 /// or SDValue() otherwise.
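        /// For example, a v8f32 BUILD_VECTOR whose operands are all the same scalar
        /// f32 load is lowered to (v8f32 (VBROADCAST load)), which is selected to a
        /// single vbroadcastss from memory.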
   4871 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
   4872                                     SelectionDAG &DAG) {
   4873   // VBROADCAST requires AVX.
   4874   // TODO: Splats could be generated for non-AVX CPUs using SSE
   4875   // instructions, but there's less potential gain for only 128-bit vectors.
   4876   if (!Subtarget->hasAVX())
   4877     return SDValue();
   4878 
   4879   MVT VT = Op.getSimpleValueType();
   4880   SDLoc dl(Op);
   4881 
   4882   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
   4883          "Unsupported vector type for broadcast.");
   4884 
   4885   SDValue Ld;
   4886   bool ConstSplatVal;
   4887 
   4888   switch (Op.getOpcode()) {
   4889     default:
   4890       // Unknown pattern found.
   4891       return SDValue();
   4892 
   4893     case ISD::BUILD_VECTOR: {
   4894       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
   4895       BitVector UndefElements;
   4896       SDValue Splat = BVOp->getSplatValue(&UndefElements);
   4897 
   4898       // We need a splat of a single value to use broadcast, and it doesn't
   4899       // make any sense if the value is only in one element of the vector.
   4900       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
   4901         return SDValue();
   4902 
   4903       Ld = Splat;
   4904       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   4905                        Ld.getOpcode() == ISD::ConstantFP);
   4906 
   4907       // Make sure that all of the users of a non-constant load are from the
   4908       // BUILD_VECTOR node.
   4909       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
   4910         return SDValue();
   4911       break;
   4912     }
   4913 
   4914     case ISD::VECTOR_SHUFFLE: {
   4915       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   4916 
   4917       // Shuffles must have a splat mask where the first element is
   4918       // broadcasted.
   4919       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
   4920         return SDValue();
   4921 
   4922       SDValue Sc = Op.getOperand(0);
   4923       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
   4924           Sc.getOpcode() != ISD::BUILD_VECTOR) {
   4925 
   4926         if (!Subtarget->hasInt256())
   4927           return SDValue();
   4928 
   4929         // Use the register form of the broadcast instruction available on AVX2.
   4930         if (VT.getSizeInBits() >= 256)
   4931           Sc = Extract128BitVector(Sc, 0, DAG, dl);
   4932         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
   4933       }
   4934 
   4935       Ld = Sc.getOperand(0);
   4936       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
   4937                        Ld.getOpcode() == ISD::ConstantFP);
   4938 
   4939       // The scalar_to_vector node and the suspected
   4940       // load node must have exactly one user.
   4941       // Constants may have multiple users.
   4942 
   4943       // AVX-512 has a register version of the broadcast.
   4944       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
   4945         Ld.getValueType().getSizeInBits() >= 32;
   4946       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
   4947           !hasRegVer))
   4948         return SDValue();
   4949       break;
   4950     }
   4951   }
   4952 
   4953   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   4954   bool IsGE256 = (VT.getSizeInBits() >= 256);
   4955 
   4956   // When optimizing for size, generate up to 5 extra bytes for a broadcast
   4957   // instruction to save 8 or more bytes of constant pool data.
   4958   // TODO: If multiple splats are generated to load the same constant,
   4959   // it may be detrimental to overall size. There needs to be a way to detect
   4960   // that condition to know if this is truly a size win.
   4961   const Function *F = DAG.getMachineFunction().getFunction();
   4962   bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
   4963 
   4964   // Handle broadcasting a single constant scalar from the constant pool
   4965   // into a vector.
   4966   // On Sandybridge (no AVX2), it is still better to load a constant vector
   4967   // from the constant pool and not to broadcast it from a scalar.
   4968   // But override that restriction when optimizing for size.
   4969   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
   4970   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
   4971     EVT CVT = Ld.getValueType();
   4972     assert(!CVT.isVector() && "Must not broadcast a vector type");
   4973 
   4974     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
   4975     // For size optimization, also splat v2f64 and v2i64, and for size opt
   4976     // with AVX2, also splat i8 and i16.
   4977     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
   4978     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
   4979         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
   4980       const Constant *C = nullptr;
   4981       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
   4982         C = CI->getConstantIntValue();
   4983       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
   4984         C = CF->getConstantFPValue();
   4985 
   4986       assert(C && "Invalid constant type");
   4987 
   4988       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   4989       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
   4990       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   4991       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
   4992                        MachinePointerInfo::getConstantPool(),
   4993                        false, false, false, Alignment);
   4994 
   4995       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   4996     }
   4997   }
   4998 
   4999   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
   5000 
   5001   // Handle AVX2 in-register broadcasts.
   5002   if (!IsLoad && Subtarget->hasInt256() &&
   5003       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
   5004     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5005 
   5006   // The scalar source must be a normal load.
   5007   if (!IsLoad)
   5008     return SDValue();
   5009 
   5010   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
   5011       (Subtarget->hasVLX() && ScalarSize == 64))
   5012     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5013 
   5014   // The integer check is needed for the 64-bit into 128-bit case, so that it
   5015   // doesn't match double, since there is no vbroadcastsd xmm.
   5016   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
   5017     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
   5018       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   5019   }
   5020 
   5021   // Unsupported broadcast.
   5022   return SDValue();
   5023 }
   5024 
   5025 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
   5026 /// underlying vector and index.
   5027 ///
   5028 /// Modifies \p ExtractedFromVec to the real vector and returns the real
   5029 /// index.
   5030 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
   5031                                          SDValue ExtIdx) {
   5032   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
   5033   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
   5034     return Idx;
   5035 
   5036   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
   5037   // lowered this:
   5038   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
   5039   // to:
   5040   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
   5041   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
   5042   //                           undef)
   5043   //                       Constant<0>)
   5044   // In this case the vector is the extract_subvector expression and the index
   5045   // is 2, as specified by the shuffle.
   5046   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
   5047   SDValue ShuffleVec = SVOp->getOperand(0);
   5048   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
   5049   assert(ShuffleVecVT.getVectorElementType() ==
   5050          ExtractedFromVec.getSimpleValueType().getVectorElementType());
   5051 
   5052   int ShuffleIdx = SVOp->getMaskElt(Idx);
   5053   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
   5054     ExtractedFromVec = ShuffleVec;
   5055     return ShuffleIdx;
   5056   }
   5057   return Idx;
   5058 }
   5059 
   5060 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
   5061   MVT VT = Op.getSimpleValueType();
   5062 
   5063   // Skip if insert_vec_elt is not supported.
   5064   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   5065   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
   5066     return SDValue();
   5067 
   5068   SDLoc DL(Op);
   5069   unsigned NumElems = Op.getNumOperands();
   5070 
   5071   SDValue VecIn1;
   5072   SDValue VecIn2;
   5073   SmallVector<unsigned, 4> InsertIndices;
   5074   SmallVector<int, 8> Mask(NumElems, -1);
   5075 
   5076   for (unsigned i = 0; i != NumElems; ++i) {
   5077     unsigned Opc = Op.getOperand(i).getOpcode();
   5078 
   5079     if (Opc == ISD::UNDEF)
   5080       continue;
   5081 
   5082     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
   5083       // Quit if more than 1 element needs inserting.
   5084       if (InsertIndices.size() > 1)
   5085         return SDValue();
   5086 
   5087       InsertIndices.push_back(i);
   5088       continue;
   5089     }
   5090 
   5091     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
   5092     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
   5093     // Quit if non-constant index.
   5094     if (!isa<ConstantSDNode>(ExtIdx))
   5095       return SDValue();
   5096     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
   5097 
   5098     // Quit if extracted from vector of different type.
   5099     if (ExtractedFromVec.getValueType() != VT)
   5100       return SDValue();
   5101 
   5102     if (!VecIn1.getNode())
   5103       VecIn1 = ExtractedFromVec;
   5104     else if (VecIn1 != ExtractedFromVec) {
   5105       if (!VecIn2.getNode())
   5106         VecIn2 = ExtractedFromVec;
   5107       else if (VecIn2 != ExtractedFromVec)
   5108         // Quit if there are more than 2 vectors to shuffle.
   5109         return SDValue();
   5110     }
   5111 
   5112     if (ExtractedFromVec == VecIn1)
   5113       Mask[i] = Idx;
   5114     else if (ExtractedFromVec == VecIn2)
   5115       Mask[i] = Idx + NumElems;
   5116   }
   5117 
   5118   if (!VecIn1.getNode())
   5119     return SDValue();
   5120 
   5121   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
   5122   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
   5123   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
   5124     unsigned Idx = InsertIndices[i];
   5125     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
   5126                      DAG.getIntPtrConstant(Idx));
   5127   }
   5128 
   5129   return NV;
   5130 }
   5131 
   5132 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
   5133 SDValue
   5134 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   5135 
   5136   MVT VT = Op.getSimpleValueType();
   5137   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
   5138          "Unexpected type in LowerBUILD_VECTORvXi1!");
   5139 
   5140   SDLoc dl(Op);
   5141   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   5142     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
   5143     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
   5144     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   5145   }
   5146 
   5147   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
   5148     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
   5149     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
   5150     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   5151   }
   5152 
   5153   bool AllConstants = true;
   5154   uint64_t Immediate = 0;
   5155   int NonConstIdx = -1;
   5156   bool IsSplat = true;
   5157   unsigned NumNonConsts = 0;
   5158   unsigned NumConsts = 0;
   5159   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
   5160     SDValue In = Op.getOperand(idx);
   5161     if (In.getOpcode() == ISD::UNDEF)
   5162       continue;
   5163     if (!isa<ConstantSDNode>(In)) {
   5164       AllConstants = false;
   5165       NonConstIdx = idx;
   5166       NumNonConsts++;
   5167     } else {
   5168       NumConsts++;
   5169       if (cast<ConstantSDNode>(In)->getZExtValue())
   5170         Immediate |= (1ULL << idx);
   5171     }
   5172     if (In != Op.getOperand(0))
   5173       IsSplat = false;
   5174   }
   5175 
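          // All (non-undef) operands are constants: pack the mask bits into an
          // immediate, materialize it as an i16 bitcast to v16i1, and extract the
          // low VT lanes.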
   5176   if (AllConstants) {
   5177     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
   5178       DAG.getConstant(Immediate, MVT::i16));
   5179     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
   5180                        DAG.getIntPtrConstant(0));
   5181   }
   5182 
   5183   if (NumNonConsts == 1 && NonConstIdx != 0) {
   5184     SDValue DstVec;
   5185     if (NumConsts) {
   5186       SDValue VecAsImm = DAG.getConstant(Immediate,
   5187                                          MVT::getIntegerVT(VT.getSizeInBits()));
   5188       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
   5189     }
   5190     else
   5191       DstVec = DAG.getUNDEF(VT);
   5192     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
   5193                        Op.getOperand(NonConstIdx),
   5194                        DAG.getIntPtrConstant(NonConstIdx));
   5195   }
   5196   if (!IsSplat && (NonConstIdx != 0))
   5197     llvm_unreachable("Unsupported BUILD_VECTOR operation");
   5198   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
   5199   SDValue Select;
   5200   if (IsSplat)
   5201     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
   5202                           DAG.getConstant(-1, SelectVT),
   5203                           DAG.getConstant(0, SelectVT));
   5204   else
   5205     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
   5206                          DAG.getConstant((Immediate | 1), SelectVT),
   5207                          DAG.getConstant(Immediate, SelectVT));
   5208   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
   5209 }
   5210 
   5211 /// \brief Return true if \p N implements a horizontal binop and return the
   5212 /// operands for the horizontal binop into V0 and V1.
   5213 ///
   5214 /// This is a helper function of PerformBUILD_VECTORCombine.
   5215 /// This function checks that the build_vector \p N in input implements a
   5216 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
   5217 /// operation to match.
   5218 /// For example, if \p Opcode is equal to ISD::ADD, then this function
   5219 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
   5220 /// is equal to ISD::SUB, then this function checks if this is a horizontal
   5221 /// arithmetic sub.
   5222 ///
   5223 /// This function only analyzes elements of \p N whose indices are
   5224 /// in range [BaseIdx, LastIdx).
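        /// For example, with \p Opcode == ISD::ADD, \p BaseIdx == 0 and
        /// \p LastIdx == 4, the v4i32 build_vector
        ///   (add (extract A, 0), (extract A, 1)), (add (extract A, 2), (extract A, 3)),
        ///   (add (extract B, 0), (extract B, 1)), (add (extract B, 2), (extract B, 3))
        /// matches, with V0 == A and V1 == B.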
   5225 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
   5226                               SelectionDAG &DAG,
   5227                               unsigned BaseIdx, unsigned LastIdx,
   5228                               SDValue &V0, SDValue &V1) {
   5229   EVT VT = N->getValueType(0);
   5230 
   5231   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
   5232   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
   5233          "Invalid Vector in input!");
   5234 
   5235   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
   5236   bool CanFold = true;
   5237   unsigned ExpectedVExtractIdx = BaseIdx;
   5238   unsigned NumElts = LastIdx - BaseIdx;
   5239   V0 = DAG.getUNDEF(VT);
   5240   V1 = DAG.getUNDEF(VT);
   5241 
   5242   // Check if N implements a horizontal binop.
   5243   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
   5244     SDValue Op = N->getOperand(i + BaseIdx);
   5245 
   5246     // Skip UNDEFs.
   5247     if (Op->getOpcode() == ISD::UNDEF) {
   5248       // Update the expected vector extract index.
   5249       if (i * 2 == NumElts)
   5250         ExpectedVExtractIdx = BaseIdx;
   5251       ExpectedVExtractIdx += 2;
   5252       continue;
   5253     }
   5254 
   5255     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
   5256 
   5257     if (!CanFold)
   5258       break;
   5259 
   5260     SDValue Op0 = Op.getOperand(0);
   5261     SDValue Op1 = Op.getOperand(1);
   5262 
   5263     // Try to match the following pattern:
   5264     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
   5265     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   5266         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   5267         Op0.getOperand(0) == Op1.getOperand(0) &&
   5268         isa<ConstantSDNode>(Op0.getOperand(1)) &&
   5269         isa<ConstantSDNode>(Op1.getOperand(1)));
   5270     if (!CanFold)
   5271       break;
   5272 
   5273     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   5274     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
   5275 
   5276     if (i * 2 < NumElts) {
   5277       if (V0.getOpcode() == ISD::UNDEF)
   5278         V0 = Op0.getOperand(0);
   5279     } else {
   5280       if (V1.getOpcode() == ISD::UNDEF)
   5281         V1 = Op0.getOperand(0);
   5282       if (i * 2 == NumElts)
   5283         ExpectedVExtractIdx = BaseIdx;
   5284     }
   5285 
   5286     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
   5287     if (I0 == ExpectedVExtractIdx)
   5288       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
   5289     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
   5290       // Try to match the following dag sequence:
   5291       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
   5292       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
   5293     } else
   5294       CanFold = false;
   5295 
   5296     ExpectedVExtractIdx += 2;
   5297   }
   5298 
   5299   return CanFold;
   5300 }
   5301 
   5302 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
   5303 /// a concat_vector.
   5304 ///
   5305 /// This is a helper function of PerformBUILD_VECTORCombine.
   5306 /// This function expects two 256-bit vectors called V0 and V1.
   5307 /// At first, each vector is split into two separate 128-bit vectors.
   5308 /// Then, the resulting 128-bit vectors are used to implement two
   5309 /// horizontal binary operations.
   5310 ///
   5311 /// The kind of horizontal binary operation is defined by \p X86Opcode.
   5312 ///
   5313 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
   5314 /// the two new horizontal binops.
   5315 /// When Mode is set, the first horizontal binop dag node takes as input
   5316 /// the lower 128 bits of V0 and the upper 128 bits of V0. The second
   5317 /// horizontal binop dag node takes as input the lower 128 bits of V1
   5318 /// and the upper 128 bits of V1.
   5319 ///   Example:
   5320 ///     HADD V0_LO, V0_HI
   5321 ///     HADD V1_LO, V1_HI
   5322 ///
   5323 /// Otherwise, the first horizontal binop dag node takes as input the lower
   5324 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal binop
   5325 /// dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
   5326 ///   Example:
   5327 ///     HADD V0_LO, V1_LO
   5328 ///     HADD V0_HI, V1_HI
   5329 ///
   5330 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
   5331 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
   5332 /// the upper 128-bits of the result.
   5333 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
   5334                                      SDLoc DL, SelectionDAG &DAG,
   5335                                      unsigned X86Opcode, bool Mode,
   5336                                      bool isUndefLO, bool isUndefHI) {
   5337   EVT VT = V0.getValueType();
   5338   assert(VT.is256BitVector() && VT == V1.getValueType() &&
   5339          "Invalid nodes in input!");
   5340 
   5341   unsigned NumElts = VT.getVectorNumElements();
   5342   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
   5343   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
   5344   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
   5345   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
   5346   EVT NewVT = V0_LO.getValueType();
   5347 
   5348   SDValue LO = DAG.getUNDEF(NewVT);
   5349   SDValue HI = DAG.getUNDEF(NewVT);
   5350 
   5351   if (Mode) {
   5352     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   5353     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
   5354       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
   5355     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
   5356       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
   5357   } else {
   5358     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   5359     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
   5360                        V1_LO->getOpcode() != ISD::UNDEF))
   5361       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
   5362 
   5363     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
   5364                        V1_HI->getOpcode() != ISD::UNDEF))
   5365       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
   5366   }
   5367 
   5368   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
   5369 }
   5370 
   5371 /// \brief Try to fold a build_vector that performs an 'addsub' into the
   5372 /// sequence of 'vadd + vsub + blendi'.
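        /// For example, the v4f32 build_vector
        ///   (fsub (extract A, 0), (extract B, 0)), (fadd (extract A, 1), (extract B, 1)),
        ///   (fsub (extract A, 2), (extract B, 2)), (fadd (extract A, 3), (extract B, 3))
        /// is folded into (ADDSUB A, B).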
   5373 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
   5374                            const X86Subtarget *Subtarget) {
   5375   SDLoc DL(BV);
   5376   EVT VT = BV->getValueType(0);
   5377   unsigned NumElts = VT.getVectorNumElements();
   5378   SDValue InVec0 = DAG.getUNDEF(VT);
   5379   SDValue InVec1 = DAG.getUNDEF(VT);
   5380 
   5381   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
   5382           VT == MVT::v2f64) && "build_vector with an invalid type found!");
   5383 
   5384   // Odd-numbered elements in the input build vector are obtained from
   5385   // adding two integer/float elements.
   5386   // Even-numbered elements in the input build vector are obtained from
   5387   // subtracting two integer/float elements.
   5388   unsigned ExpectedOpcode = ISD::FSUB;
   5389   unsigned NextExpectedOpcode = ISD::FADD;
   5390   bool AddFound = false;
   5391   bool SubFound = false;
   5392 
   5393   for (unsigned i = 0, e = NumElts; i != e; ++i) {
   5394     SDValue Op = BV->getOperand(i);
   5395 
   5396     // Skip 'undef' values.
   5397     unsigned Opcode = Op.getOpcode();
   5398     if (Opcode == ISD::UNDEF) {
   5399       std::swap(ExpectedOpcode, NextExpectedOpcode);
   5400       continue;
   5401     }
   5402 
   5403     // Early exit if we found an unexpected opcode.
   5404     if (Opcode != ExpectedOpcode)
   5405       return SDValue();
   5406 
   5407     SDValue Op0 = Op.getOperand(0);
   5408     SDValue Op1 = Op.getOperand(1);
   5409 
   5410     // Try to match the following pattern:
   5411     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
   5412     // Early exit if we cannot match that sequence.
   5413     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   5414         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   5415         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
   5416         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
   5417         Op0.getOperand(1) != Op1.getOperand(1))
   5418       return SDValue();
   5419 
   5420     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   5421     if (I0 != i)
   5422       return SDValue();
   5423 
   5424     // We found a valid add/sub node. Update the information accordingly.
   5425     if (i & 1)
   5426       AddFound = true;
   5427     else
   5428       SubFound = true;
   5429 
   5430     // Update InVec0 and InVec1.
   5431     if (InVec0.getOpcode() == ISD::UNDEF)
   5432       InVec0 = Op0.getOperand(0);
   5433     if (InVec1.getOpcode() == ISD::UNDEF)
   5434       InVec1 = Op1.getOperand(0);
   5435 
   5436     // Make sure that the operands of each add/sub node always
   5437     // come from the same pair of vectors.
   5438     if (InVec0 != Op0.getOperand(0)) {
   5439       if (ExpectedOpcode == ISD::FSUB)
   5440         return SDValue();
   5441 
   5442       // FADD is commutable. Try to commute the operands
   5443       // and then test again.
   5444       std::swap(Op0, Op1);
   5445       if (InVec0 != Op0.getOperand(0))
   5446         return SDValue();
   5447     }
   5448 
   5449     if (InVec1 != Op1.getOperand(0))
   5450       return SDValue();
   5451 
   5452     // Update the pair of expected opcodes.
   5453     std::swap(ExpectedOpcode, NextExpectedOpcode);
   5454   }
   5455 
   5456   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
   5457   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
   5458       InVec1.getOpcode() != ISD::UNDEF)
   5459     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
   5460 
   5461   return SDValue();
   5462 }
   5463 
   5464 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
   5465                                           const X86Subtarget *Subtarget) {
   5466   SDLoc DL(N);
   5467   EVT VT = N->getValueType(0);
   5468   unsigned NumElts = VT.getVectorNumElements();
   5469   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
   5470   SDValue InVec0, InVec1;
   5471 
   5472   // Try to match an ADDSUB.
   5473   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   5474       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
   5475     SDValue Value = matchAddSub(BV, DAG, Subtarget);
   5476     if (Value.getNode())
   5477       return Value;
   5478   }
   5479 
   5480   // Try to match horizontal ADD/SUB.
   5481   unsigned NumUndefsLO = 0;
   5482   unsigned NumUndefsHI = 0;
   5483   unsigned Half = NumElts/2;
   5484 
   5485   // Count the number of UNDEF operands in the input build_vector.
   5486   for (unsigned i = 0, e = Half; i != e; ++i)
   5487     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
   5488       NumUndefsLO++;
   5489 
   5490   for (unsigned i = Half, e = NumElts; i != e; ++i)
   5491     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
   5492       NumUndefsHI++;
   5493 
   5494   // Early exit if this is either a build_vector of all UNDEFs or all the
   5495   // operands but one are UNDEF.
   5496   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
   5497     return SDValue();
   5498 
   5499   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
   5500     // Try to match an SSE3 float HADD/HSUB.
   5501     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   5502       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   5503 
   5504     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   5505       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   5506   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
   5507     // Try to match an SSSE3 integer HADD/HSUB.
   5508     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   5509       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
   5510 
   5511     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   5512       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
   5513   }
   5514 
   5515   if (!Subtarget->hasAVX())
   5516     return SDValue();
   5517 
   5518   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
   5519     // Try to match an AVX horizontal add/sub of packed single/double
   5520     // precision floating point values from 256-bit vectors.
   5521     SDValue InVec2, InVec3;
   5522     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
   5523         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
   5524         ((InVec0.getOpcode() == ISD::UNDEF ||
   5525           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   5526         ((InVec1.getOpcode() == ISD::UNDEF ||
   5527           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   5528       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   5529 
   5530     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
   5531         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
   5532         ((InVec0.getOpcode() == ISD::UNDEF ||
   5533           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   5534         ((InVec1.getOpcode() == ISD::UNDEF ||
   5535           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   5536       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   5537   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
   5538     // Try to match an AVX2 horizontal add/sub of signed integers.
   5539     SDValue InVec2, InVec3;
   5540     unsigned X86Opcode;
   5541     bool CanFold = true;
   5542 
   5543     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
   5544         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
   5545         ((InVec0.getOpcode() == ISD::UNDEF ||
   5546           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   5547         ((InVec1.getOpcode() == ISD::UNDEF ||
   5548           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   5549       X86Opcode = X86ISD::HADD;
   5550     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
   5551         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
   5552         ((InVec0.getOpcode() == ISD::UNDEF ||
   5553           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
   5554         ((InVec1.getOpcode() == ISD::UNDEF ||
   5555           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
   5556       X86Opcode = X86ISD::HSUB;
   5557     else
   5558       CanFold = false;
   5559 
   5560     if (CanFold) {
   5561       // Fold this build_vector into a single horizontal add/sub.
   5562       // Do this only if the target has AVX2.
   5563       if (Subtarget->hasAVX2())
   5564         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
   5565 
   5566       // Do not try to expand this build_vector into a pair of horizontal
   5567       // add/sub if we can emit a pair of scalar add/sub.
   5568       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   5569         return SDValue();
   5570 
   5571       // Convert this build_vector into a pair of horizontal binop followed by
   5572       // a concat vector.
   5573       bool isUndefLO = NumUndefsLO == Half;
   5574       bool isUndefHI = NumUndefsHI == Half;
   5575       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
   5576                                    isUndefLO, isUndefHI);
   5577     }
   5578   }
   5579 
   5580   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
   5581        VT == MVT::v16i16) && Subtarget->hasAVX()) {
   5582     unsigned X86Opcode;
   5583     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   5584       X86Opcode = X86ISD::HADD;
   5585     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   5586       X86Opcode = X86ISD::HSUB;
   5587     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   5588       X86Opcode = X86ISD::FHADD;
   5589     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   5590       X86Opcode = X86ISD::FHSUB;
   5591     else
   5592       return SDValue();
   5593 
   5594     // Don't try to expand this build_vector into a pair of horizontal add/sub
   5595     // if we can simply emit a pair of scalar add/sub.
   5596     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   5597       return SDValue();
   5598 
   5599     // Convert this build_vector into two horizontal add/sub followed by
   5600     // a concat vector.
   5601     bool isUndefLO = NumUndefsLO == Half;
   5602     bool isUndefHI = NumUndefsHI == Half;
   5603     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
   5604                                  isUndefLO, isUndefHI);
   5605   }
   5606 
   5607   return SDValue();
   5608 }
   5609 
   5610 SDValue
   5611 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   5612   SDLoc dl(Op);
   5613 
   5614   MVT VT = Op.getSimpleValueType();
   5615   MVT ExtVT = VT.getVectorElementType();
   5616   unsigned NumElems = Op.getNumOperands();
   5617 
   5618   // Generate vectors for predicate vectors.
   5619   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
   5620     return LowerBUILD_VECTORvXi1(Op, DAG);
   5621 
   5622   // Vectors containing all zeros can be matched by pxor and xorps later
   5623   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   5624     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
   5625     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
   5626     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
   5627       return Op;
   5628 
   5629     return getZeroVector(VT, Subtarget, DAG, dl);
   5630   }
   5631 
   5632   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   5633   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
   5634   // vpcmpeqd on 256-bit vectors.
   5635   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
   5636     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
   5637       return Op;
   5638 
   5639     if (!VT.is512BitVector())
   5640       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
   5641   }
   5642 
   5643   if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
   5644     return Broadcast;
   5645 
   5646   unsigned EVTBits = ExtVT.getSizeInBits();
   5647 
   5648   unsigned NumZero  = 0;
   5649   unsigned NumNonZero = 0;
   5650   unsigned NonZeros = 0;
   5651   bool IsAllConstants = true;
   5652   SmallSet<SDValue, 8> Values;
   5653   for (unsigned i = 0; i < NumElems; ++i) {
   5654     SDValue Elt = Op.getOperand(i);
   5655     if (Elt.getOpcode() == ISD::UNDEF)
   5656       continue;
   5657     Values.insert(Elt);
   5658     if (Elt.getOpcode() != ISD::Constant &&
   5659         Elt.getOpcode() != ISD::ConstantFP)
   5660       IsAllConstants = false;
   5661     if (X86::isZeroNode(Elt))
   5662       NumZero++;
   5663     else {
   5664       NonZeros |= (1 << i);
   5665       NumNonZero++;
   5666     }
   5667   }
   5668 
   5669   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
   5670   if (NumNonZero == 0)
   5671     return DAG.getUNDEF(VT);
   5672 
   5673   // Special case for single non-zero, non-undef, element.
   5674   if (NumNonZero == 1) {
   5675     unsigned Idx = countTrailingZeros(NonZeros);
   5676     SDValue Item = Op.getOperand(Idx);
   5677 
   5678     // If this is an insertion of an i64 value on x86-32, and if the top bits of
   5679     // the value are obviously zero, truncate the value to i32 and do the
   5680     // insertion that way.  Only do this if the value is non-constant or if the
   5681     // value is a constant being inserted into element 0.  It is cheaper to do
   5682     // a constant pool load than it is to do a movd + shuffle.
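            // e.g. on x86-32, (v2i64 build_vector x, 0) with the upper 32 bits of x
            // known to be zero becomes (bitcast (v4i32 (trunc x), 0, 0, 0)), i.e. a
            // single movd.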
   5683     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
   5684         (!IsAllConstants || Idx == 0)) {
   5685       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
   5686         // Handle SSE only.
   5687         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
   5688         EVT VecVT = MVT::v4i32;
   5689 
   5690         // Truncate the value (which may itself be a constant) to i32, and
   5691         // convert it to a vector with movd (S2V+shuffle to zero extend).
   5692         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
   5693         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
   5694         return DAG.getNode(
   5695             ISD::BITCAST, dl, VT,
   5696             getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
   5697       }
   5698     }
   5699 
   5700     // If we have a constant or non-constant insertion into the low element of
   5701     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
   5702     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
   5703     // depending on what the source datatype is.
   5704     if (Idx == 0) {
   5705       if (NumZero == 0)
   5706         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5707 
   5708       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
   5709           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
   5710         if (VT.is512BitVector()) {
   5711           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
   5712           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
   5713                              Item, DAG.getIntPtrConstant(0));
   5714         }
   5715         assert((VT.is128BitVector() || VT.is256BitVector()) &&
   5716                "Expected an SSE value type!");
   5717         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5718         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
   5719         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5720       }
   5721 
   5722       // We can't directly insert an i8 or i16 into a vector, so zero extend
   5723       // it to i32 first.
   5724       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
   5725         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
   5726         if (VT.is256BitVector()) {
   5727           if (Subtarget->hasAVX()) {
   5728             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
   5729             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5730           } else {
   5731             // Without AVX, we need to extend to a 128-bit vector and then
   5732             // insert into the 256-bit vector.
   5733             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   5734             SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
   5735             Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
   5736           }
   5737         } else {
   5738           assert(VT.is128BitVector() && "Expected an SSE value type!");
   5739           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   5740           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   5741         }
   5742         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
   5743       }
   5744     }
   5745 
   5746     // Is it a vector logical left shift?
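            // e.g. (v2i64 0, x) becomes (bitcast (VSHLDQ (scalar_to_vector x), 8 bytes)),
            // i.e. a pslldq that moves the scalar into the high element.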
   5747     if (NumElems == 2 && Idx == 1 &&
   5748         X86::isZeroNode(Op.getOperand(0)) &&
   5749         !X86::isZeroNode(Op.getOperand(1))) {
   5750       unsigned NumBits = VT.getSizeInBits();
   5751       return getVShift(true, VT,
   5752                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   5753                                    VT, Op.getOperand(1)),
   5754                        NumBits/2, DAG, *this, dl);
   5755     }
   5756 
   5757     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
   5758       return SDValue();
   5759 
   5760     // Otherwise, if this is a vector with i32 or f32 elements, and the element
   5761     // is a non-constant being inserted into an element other than the low one,
   5762     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
   5763     // movd/movss) to move this into the low element, then shuffle it into
   5764     // place.
   5765     if (EVTBits == 32) {
   5766       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   5767       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
   5768     }
   5769   }
   5770 
   5771   // Splat is obviously ok. Let legalizer expand it to a shuffle.
   5772   if (Values.size() == 1) {
   5773     if (EVTBits == 32) {
      // Instead of a shuffle like this:
      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // check whether it's possible to issue this instead:
      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
   5778       unsigned Idx = countTrailingZeros(NonZeros);
   5779       SDValue Item = Op.getOperand(Idx);
   5780       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
   5781         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
   5782     }
   5783     return SDValue();
   5784   }
   5785 
   5786   // A vector full of immediates; various special cases are already
   5787   // handled, so this is best done with a single constant-pool load.
   5788   if (IsAllConstants)
   5789     return SDValue();
   5790 
   5791   // For AVX-length vectors, see if we can use a vector load to get all of the
   5792   // elements, otherwise build the individual 128-bit pieces and use
   5793   // shuffles to put them in place.
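  // For example, a v8i32 build_vector that can't be matched as a single wide
  // load is split into two v4i32 build_vectors and recombined with a 128-bit
  // subvector insertion (e.g. vinsertf128).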
   5794   if (VT.is256BitVector() || VT.is512BitVector()) {
   5795     SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
   5796 
   5797     // Check for a build vector of consecutive loads.
   5798     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
   5799       return LD;
   5800 
   5801     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
   5802 
   5803     // Build both the lower and upper subvector.
   5804     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
   5805                                 makeArrayRef(&V[0], NumElems/2));
   5806     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
   5807                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
   5808 
   5809     // Recreate the wider vector with the lower and upper part.
   5810     if (VT.is256BitVector())
   5811       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   5812     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
   5813   }
   5814 
   5815   // Let legalizer expand 2-wide build_vectors.
   5816   if (EVTBits == 64) {
   5817     if (NumNonZero == 1) {
   5818       // One half is zero or undef.
   5819       unsigned Idx = countTrailingZeros(NonZeros);
   5820       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
   5821                                  Op.getOperand(Idx));
   5822       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
   5823     }
   5824     return SDValue();
   5825   }
   5826 
   5827   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   5828   if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget, *this))
   5831       return V;
   5832 
   5833   if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget, *this))
   5836       return V;
   5837 
  // If the element VT is 32 bits and the vector has 4 elements, try to
  // generate an INSERTPS.
   5839   if (EVTBits == 32 && NumElems == 4)
   5840     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
   5841       return V;
   5842 
   5843   // If element VT is == 32 bits, turn it into a number of shuffles.
   5844   SmallVector<SDValue, 8> V(NumElems);
   5845   if (NumElems == 4 && NumZero > 0) {
   5846     for (unsigned i = 0; i < 4; ++i) {
   5847       bool isZero = !(NonZeros & (1 << i));
   5848       if (isZero)
   5849         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
   5850       else
   5851         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   5852     }
   5853 
   5854     for (unsigned i = 0; i < 2; ++i) {
   5855       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
   5856         default: break;
   5857         case 0:
   5858           V[i] = V[i*2];  // Must be a zero vector.
   5859           break;
   5860         case 1:
   5861           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
   5862           break;
   5863         case 2:
   5864           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
   5865           break;
   5866         case 3:
   5867           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
   5868           break;
   5869       }
   5870     }
   5871 
   5872     bool Reverse1 = (NonZeros & 0x3) == 2;
   5873     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
   5874     int MaskVec[] = {
   5875       Reverse1 ? 1 : 0,
   5876       Reverse1 ? 0 : 1,
   5877       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
   5878       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
   5879     };
   5880     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
   5881   }
   5882 
   5883   if (Values.size() > 1 && VT.is128BitVector()) {
   5884     // Check for a build vector of consecutive loads.
   5885     for (unsigned i = 0; i < NumElems; ++i)
   5886       V[i] = Op.getOperand(i);
   5887 
   5888     // Check for elements which are consecutive loads.
   5889     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
   5890       return LD;
   5891 
   5892     // Check for a build vector from mostly shuffle plus few inserting.
   5893     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
   5894       return Sh;
   5895 
    // For SSE 4.1, use insertps/pinsr* to insert each non-undef element into
    // the result, starting from the low element.
   5897     if (Subtarget->hasSSE41()) {
   5898       SDValue Result;
   5899       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
   5900         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
   5901       else
   5902         Result = DAG.getUNDEF(VT);
   5903 
   5904       for (unsigned i = 1; i < NumElems; ++i) {
   5905         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
   5906         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
   5907                              Op.getOperand(i), DAG.getIntPtrConstant(i));
   5908       }
   5909       return Result;
   5910     }
   5911 
    // Otherwise, expand into a number of unpckl*. Start by extending each of
    // our (non-undef) elements to the full vector width, with the element in
    // the bottom slot of the vector (which generates no code for SSE).
   5915     for (unsigned i = 0; i < NumElems; ++i) {
   5916       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
   5917         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   5918       else
   5919         V[i] = DAG.getUNDEF(VT);
   5920     }
   5921 
   5922     // Next, we iteratively mix elements, e.g. for v4f32:
   5923     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
   5924     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
   5925     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
   5926     unsigned EltStride = NumElems >> 1;
   5927     while (EltStride != 0) {
   5928       for (unsigned i = 0; i < EltStride; ++i) {
        // If V[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, and the one element being inserted (since it's the
        // first round) is undef, so it can be dropped. This isn't safe for
        // successive rounds because they will permute elements within both
        // vectors.
   5934         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
   5935             EltStride == NumElems/2)
   5936           continue;
   5937 
   5938         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
   5939       }
   5940       EltStride >>= 1;
   5941     }
   5942     return V[0];
   5943   }
   5944   return SDValue();
   5945 }
   5946 
   5947 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
   5948 // to create 256-bit vectors from two other 128-bit ones.
   5949 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   5950   SDLoc dl(Op);
   5951   MVT ResVT = Op.getSimpleValueType();
   5952 
   5953   assert((ResVT.is256BitVector() ||
   5954           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
   5955 
   5956   SDValue V1 = Op.getOperand(0);
   5957   SDValue V2 = Op.getOperand(1);
   5958   unsigned NumElems = ResVT.getVectorNumElements();
   5959   if (ResVT.is256BitVector())
   5960     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
   5961 
   5962   if (Op.getNumOperands() == 4) {
   5963     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
   5964                                 ResVT.getVectorNumElements()/2);
   5965     SDValue V3 = Op.getOperand(2);
   5966     SDValue V4 = Op.getOperand(3);
    return Concat256BitVectors(
        Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
        Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl),
        ResVT, NumElems, DAG, dl);
   5969   }
   5970   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
   5971 }
   5972 
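// Lower a concatenation of vXi1 mask vectors. The operands are packed into a
// single mask register: for example, concatenating two v8i1 values into v16i1
// widens V2, shifts it into the high half with VSHLI, clears the upper bits of
// the widened V1 with a shift-left/shift-right pair, and ORs the two together.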
   5973 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
   5974                                        const X86Subtarget *Subtarget,
   5975                                        SelectionDAG & DAG) {
   5976   SDLoc dl(Op);
   5977   MVT ResVT = Op.getSimpleValueType();
   5978   unsigned NumOfOperands = Op.getNumOperands();
   5979 
   5980   assert(isPowerOf2_32(NumOfOperands) &&
   5981          "Unexpected number of operands in CONCAT_VECTORS");
   5982 
   5983   if (NumOfOperands > 2) {
   5984     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
   5985                                   ResVT.getVectorNumElements()/2);
   5986     SmallVector<SDValue, 2> Ops;
   5987     for (unsigned i = 0; i < NumOfOperands/2; i++)
   5988       Ops.push_back(Op.getOperand(i));
   5989     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
   5990     Ops.clear();
   5991     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
   5992       Ops.push_back(Op.getOperand(i));
   5993     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
   5994     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
   5995   }
   5996 
   5997   SDValue V1 = Op.getOperand(0);
   5998   SDValue V2 = Op.getOperand(1);
   5999   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
   6000   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
   6001 
   6002   if (IsZeroV1 && IsZeroV2)
   6003     return getZeroVector(ResVT, Subtarget, DAG, dl);
   6004 
   6005   SDValue ZeroIdx = DAG.getIntPtrConstant(0);
   6006   SDValue Undef = DAG.getUNDEF(ResVT);
   6007   unsigned NumElems = ResVT.getVectorNumElements();
   6008   SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
   6009 
   6010   V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
   6011   V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
   6012   if (IsZeroV1)
   6013     return V2;
   6014 
   6015   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
   6016   // Zero the upper bits of V1
   6017   V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
   6018   V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
   6019   if (IsZeroV2)
   6020     return V1;
   6021   return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
   6022 }
   6023 
   6024 static SDValue LowerCONCAT_VECTORS(SDValue Op,
   6025                                    const X86Subtarget *Subtarget,
   6026                                    SelectionDAG &DAG) {
   6027   MVT VT = Op.getSimpleValueType();
   6028   if (VT.getVectorElementType() == MVT::i1)
   6029     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
   6030 
   6031   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
   6032          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
   6033           Op.getNumOperands() == 4)));
   6034 
   6035   // AVX can use the vinsertf128 instruction to create 256-bit vectors
   6036   // from two other 128-bit ones.
   6037 
  // A 512-bit vector may be concatenated from two 256-bit vectors or from
  // four 128-bit vectors.
   6039   return LowerAVXCONCAT_VECTORS(Op, DAG);
   6040 }
   6041 
   6042 
   6043 //===----------------------------------------------------------------------===//
   6044 // Vector shuffle lowering
   6045 //
   6046 // This is an experimental code path for lowering vector shuffles on x86. It is
   6047 // designed to handle arbitrary vector shuffles and blends, gracefully
   6048 // degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns while staying
// within a framework that allows reasonably efficient handling of all vector
// shuffle patterns.
   6052 //===----------------------------------------------------------------------===//
   6053 
   6054 /// \brief Tiny helper function to identify a no-op mask.
   6055 ///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef
/// elements and in-place shuffles are no-ops.
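///
/// For example, with 4-element masks:
///
///   isNoopShuffleMask({-1, 1, 2, -1});  // true, every element is undef or
///                                       // already in place.
///   isNoopShuffleMask({1, 0, 2, 3});    // false, elements 0 and 1 move.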
   6061 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
   6062   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   6063     if (Mask[i] != -1 && Mask[i] != i)
   6064       return false;
   6065   return true;
   6066 }
   6067 
   6068 /// \brief Helper function to classify a mask as a single-input mask.
   6069 ///
   6070 /// This isn't a generic single-input test because in the vector shuffle
   6071 /// lowering we canonicalize single inputs to be the first input operand. This
   6072 /// means we can more quickly test for a single input by only checking whether
/// an input from the second operand exists. We also assume that the size of
/// the mask corresponds to the size of the input vectors, which isn't true in
/// the fully general case.
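///
/// For example, with 4-element masks:
///
///   isSingleInputShuffleMask({0, 2, 1, 3});  // true
///   isSingleInputShuffleMask({0, 5, 1, 3});  // false, 5 selects from the
///                                            // second input vector.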
   6076 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
   6077   for (int M : Mask)
   6078     if (M >= (int)Mask.size())
   6079       return false;
   6080   return true;
   6081 }
   6082 
   6083 /// \brief Test whether there are elements crossing 128-bit lanes in this
   6084 /// shuffle mask.
   6085 ///
   6086 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
   6087 /// and we routinely test for these.
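///
/// For example, for MVT::v8f32 (two 128-bit lanes of four elements each) the
/// mask {0, 1, 2, 3, 4, 5, 6, 7} stays within its lanes, while
/// {4, 5, 6, 7, 0, 1, 2, 3} moves every element across a lane boundary.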
   6088 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
   6089   int LaneSize = 128 / VT.getScalarSizeInBits();
   6090   int Size = Mask.size();
   6091   for (int i = 0; i < Size; ++i)
   6092     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
   6093       return true;
   6094   return false;
   6095 }
   6096 
   6097 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
   6098 ///
   6099 /// This checks a shuffle mask to see if it is performing the same
   6100 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
   6101 /// that it is also not lane-crossing. It may however involve a blend from the
   6102 /// same lane of a second vector.
   6103 ///
   6104 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
   6105 /// non-trivial to compute in the face of undef lanes. The representation is
   6106 /// *not* suitable for use with existing 128-bit shuffles as it will contain
   6107 /// entries from both V1 and V2 inputs to the wider mask.
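///
/// For example, the v8i32 mask {0, 3, 2, 1, 4, 7, 6, 5} applies the same
/// in-lane shuffle to both 128-bit lanes, so this returns true and populates
/// \p RepeatedMask with {0, 3, 2, 1}.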
   6108 static bool
   6109 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   6110                                 SmallVectorImpl<int> &RepeatedMask) {
   6111   int LaneSize = 128 / VT.getScalarSizeInBits();
   6112   RepeatedMask.resize(LaneSize, -1);
   6113   int Size = Mask.size();
   6114   for (int i = 0; i < Size; ++i) {
   6115     if (Mask[i] < 0)
   6116       continue;
   6117     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
   6118       // This entry crosses lanes, so there is no way to model this shuffle.
   6119       return false;
   6120 
   6121     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
   6122     if (RepeatedMask[i % LaneSize] == -1)
   6123       // This is the first non-undef entry in this slot of a 128-bit lane.
   6124       RepeatedMask[i % LaneSize] =
   6125           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
   6126     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
   6127       // Found a mismatch with the repeated mask.
   6128       return false;
   6129   }
   6130   return true;
   6131 }
   6132 
   6133 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
   6134 /// arguments.
   6135 ///
   6136 /// This is a fast way to test a shuffle mask against a fixed pattern:
   6137 ///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
   6139 ///
/// It returns true if the mask is exactly as wide as \p ExpectedMask, and each
/// element of the mask is either -1 (signifying undef), equal to the
/// corresponding expected value, or refers to an equivalent scalar when the
/// inputs are build vectors.
   6143 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
   6144                                 ArrayRef<int> ExpectedMask) {
   6145   if (Mask.size() != ExpectedMask.size())
   6146     return false;
   6147 
   6148   int Size = Mask.size();
   6149 
   6150   // If the values are build vectors, we can look through them to find
   6151   // equivalent inputs that make the shuffles equivalent.
   6152   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
   6153   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
   6154 
   6155   for (int i = 0; i < Size; ++i)
   6156     if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
   6157       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
   6158       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
   6159       if (!MaskBV || !ExpectedBV ||
   6160           MaskBV->getOperand(Mask[i] % Size) !=
   6161               ExpectedBV->getOperand(ExpectedMask[i] % Size))
   6162         return false;
   6163     }
   6164 
   6165   return true;
   6166 }
   6167 
   6168 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
   6169 ///
   6170 /// This helper function produces an 8-bit shuffle immediate corresponding to
   6171 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
   6172 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
   6173 /// example.
   6174 ///
   6175 /// NB: We rely heavily on "undef" masks preserving the input lane.
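///
/// For example:
///
///   getV4X86ShuffleImm8ForMask({3, 1, 2, 0}, DAG);
///
/// produces the i8 constant (3 << 0) | (1 << 2) | (2 << 4) | (0 << 6) == 0x27.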
   6176 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
   6177                                           SelectionDAG &DAG) {
   6178   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
   6179   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
   6180   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
   6181   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
   6182   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
   6183 
   6184   unsigned Imm = 0;
   6185   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
   6186   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
   6187   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
   6188   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
   6189   return DAG.getConstant(Imm, MVT::i8);
   6190 }
   6191 
   6192 /// \brief Try to emit a blend instruction for a shuffle using bit math.
   6193 ///
   6194 /// This is used as a fallback approach when first class blend instructions are
   6195 /// unavailable. Currently it is only suitable for integer vectors, but could
   6196 /// be generalized for floating point vectors if desirable.
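///
/// For example, the v4i32 mask {0, 5, 6, 3} takes elements 0 and 3 from V1 and
/// elements 1 and 2 from V2, so it is lowered roughly as
/// (V1 & <-1, 0, 0, -1>) | (V2 & ~<-1, 0, 0, -1>) using AND/ANDNP/OR.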
   6197 static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
   6198                                             SDValue V2, ArrayRef<int> Mask,
   6199                                             SelectionDAG &DAG) {
   6200   assert(VT.isInteger() && "Only supports integer vector types!");
   6201   MVT EltVT = VT.getScalarType();
   6202   int NumEltBits = EltVT.getSizeInBits();
   6203   SDValue Zero = DAG.getConstant(0, EltVT);
   6204   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
   6205   SmallVector<SDValue, 16> MaskOps;
   6206   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   6207     if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
   6208       return SDValue(); // Shuffled input!
   6209     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
   6210   }
   6211 
   6212   SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
   6213   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
   6214   // We have to cast V2 around.
   6215   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
   6216   V2 = DAG.getNode(ISD::BITCAST, DL, VT,
   6217                    DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
   6218                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
   6219                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
   6220   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
   6221 }
   6222 
   6223 /// \brief Try to emit a blend instruction for a shuffle.
   6224 ///
   6225 /// This doesn't do any checks for the availability of instructions for blending
   6226 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
   6227 /// be matched in the backend with the type given. What it does check for is
   6228 /// that the shuffle mask is in fact a blend.
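///
/// For example, the v4f32 mask {0, 5, 2, 7} keeps elements 0 and 2 of V1 and
/// takes elements 1 and 3 from V2, so it becomes an X86ISD::BLENDI of V1 and
/// V2 with the immediate 0b1010 (bit i set means element i is taken from V2).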
   6229 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
   6230                                          SDValue V2, ArrayRef<int> Mask,
   6231                                          const X86Subtarget *Subtarget,
   6232                                          SelectionDAG &DAG) {
   6233   unsigned BlendMask = 0;
   6234   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   6235     if (Mask[i] >= Size) {
   6236       if (Mask[i] != i + Size)
   6237         return SDValue(); // Shuffled V2 input!
   6238       BlendMask |= 1u << i;
   6239       continue;
   6240     }
   6241     if (Mask[i] >= 0 && Mask[i] != i)
   6242       return SDValue(); // Shuffled V1 input!
   6243   }
   6244   switch (VT.SimpleTy) {
   6245   case MVT::v2f64:
   6246   case MVT::v4f32:
   6247   case MVT::v4f64:
   6248   case MVT::v8f32:
   6249     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
   6250                        DAG.getConstant(BlendMask, MVT::i8));
   6251 
   6252   case MVT::v4i64:
   6253   case MVT::v8i32:
   6254     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
   6255     // FALLTHROUGH
   6256   case MVT::v2i64:
   6257   case MVT::v4i32:
   6258     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
   6259     // that instruction.
   6260     if (Subtarget->hasAVX2()) {
   6261       // Scale the blend by the number of 32-bit dwords per element.
      int Scale = VT.getScalarSizeInBits() / 32;
   6263       BlendMask = 0;
   6264       for (int i = 0, Size = Mask.size(); i < Size; ++i)
   6265         if (Mask[i] >= Size)
   6266           for (int j = 0; j < Scale; ++j)
   6267             BlendMask |= 1u << (i * Scale + j);
   6268 
   6269       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
   6270       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
   6271       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
   6272       return DAG.getNode(ISD::BITCAST, DL, VT,
   6273                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
   6274                                      DAG.getConstant(BlendMask, MVT::i8)));
   6275     }
   6276     // FALLTHROUGH
   6277   case MVT::v8i16: {
   6278     // For integer shuffles we need to expand the mask and cast the inputs to
   6279     // v8i16s prior to blending.
   6280     int Scale = 8 / VT.getVectorNumElements();
   6281     BlendMask = 0;
   6282     for (int i = 0, Size = Mask.size(); i < Size; ++i)
   6283       if (Mask[i] >= Size)
   6284         for (int j = 0; j < Scale; ++j)
   6285           BlendMask |= 1u << (i * Scale + j);
   6286 
   6287     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
   6288     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
   6289     return DAG.getNode(ISD::BITCAST, DL, VT,
   6290                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
   6291                                    DAG.getConstant(BlendMask, MVT::i8)));
   6292   }
   6293 
   6294   case MVT::v16i16: {
   6295     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
   6296     SmallVector<int, 8> RepeatedMask;
   6297     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
   6298       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
   6299       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
   6300       BlendMask = 0;
   6301       for (int i = 0; i < 8; ++i)
   6302         if (RepeatedMask[i] >= 16)
   6303           BlendMask |= 1u << i;
   6304       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
   6305                          DAG.getConstant(BlendMask, MVT::i8));
   6306     }
   6307   }
   6308     // FALLTHROUGH
   6309   case MVT::v16i8:
   6310   case MVT::v32i8: {
   6311     assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
   6312            "256-bit byte-blends require AVX2 support!");
   6313 
   6314     // Scale the blend by the number of bytes per element.
   6315     int Scale = VT.getScalarSizeInBits() / 8;
   6316 
   6317     // This form of blend is always done on bytes. Compute the byte vector
   6318     // type.
   6319     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
   6320 
   6321     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
   6322     // mix of LLVM's code generator and the x86 backend. We tell the code
   6323     // generator that boolean values in the elements of an x86 vector register
   6324     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
   6325     // mapping a select to operand #1, and 'false' mapping to operand #2. The
   6326     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
   6327     // of the element (the remaining are ignored) and 0 in that high bit would
   6328     // mean operand #1 while 1 in the high bit would mean operand #2. So while
   6329     // the LLVM model for boolean values in vector elements gets the relevant
   6330     // bit set, it is set backwards and over constrained relative to x86's
   6331     // actual model.
   6332     SmallVector<SDValue, 32> VSELECTMask;
   6333     for (int i = 0, Size = Mask.size(); i < Size; ++i)
   6334       for (int j = 0; j < Scale; ++j)
   6335         VSELECTMask.push_back(
   6336             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
   6337                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
   6338 
   6339     V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
   6340     V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
   6341     return DAG.getNode(
   6342         ISD::BITCAST, DL, VT,
   6343         DAG.getNode(ISD::VSELECT, DL, BlendVT,
   6344                     DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
   6345                     V1, V2));
   6346   }
   6347 
   6348   default:
   6349     llvm_unreachable("Not a supported integer vector type!");
   6350   }
   6351 }
   6352 
   6353 /// \brief Try to lower as a blend of elements from two inputs followed by
   6354 /// a single-input permutation.
   6355 ///
   6356 /// This matches the pattern where we can blend elements from two inputs and
   6357 /// then reduce the shuffle to a single-input permutation.
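///
/// For example, the 4-element mask {2, 7, 1, 4} can be handled by first
/// blending with the mask {4, 1, 2, 7} (each element stays in its slot but may
/// come from either input) and then permuting the result with the single-input
/// mask {2, 3, 1, 0}.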
   6358 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
   6359                                                    SDValue V2,
   6360                                                    ArrayRef<int> Mask,
   6361                                                    SelectionDAG &DAG) {
   6362   // We build up the blend mask while checking whether a blend is a viable way
   6363   // to reduce the shuffle.
   6364   SmallVector<int, 32> BlendMask(Mask.size(), -1);
   6365   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
   6366 
   6367   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   6368     if (Mask[i] < 0)
   6369       continue;
   6370 
   6371     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
   6372 
   6373     if (BlendMask[Mask[i] % Size] == -1)
   6374       BlendMask[Mask[i] % Size] = Mask[i];
   6375     else if (BlendMask[Mask[i] % Size] != Mask[i])
   6376       return SDValue(); // Can't blend in the needed input!
   6377 
   6378     PermuteMask[i] = Mask[i] % Size;
   6379   }
   6380 
   6381   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
   6382   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
   6383 }
   6384 
/// \brief Generic routine to decompose a shuffle and blend into independent
   6386 /// blends and permutes.
   6387 ///
   6388 /// This matches the extremely common pattern for handling combined
   6389 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
   6390 /// operations. It will try to pick the best arrangement of shuffles and
   6391 /// blends.
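///
/// For example, the 4-element mask {3, 6, 1, 4} is split into the single-input
/// shuffles {3, -1, 1, -1} of V1 and {-1, 2, -1, 0} of V2, whose results are
/// then recombined with the blend mask {0, 5, 2, 7}.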
   6392 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
   6393                                                           SDValue V1,
   6394                                                           SDValue V2,
   6395                                                           ArrayRef<int> Mask,
   6396                                                           SelectionDAG &DAG) {
   6397   // Shuffle the input elements into the desired positions in V1 and V2 and
   6398   // blend them together.
   6399   SmallVector<int, 32> V1Mask(Mask.size(), -1);
   6400   SmallVector<int, 32> V2Mask(Mask.size(), -1);
   6401   SmallVector<int, 32> BlendMask(Mask.size(), -1);
   6402   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   6403     if (Mask[i] >= 0 && Mask[i] < Size) {
   6404       V1Mask[i] = Mask[i];
   6405       BlendMask[i] = i;
   6406     } else if (Mask[i] >= Size) {
   6407       V2Mask[i] = Mask[i] - Size;
   6408       BlendMask[i] = i + Size;
   6409     }
   6410 
   6411   // Try to lower with the simpler initial blend strategy unless one of the
   6412   // input shuffles would be a no-op. We prefer to shuffle inputs as the
   6413   // shuffle may be able to fold with a load or other benefit. However, when
   6414   // we'll have to do 2x as many shuffles in order to achieve this, blending
   6415   // first is a better strategy.
   6416   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
   6417     if (SDValue BlendPerm =
   6418             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
   6419       return BlendPerm;
   6420 
   6421   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
   6422   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
   6423   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
   6424 }
   6425 
   6426 /// \brief Try to lower a vector shuffle as a byte rotation.
   6427 ///
   6428 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
   6429 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
   6430 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
   6432 /// does not check for the profitability of lowering either as PALIGNR or
   6433 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
   6434 /// This matches shuffle vectors that look like:
   6435 ///
   6436 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
   6437 ///
   6438 /// Essentially it concatenates V1 and V2, shifts right by some number of
   6439 /// elements, and takes the low elements as the result. Note that while this is
   6440 /// specified as a *right shift* because x86 is little-endian, it is a *left
   6441 /// rotate* of the vector lanes.
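///
/// For the v8i16 example above the rotation is 3 elements, so with SSSE3 this
/// becomes a PALIGNR of the two inputs with a byte immediate of 3 * 2 == 6.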
   6442 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
   6443                                               SDValue V2,
   6444                                               ArrayRef<int> Mask,
   6445                                               const X86Subtarget *Subtarget,
   6446                                               SelectionDAG &DAG) {
   6447   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
   6448 
   6449   int NumElts = Mask.size();
   6450   int NumLanes = VT.getSizeInBits() / 128;
   6451   int NumLaneElts = NumElts / NumLanes;
   6452 
   6453   // We need to detect various ways of spelling a rotation:
   6454   //   [11, 12, 13, 14, 15,  0,  1,  2]
   6455   //   [-1, 12, 13, 14, -1, -1,  1, -1]
   6456   //   [-1, -1, -1, -1, -1, -1,  1,  2]
   6457   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
   6458   //   [-1,  4,  5,  6, -1, -1,  9, -1]
   6459   //   [-1,  4,  5,  6, -1, -1, -1, -1]
   6460   int Rotation = 0;
   6461   SDValue Lo, Hi;
   6462   for (int l = 0; l < NumElts; l += NumLaneElts) {
   6463     for (int i = 0; i < NumLaneElts; ++i) {
   6464       if (Mask[l + i] == -1)
   6465         continue;
   6466       assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
   6467 
   6468       // Get the mod-Size index and lane correct it.
   6469       int LaneIdx = (Mask[l + i] % NumElts) - l;
   6470       // Make sure it was in this lane.
   6471       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
   6472         return SDValue();
   6473 
   6474       // Determine where a rotated vector would have started.
   6475       int StartIdx = i - LaneIdx;
   6476       if (StartIdx == 0)
   6477         // The identity rotation isn't interesting, stop.
   6478         return SDValue();
   6479 
      // If we found the tail of a vector, the rotation must be the missing
      // front. If we found the head of a vector, the rotation is how much of
      // that head is present.
   6483       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
   6484 
   6485       if (Rotation == 0)
   6486         Rotation = CandidateRotation;
   6487       else if (Rotation != CandidateRotation)
   6488         // The rotations don't match, so we can't match this mask.
   6489         return SDValue();
   6490 
   6491       // Compute which value this mask is pointing at.
   6492       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
   6493 
   6494       // Compute which of the two target values this index should be assigned
   6495       // to. This reflects whether the high elements are remaining or the low
   6496       // elements are remaining.
   6497       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
   6498 
   6499       // Either set up this value if we've not encountered it before, or check
   6500       // that it remains consistent.
   6501       if (!TargetV)
   6502         TargetV = MaskV;
   6503       else if (TargetV != MaskV)
   6504         // This may be a rotation, but it pulls from the inputs in some
   6505         // unsupported interleaving.
   6506         return SDValue();
   6507     }
   6508   }
   6509 
   6510   // Check that we successfully analyzed the mask, and normalize the results.
   6511   assert(Rotation != 0 && "Failed to locate a viable rotation!");
   6512   assert((Lo || Hi) && "Failed to find a rotated input vector!");
   6513   if (!Lo)
   6514     Lo = Hi;
   6515   else if (!Hi)
   6516     Hi = Lo;
   6517 
   6518   // The actual rotate instruction rotates bytes, so we need to scale the
   6519   // rotation based on how many bytes are in the vector lane.
   6520   int Scale = 16 / NumLaneElts;
   6521 
   6522   // SSSE3 targets can use the palignr instruction.
   6523   if (Subtarget->hasSSSE3()) {
   6524     // Cast the inputs to i8 vector of correct length to match PALIGNR.
   6525     MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
   6526     Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
   6527     Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
   6528 
   6529     return DAG.getNode(ISD::BITCAST, DL, VT,
   6530                        DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
   6531                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
   6532   }
   6533 
   6534   assert(VT.getSizeInBits() == 128 &&
   6535          "Rotate-based lowering only supports 128-bit lowering!");
   6536   assert(Mask.size() <= 16 &&
   6537          "Can shuffle at most 16 bytes in a 128-bit vector!");
   6538 
   6539   // Default SSE2 implementation
   6540   int LoByteShift = 16 - Rotation * Scale;
   6541   int HiByteShift = Rotation * Scale;
   6542 
   6543   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
   6544   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
   6545   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
   6546 
   6547   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
   6548                                 DAG.getConstant(LoByteShift, MVT::i8));
   6549   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
   6550                                 DAG.getConstant(HiByteShift, MVT::i8));
   6551   return DAG.getNode(ISD::BITCAST, DL, VT,
   6552                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
   6553 }
   6554 
   6555 /// \brief Compute whether each element of a shuffle is zeroable.
   6556 ///
   6557 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
   6558 /// Either it is an undef element in the shuffle mask, the element of the input
   6559 /// referenced is undef, or the element of the input referenced is known to be
   6560 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
   6561 /// as many lanes with this technique as possible to simplify the remaining
   6562 /// shuffle.
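///
/// For example, if V2 is an all-zeros build vector, then for the 4-element
/// mask {0, 5, 2, 7} the result is {false, true, false, true}: every element
/// sourced from V2 is known to be zero.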
   6563 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
   6564                                                      SDValue V1, SDValue V2) {
   6565   SmallBitVector Zeroable(Mask.size(), false);
   6566 
   6567   while (V1.getOpcode() == ISD::BITCAST)
   6568     V1 = V1->getOperand(0);
   6569   while (V2.getOpcode() == ISD::BITCAST)
   6570     V2 = V2->getOperand(0);
   6571 
   6572   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
   6573   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
   6574 
   6575   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   6576     int M = Mask[i];
   6577     // Handle the easy cases.
   6578     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
   6579       Zeroable[i] = true;
   6580       continue;
   6581     }
   6582 
   6583     // If this is an index into a build_vector node (which has the same number
   6584     // of elements), dig out the input value and use it.
   6585     SDValue V = M < Size ? V1 : V2;
   6586     if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
   6587       continue;
   6588 
   6589     SDValue Input = V.getOperand(M % Size);
   6590     // The UNDEF opcode check really should be dead code here, but not quite
   6591     // worth asserting on (it isn't invalid, just unexpected).
   6592     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
   6593       Zeroable[i] = true;
   6594   }
   6595 
   6596   return Zeroable;
   6597 }
   6598 
   6599 /// \brief Try to emit a bitmask instruction for a shuffle.
   6600 ///
   6601 /// This handles cases where we can model a blend exactly as a bitmask due to
   6602 /// one of the inputs being zeroable.
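///
/// For example, if V2 is an all-zeros vector, the v4i32 mask {0, 5, 2, 7}
/// simply clears elements 1 and 3 of V1, so it can be lowered as
/// V1 & <-1, 0, -1, 0>.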
   6603 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
   6604                                            SDValue V2, ArrayRef<int> Mask,
   6605                                            SelectionDAG &DAG) {
   6606   MVT EltVT = VT.getScalarType();
   6607   int NumEltBits = EltVT.getSizeInBits();
   6608   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
   6609   SDValue Zero = DAG.getConstant(0, IntEltVT);
  SDValue AllOnes =
      DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
   6611   if (EltVT.isFloatingPoint()) {
   6612     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
   6613     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
   6614   }
   6615   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
   6616   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   6617   SDValue V;
   6618   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   6619     if (Zeroable[i])
   6620       continue;
   6621     if (Mask[i] % Size != i)
   6622       return SDValue(); // Not a blend.
   6623     if (!V)
   6624       V = Mask[i] < Size ? V1 : V2;
   6625     else if (V != (Mask[i] < Size ? V1 : V2))
   6626       return SDValue(); // Can only let one input through the mask.
   6627 
   6628     VMaskOps[i] = AllOnes;
   6629   }
   6630   if (!V)
   6631     return SDValue(); // No non-zeroable elements!
   6632 
   6633   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
   6634   V = DAG.getNode(VT.isFloatingPoint()
   6635                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
   6636                   DL, VT, V, VMask);
   6637   return V;
   6638 }
   6639 
   6640 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
   6641 ///
   6642 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
   6643 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
   6644 /// matches elements from one of the input vectors shuffled to the left or
   6645 /// right with zeroable elements 'shifted in'. It handles both the strictly
   6646 /// bit-wise element shifts and the byte shift across an entire 128-bit double
   6647 /// quad word lane.
   6648 ///
/// PSLL : (little-endian) left bit shift.
   6650 /// [ zz, 0, zz,  2 ]
   6651 /// [ -1, 4, zz, -1 ]
   6652 /// PSRL : (little-endian) right bit shift.
   6653 /// [  1, zz,  3, zz]
   6654 /// [ -1, -1,  7, zz]
   6655 /// PSLLDQ : (little-endian) left byte shift
   6656 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
   6657 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
   6658 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
   6659 /// PSRLDQ : (little-endian) right byte shift
   6660 /// [  5, 6,  7, zz, zz, zz, zz, zz]
   6661 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
   6662 /// [  1, 2, -1, -1, -1, -1, zz, zz]
   6663 static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
   6664                                          SDValue V2, ArrayRef<int> Mask,
   6665                                          SelectionDAG &DAG) {
   6666   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   6667 
   6668   int Size = Mask.size();
   6669   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
   6670 
   6671   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
   6672     for (int i = 0; i < Size; i += Scale)
   6673       for (int j = 0; j < Shift; ++j)
   6674         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
   6675           return false;
   6676 
   6677     return true;
   6678   };
   6679 
   6680   auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
   6681     for (int i = 0; i != Size; i += Scale) {
   6682       unsigned Pos = Left ? i + Shift : i;
   6683       unsigned Low = Left ? i : i + Shift;
   6684       unsigned Len = Scale - Shift;
   6685       if (!isSequentialOrUndefInRange(Mask, Pos, Len,
   6686                                       Low + (V == V1 ? 0 : Size)))
   6687         return SDValue();
   6688     }
   6689 
   6690     int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
   6691     bool ByteShift = ShiftEltBits > 64;
   6692     unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
   6693                            : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
   6694     int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
   6695 
   6696     // Normalize the scale for byte shifts to still produce an i64 element
   6697     // type.
   6698     Scale = ByteShift ? Scale / 2 : Scale;
   6699 
   6700     // We need to round trip through the appropriate type for the shift.
   6701     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
   6702     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
   6703     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
   6704            "Illegal integer vector type");
   6705     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
   6706 
   6707     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
   6708     return DAG.getNode(ISD::BITCAST, DL, VT, V);
   6709   };
   6710 
   6711   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
   6712   // keep doubling the size of the integer elements up to that. We can
   6713   // then shift the elements of the integer vector by whole multiples of
   6714   // their width within the elements of the larger integer vector. Test each
   6715   // multiple to see if we can find a match with the moved element indices
  // and that the shifted-in elements are all zeroable.
   6717   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
   6718     for (int Shift = 1; Shift != Scale; ++Shift)
   6719       for (bool Left : {true, false})
   6720         if (CheckZeros(Shift, Scale, Left))
   6721           for (SDValue V : {V1, V2})
   6722             if (SDValue Match = MatchShift(Shift, Scale, Left, V))
   6723               return Match;
   6724 
   6725   // no match
   6726   return SDValue();
   6727 }
   6728 
   6729 /// \brief Lower a vector shuffle as a zero or any extension.
   6730 ///
   6731 /// Given a specific number of elements, element bit width, and extension
   6732 /// stride, produce either a zero or any extension based on the available
   6733 /// features of the subtarget.
   6734 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   6735     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
   6736     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   6737   assert(Scale > 1 && "Need a scale to extend.");
   6738   int NumElements = VT.getVectorNumElements();
   6739   int EltBits = VT.getScalarSizeInBits();
   6740   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
   6741          "Only 8, 16, and 32 bit elements can be extended.");
   6742   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
   6743 
   6744   // Found a valid zext mask! Try various lowering strategies based on the
   6745   // input type and available ISA extensions.
   6746   if (Subtarget->hasSSE41()) {
   6747     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
   6748                                  NumElements / Scale);
   6749     return DAG.getNode(ISD::BITCAST, DL, VT,
   6750                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
   6751   }
   6752 
  // For any-extends we can cheat for larger element sizes and use shuffle
   6754   // instructions that can fold with a load and/or copy.
   6755   if (AnyExt && EltBits == 32) {
   6756     int PSHUFDMask[4] = {0, -1, 1, -1};
   6757     return DAG.getNode(
   6758         ISD::BITCAST, DL, VT,
   6759         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   6760                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
   6761                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
   6762   }
   6763   if (AnyExt && EltBits == 16 && Scale > 2) {
   6764     int PSHUFDMask[4] = {0, -1, 0, -1};
   6765     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   6766                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
   6767                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
   6768     int PSHUFHWMask[4] = {1, -1, -1, -1};
   6769     return DAG.getNode(
   6770         ISD::BITCAST, DL, VT,
   6771         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
   6772                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
   6773                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
   6774   }
   6775 
   6776   // If this would require more than 2 unpack instructions to expand, use
   6777   // pshufb when available. We can only use more than 2 unpack instructions
  // when zero-extending i8 elements, which also makes it easier to use pshufb.
   6779   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
   6780     assert(NumElements == 16 && "Unexpected byte vector width!");
   6781     SDValue PSHUFBMask[16];
   6782     for (int i = 0; i < 16; ++i)
   6783       PSHUFBMask[i] =
   6784           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
   6785     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
   6786     return DAG.getNode(ISD::BITCAST, DL, VT,
   6787                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
   6788                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
   6789                                                MVT::v16i8, PSHUFBMask)));
   6790   }
   6791 
   6792   // Otherwise emit a sequence of unpacks.
   6793   do {
   6794     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
   6795     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
   6796                          : getZeroVector(InputVT, Subtarget, DAG, DL);
   6797     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
   6798     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
   6799     Scale /= 2;
   6800     EltBits *= 2;
   6801     NumElements /= 2;
   6802   } while (Scale > 1);
   6803   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
   6804 }
   6805 
   6806 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
   6807 ///
   6808 /// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering; it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs (to explicitly zero extend) and undef lanes (sometimes undef due to
/// being masked out later).
   6815 ///
   6816 /// The reason we have dedicated lowering for zext-style shuffles is that they
   6817 /// are both incredibly common and often quite performance sensitive.
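///
/// For example, if V2 is an all-zeros vector, the v4i32 mask {0, 4, 1, 4}
/// interleaves the two low elements of V1 with zeros, which is exactly a zero
/// extension of those elements to 64 bits (a Scale of 2 in the helper above).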
   6818 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
   6819     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   6820     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   6821   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   6822 
   6823   int Bits = VT.getSizeInBits();
   6824   int NumElements = VT.getVectorNumElements();
   6825   assert(VT.getScalarSizeInBits() <= 32 &&
   6826          "Exceeds 32-bit integer zero extension limit");
   6827   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
   6828 
   6829   // Define a helper function to check a particular ext-scale and lower to it if
   6830   // valid.
   6831   auto Lower = [&](int Scale) -> SDValue {
   6832     SDValue InputV;
   6833     bool AnyExt = true;
   6834     for (int i = 0; i < NumElements; ++i) {
   6835       if (Mask[i] == -1)
   6836         continue; // Valid anywhere but doesn't tell us anything.
   6837       if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
   6839         if (!Zeroable[i])
   6840           return SDValue();
   6841 
   6842         // We no longer are in the anyext case.
   6843         AnyExt = false;
   6844         continue;
   6845       }
   6846 
      // The base elements must be consecutive indices into the same input
      // vector.
   6849       SDValue V = Mask[i] < NumElements ? V1 : V2;
   6850       if (!InputV)
   6851         InputV = V;
   6852       else if (InputV != V)
   6853         return SDValue(); // Flip-flopping inputs.
   6854 
   6855       if (Mask[i] % NumElements != i / Scale)
   6856         return SDValue(); // Non-consecutive strided elements.
   6857     }
   6858 
   6859     // If we fail to find an input, we have a zero-shuffle which should always
   6860     // have already been handled.
   6861     // FIXME: Maybe handle this here in case during blending we end up with one?
   6862     if (!InputV)
   6863       return SDValue();
   6864 
   6865     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   6866         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
   6867   };
   6868 
   6869   // The widest scale possible for extending is to a 64-bit integer.
   6870   assert(Bits % 64 == 0 &&
   6871          "The number of bits in a vector must be divisible by 64 on x86!");
   6872   int NumExtElements = Bits / 64;
   6873 
   6874   // Each iteration, try extending the elements half as much, but into twice as
   6875   // many elements.
   6876   for (; NumExtElements < NumElements; NumExtElements *= 2) {
   6877     assert(NumElements % NumExtElements == 0 &&
   6878            "The input vector size must be divisible by the extended size.");
   6879     if (SDValue V = Lower(NumElements / NumExtElements))
   6880       return V;
   6881   }
   6882 
   6883   // General extends failed, but 128-bit vectors may be able to use MOVQ.
   6884   if (Bits != 128)
   6885     return SDValue();
   6886 
   6887   // Returns one of the source operands if the shuffle can be reduced to a
   6888   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
   6889   auto CanZExtLowHalf = [&]() {
   6890     for (int i = NumElements / 2; i != NumElements; ++i)
   6891       if (!Zeroable[i])
   6892         return SDValue();
   6893     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
   6894       return V1;
   6895     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
   6896       return V2;
   6897     return SDValue();
   6898   };
   6899 
   6900   if (SDValue V = CanZExtLowHalf()) {
   6901     V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
   6902     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
   6903     return DAG.getNode(ISD::BITCAST, DL, VT, V);
   6904   }
   6905 
   6906   // No viable ext lowering found.
   6907   return SDValue();
   6908 }
   6909 
   6910 /// \brief Try to get a scalar value for a specific element of a vector.
   6911 ///
   6912 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
   6913 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
   6914                                               SelectionDAG &DAG) {
   6915   MVT VT = V.getSimpleValueType();
   6916   MVT EltVT = VT.getVectorElementType();
   6917   while (V.getOpcode() == ISD::BITCAST)
   6918     V = V.getOperand(0);
   6919   // If the bitcasts shift the element size, we can't extract an equivalent
   6920   // element from it.
   6921   MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
   6923     return SDValue();
   6924 
   6925   if (V.getOpcode() == ISD::BUILD_VECTOR ||
   6926       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
   6927     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
   6928 
   6929   return SDValue();
   6930 }
   6931 
   6932 /// \brief Helper to test for a load that can be folded with x86 shuffles.
   6933 ///
   6934 /// This is particularly important because the set of instructions varies
   6935 /// significantly based on whether the operand is a load or not.
   6936 static bool isShuffleFoldableLoad(SDValue V) {
   6937   while (V.getOpcode() == ISD::BITCAST)
   6938     V = V.getOperand(0);
   6939 
   6940   return ISD::isNON_EXTLoad(V.getNode());
   6941 }
   6942 
   6943 /// \brief Try to lower insertion of a single element into a zero vector.
   6944 ///
   6945 /// This is a common pattern for which we have especially efficient lowerings
   6946 /// across all subtarget feature sets.
   6947 static SDValue lowerVectorShuffleAsElementInsertion(
   6948     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   6949     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   6950   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   6951   MVT ExtVT = VT;
   6952   MVT EltVT = VT.getVectorElementType();
   6953 
   6954   int V2Index = std::find_if(Mask.begin(), Mask.end(),
   6955                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
   6956                 Mask.begin();
   6957   bool IsV1Zeroable = true;
   6958   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   6959     if (i != V2Index && !Zeroable[i]) {
   6960       IsV1Zeroable = false;
   6961       break;
   6962     }
   6963 
   6964   // Check for a single input from a SCALAR_TO_VECTOR node.
   6965   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
   6966   // all the smarts here sunk into that routine. However, the current
   6967   // lowering of BUILD_VECTOR makes that nearly impossible until the old
   6968   // vector shuffle lowering is dead.
   6969   if (SDValue V2S = getScalarValueForVectorElement(
   6970           V2, Mask[V2Index] - Mask.size(), DAG)) {
   6971     // We need to zext the scalar if it is smaller than an i32.
   6972     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
   6973     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
   6974       // Using zext to expand a narrow element won't work for non-zero
   6975       // insertions.
   6976       if (!IsV1Zeroable)
   6977         return SDValue();
   6978 
   6979       // Zero-extend directly to i32.
   6980       ExtVT = MVT::v4i32;
   6981       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
   6982     }
   6983     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
   6984   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
   6985              EltVT == MVT::i16) {
   6986     // Either not inserting from the low element of the input or the input
   6987     // element size is too small to use VZEXT_MOVL to clear the high bits.
   6988     return SDValue();
   6989   }
   6990 
   6991   if (!IsV1Zeroable) {
   6992     // If V1 can't be treated as a zero vector we have fewer options to lower
   6993     // this. We can't support integer vectors or non-zero targets cheaply, and
   6994     // the V1 elements can't be permuted in any way.
   6995     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
   6996     if (!VT.isFloatingPoint() || V2Index != 0)
   6997       return SDValue();
   6998     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
   6999     V1Mask[V2Index] = -1;
   7000     if (!isNoopShuffleMask(V1Mask))
   7001       return SDValue();
   7002     // This is essentially a special case blend operation, but if we have
   7003     // general purpose blend operations, they are always faster. Bail and let
   7004     // the rest of the lowering handle these as blends.
   7005     if (Subtarget->hasSSE41())
   7006       return SDValue();
   7007 
   7008     // Otherwise, use MOVSD or MOVSS.
   7009     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
   7010            "Only two types of floating point element types to handle!");
   7011     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
   7012                        ExtVT, V1, V2);
   7013   }
   7014 
   7015   // This lowering only works for the low element with floating point vectors.
   7016   if (VT.isFloatingPoint() && V2Index != 0)
   7017     return SDValue();
   7018 
   7019   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
   7020   if (ExtVT != VT)
   7021     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
   7022 
   7023   if (V2Index != 0) {
   7024     // If we have 4 or fewer lanes we can cheaply shuffle the element into
   7025     // the desired position. Otherwise it is more efficient to do a vector
   7026     // shift left. We know that we can do a vector shift left because all
   7027     // the inputs are zero.
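            // For example, inserting the zero-extended scalar into lane 5 of a v8i16
            // result takes the else branch below and shifts left by 5 * 16 / 8 == 10
            // bytes.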
   7028     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
   7029       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
   7030       V2Shuffle[V2Index] = 0;
   7031       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
   7032     } else {
   7033       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
   7034       V2 = DAG.getNode(
   7035           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
   7036           DAG.getConstant(
   7037               V2Index * EltVT.getSizeInBits()/8,
   7038               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
   7039       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
   7040     }
   7041   }
   7042   return V2;
   7043 }
   7044 
   7045 /// \brief Try to lower broadcast of a single element.
   7046 ///
   7047 /// For convenience, this code also bundles all of the subtarget feature set
   7048 /// filtering. While a little annoying to re-dispatch on type here, there isn't
   7049 /// a convenient way to factor it out.
   7050 static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
   7051                                              ArrayRef<int> Mask,
   7052                                              const X86Subtarget *Subtarget,
   7053                                              SelectionDAG &DAG) {
   7054   if (!Subtarget->hasAVX())
   7055     return SDValue();
   7056   if (VT.isInteger() && !Subtarget->hasAVX2())
   7057     return SDValue();
   7058 
   7059   // Check that the mask is a broadcast.
   7060   int BroadcastIdx = -1;
   7061   for (int M : Mask)
   7062     if (M >= 0 && BroadcastIdx == -1)
   7063       BroadcastIdx = M;
   7064     else if (M >= 0 && M != BroadcastIdx)
   7065       return SDValue();
   7066 
   7067   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
   7068                                             "a sorted mask where the broadcast "
   7069                                             "comes from V1.");
   7070 
   7071   // Go up the chain of (vector) values to find a scalar load that we can
   7072   // combine with the broadcast.
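          // For example, when broadcasting element 5 of a v8i32 CONCAT_VECTORS of two
          // v4i32 operands, we descend into the second operand and continue with
          // BroadcastIdx == 1.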
   7073   for (;;) {
   7074     switch (V.getOpcode()) {
   7075     case ISD::CONCAT_VECTORS: {
   7076       int OperandSize = Mask.size() / V.getNumOperands();
   7077       V = V.getOperand(BroadcastIdx / OperandSize);
   7078       BroadcastIdx %= OperandSize;
   7079       continue;
   7080     }
   7081 
   7082     case ISD::INSERT_SUBVECTOR: {
   7083       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
   7084       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
   7085       if (!ConstantIdx)
   7086         break;
   7087 
   7088       int BeginIdx = (int)ConstantIdx->getZExtValue();
   7089       int EndIdx =
   7090           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
   7091       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
   7092         BroadcastIdx -= BeginIdx;
   7093         V = VInner;
   7094       } else {
   7095         V = VOuter;
   7096       }
   7097       continue;
   7098     }
   7099     }
   7100     break;
   7101   }
   7102 
   7103   // Check if this is a broadcast of a scalar. We special case lowering
   7104   // for scalars so that we can more effectively fold with loads.
   7105   if (V.getOpcode() == ISD::BUILD_VECTOR ||
   7106       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
   7107     V = V.getOperand(BroadcastIdx);
   7108 
   7109     // If the scalar isn't a load, we can't broadcast from it in AVX1.
   7110     // Only AVX2 has register broadcasts.
   7111     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
   7112       return SDValue();
   7113   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
   7114     // We can't broadcast from a vector register without AVX2, and we can only
   7115     // broadcast from the zero-element of a vector register.
   7116     return SDValue();
   7117   }
   7118 
   7119   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
   7120 }
   7121 
   7122 // Check whether we can use INSERTPS to perform the shuffle. We only use
   7123 // INSERTPS when the V1 elements are already in the correct locations
   7124 // because otherwise we can just always use two SHUFPS instructions which
   7125 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
   7126 // perform INSERTPS if a single V1 element is out of place and all V2
   7127 // elements are zeroable.
   7128 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
   7129                                             ArrayRef<int> Mask,
   7130                                             SelectionDAG &DAG) {
   7131   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
   7132   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   7133   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   7134   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   7135 
   7136   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   7137 
   7138   unsigned ZMask = 0;
   7139   int V1DstIndex = -1;
   7140   int V2DstIndex = -1;
   7141   bool V1UsedInPlace = false;
   7142 
   7143   for (int i = 0; i < 4; ++i) {
   7144     // Synthesize a zero mask from the zeroable elements (includes undefs).
   7145     if (Zeroable[i]) {
   7146       ZMask |= 1 << i;
   7147       continue;
   7148     }
   7149 
   7150     // Flag if we use any V1 inputs in place.
   7151     if (i == Mask[i]) {
   7152       V1UsedInPlace = true;
   7153       continue;
   7154     }
   7155 
   7156     // We can only insert a single non-zeroable element.
   7157     if (V1DstIndex != -1 || V2DstIndex != -1)
   7158       return SDValue();
   7159 
   7160     if (Mask[i] < 4) {
   7161       // V1 input out of place for insertion.
   7162       V1DstIndex = i;
   7163     } else {
   7164       // V2 input for insertion.
   7165       V2DstIndex = i;
   7166     }
   7167   }
   7168 
   7169   // Don't bother if we have no (non-zeroable) element for insertion.
   7170   if (V1DstIndex == -1 && V2DstIndex == -1)
   7171     return SDValue();
   7172 
   7173   // Determine element insertion src/dst indices. The src index is from the
   7174   // start of the inserted vector, not the start of the concatenated vector.
   7175   unsigned V2SrcIndex = 0;
   7176   if (V1DstIndex != -1) {
   7177     // If we have a V1 input out of place, we use V1 as the V2 element insertion
   7178     // and don't use the original V2 at all.
   7179     V2SrcIndex = Mask[V1DstIndex];
   7180     V2DstIndex = V1DstIndex;
   7181     V2 = V1;
   7182   } else {
   7183     V2SrcIndex = Mask[V2DstIndex] - 4;
   7184   }
   7185 
   7186   // If no V1 inputs are used in place, then the result is created only from
   7187   // the zero mask and the V2 insertion - so remove V1 dependency.
   7188   if (!V1UsedInPlace)
   7189     V1 = DAG.getUNDEF(MVT::v4f32);
   7190 
   7191   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
   7192   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
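          // For example, the v4f32 mask <0, 5, 2, 3> with nothing zeroable gives
          // V2DstIndex == 1, V2SrcIndex == 1 and ZMask == 0, so the INSERTPS immediate
          // is 0x50 (source element in bits [7:6], destination element in bits [5:4],
          // zero mask in bits [3:0]).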
   7193 
   7194   // Insert the V2 element into the desired position.
   7195   SDLoc DL(Op);
   7196   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
   7197                      DAG.getConstant(InsertPSMask, MVT::i8));
   7198 }
   7199 
   7200 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
   7201 /// UNPCK instruction.
   7202 ///
   7203 /// This specifically targets cases where we end up with alternating between
   7204 /// the two inputs, and so can permute them into something that feeds a single
   7205 /// UNPCK instruction. Note that this routine only targets integer vectors
   7206 /// because for floating point vectors we have a generalized SHUFPS lowering
   7207 /// strategy that handles everything that doesn't *exactly* match an unpack,
   7208 /// making this clever lowering unnecessary.
   7209 static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
   7210                                           SDValue V2, ArrayRef<int> Mask,
   7211                                           SelectionDAG &DAG) {
   7212   assert(!VT.isFloatingPoint() &&
   7213          "This routine only supports integer vectors.");
   7214   assert(!isSingleInputShuffleMask(Mask) &&
   7215          "This routine should only be used when blending two inputs.");
   7216   assert(Mask.size() >= 2 && "Single element masks are invalid.");
   7217 
   7218   int Size = Mask.size();
   7219 
   7220   int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
   7221     return M >= 0 && M % Size < Size / 2;
   7222   });
   7223   int NumHiInputs = std::count_if(
   7224       Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
   7225 
   7226   bool UnpackLo = NumLoInputs >= NumHiInputs;
   7227 
   7228   auto TryUnpack = [&](MVT UnpackVT, int Scale) {
   7229     SmallVector<int, 32> V1Mask(Mask.size(), -1);
   7230     SmallVector<int, 32> V2Mask(Mask.size(), -1);
   7231 
   7232     for (int i = 0; i < Size; ++i) {
   7233       if (Mask[i] < 0)
   7234         continue;
   7235 
   7236       // Each element of the unpack contains Scale elements from this mask.
   7237       int UnpackIdx = i / Scale;
   7238 
   7239       // We only handle the case where V1 feeds the first slots of the unpack.
   7240       // We rely on canonicalization to ensure this is the case.
   7241       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
   7242         return SDValue();
   7243 
   7244       // Setup the mask for this input. The indexing is tricky as we have to
   7245       // handle the unpack stride.
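              // For example, for a v8i16 shuffle unpacked as v4i32 (Scale == 2) with
              // UnpackLo, a mask entry Mask[3] == 13 is placed in V2Mask[1] so that the
              // v4i32 UNPCKL puts the original element 5 of V2 back into result slot 3.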
   7246       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
   7247       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
   7248           Mask[i] % Size;
   7249     }
   7250 
   7251     // If we will have to shuffle both inputs to use the unpack, check whether
   7252     // we can just unpack first and shuffle the result. If so, skip this unpack.
   7253     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
   7254         !isNoopShuffleMask(V2Mask))
   7255       return SDValue();
   7256 
   7257     // Shuffle the inputs into place.
   7258     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
   7259     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
   7260 
   7261     // Cast the inputs to the type we will use to unpack them.
   7262     V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
   7263     V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);
   7264 
   7265     // Unpack the inputs and cast the result back to the desired type.
   7266     return DAG.getNode(ISD::BITCAST, DL, VT,
   7267                        DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
   7268                                    DL, UnpackVT, V1, V2));
   7269   };
   7270 
   7271   // We try each unpack from the largest to the smallest, looking for one that
   7272   // fits this mask.
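          // For example, for a v8i16 shuffle this tries v2i64 (Scale == 4), then
          // v4i32 (Scale == 2), and finally v8i16 itself (Scale == 1).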
   7273   int OrigNumElements = VT.getVectorNumElements();
   7274   int OrigScalarSize = VT.getScalarSizeInBits();
   7275   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
   7276     int Scale = ScalarSize / OrigScalarSize;
   7277     int NumElements = OrigNumElements / Scale;
   7278     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
   7279     if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
   7280       return Unpack;
   7281   }
   7282 
   7283   // If none of the unpack-rooted lowerings worked (or were profitable) try an
   7284   // initial unpack.
   7285   if (NumLoInputs == 0 || NumHiInputs == 0) {
   7286     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
   7287            "We have to have *some* inputs!");
   7288     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
   7289 
   7290     // FIXME: We could consider the total complexity of the permute of each
   7291     // possible unpacking. Or at the least we should consider how many
   7292     // half-crossings are created.
   7293     // FIXME: We could consider commuting the unpacks.
   7294 
   7295     SmallVector<int, 32> PermMask;
   7296     PermMask.assign(Size, -1);
   7297     for (int i = 0; i < Size; ++i) {
   7298       if (Mask[i] < 0)
   7299         continue;
   7300 
   7301       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
   7302 
   7303       PermMask[i] =
   7304           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
   7305     }
   7306     return DAG.getVectorShuffle(
   7307         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
   7308                             DL, VT, V1, V2),
   7309         DAG.getUNDEF(VT), PermMask);
   7310   }
   7311 
   7312   return SDValue();
   7313 }
   7314 
   7315 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
   7316 ///
   7317 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
   7318 /// support for floating point shuffles but not integer shuffles. These
   7319 /// instructions will incur a domain crossing penalty on some chips though so
   7320 /// it is better to avoid lowering through this for integer vectors where
   7321 /// possible.
   7322 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   7323                                        const X86Subtarget *Subtarget,
   7324                                        SelectionDAG &DAG) {
   7325   SDLoc DL(Op);
   7326   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
   7327   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   7328   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   7329   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   7330   ArrayRef<int> Mask = SVOp->getMask();
   7331   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   7332 
   7333   if (isSingleInputShuffleMask(Mask)) {
   7334     // Use low duplicate instructions for masks that match their pattern.
   7335     if (Subtarget->hasSSE3())
   7336       if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
   7337         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
   7338 
   7339     // Straight shuffle of a single input vector. Simulate this by using the
   7340     // single input as both of the "inputs" to this instruction.
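            // For example, the single-input mask <1, 1> (broadcast of the high double)
            // produces a SHUFPD immediate of 0b11.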
   7341     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
   7342 
   7343     if (Subtarget->hasAVX()) {
   7344       // If we have AVX, we can use VPERMILPD which will allow folding a load
   7345       // into the shuffle.
   7346       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
   7347                          DAG.getConstant(SHUFPDMask, MVT::i8));
   7348     }
   7349 
   7350     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
   7351                        DAG.getConstant(SHUFPDMask, MVT::i8));
   7352   }
   7353   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
   7354   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
   7355 
   7356   // If we have a single input, insert that into V1 if we can do so cheaply.
   7357   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
   7358     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   7359             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
   7360       return Insertion;
   7361     // Try inverting the insertion since for v2 masks it is easy to do and we
   7362     // can't reliably sort the mask one way or the other.
   7363     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
   7364                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
   7365     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   7366             DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
   7367       return Insertion;
   7368   }
   7369 
   7370   // Try to use one of the special instruction patterns to handle two common
   7371   // blend patterns if a zero-blend above didn't work.
   7372   if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
   7373       isShuffleEquivalent(V1, V2, Mask, {1, 3}))
   7374     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
   7375       // We can either use a special instruction to load over the low double or
   7376       // to move just the low double.
   7377       return DAG.getNode(
   7378           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
   7379           DL, MVT::v2f64, V2,
   7380           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
   7381 
   7382   if (Subtarget->hasSSE41())
   7383     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
   7384                                                   Subtarget, DAG))
   7385       return Blend;
   7386 
   7387   // Use dedicated unpack instructions for masks that match their pattern.
   7388   if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
   7389     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
   7390   if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
   7391     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
   7392 
   7393   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   7394   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
   7395                      DAG.getConstant(SHUFPDMask, MVT::i8));
   7396 }
   7397 
   7398 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
   7399 ///
   7400 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
   7401 /// the integer unit to minimize domain crossing penalties. However, for blends
   7402 /// it falls back to the floating point shuffle operation with appropriate bit
   7403 /// casting.
   7404 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   7405                                        const X86Subtarget *Subtarget,
   7406                                        SelectionDAG &DAG) {
   7407   SDLoc DL(Op);
   7408   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
   7409   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   7410   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   7411   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   7412   ArrayRef<int> Mask = SVOp->getMask();
   7413   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   7414 
   7415   if (isSingleInputShuffleMask(Mask)) {
   7416     // Check for being able to broadcast a single element.
   7417     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
   7418                                                           Mask, Subtarget, DAG))
   7419       return Broadcast;
   7420 
   7421     // Straight shuffle of a single input vector. For everything from SSE2
   7422     // onward this has a single fast instruction with no scary immediates.
   7423     // We have to map the mask as it is actually a v4i32 shuffle instruction.
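            // For example, the v2i64 mask <1, 0> becomes the v4i32 mask <2, 3, 0, 1>.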
   7424     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
   7425     int WidenedMask[4] = {
   7426         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
   7427         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
   7428     return DAG.getNode(
   7429         ISD::BITCAST, DL, MVT::v2i64,
   7430         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
   7431                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
   7432   }
   7433   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
   7434   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
   7435   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
   7436   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
   7437 
   7438   // If we have a blend of two PACKUS operations and the blend aligns with the
   7439   // low and high halves, we can just merge the PACKUS operations. This is
   7440   // particularly important as it lets us merge shuffles that this routine itself
   7441   // creates.
   7442   auto GetPackNode = [](SDValue V) {
   7443     while (V.getOpcode() == ISD::BITCAST)
   7444       V = V.getOperand(0);
   7445 
   7446     return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
   7447   };
   7448   if (SDValue V1Pack = GetPackNode(V1))
   7449     if (SDValue V2Pack = GetPackNode(V2))
   7450       return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
   7451                          DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
   7452                                      Mask[0] == 0 ? V1Pack.getOperand(0)
   7453                                                   : V1Pack.getOperand(1),
   7454                                      Mask[1] == 2 ? V2Pack.getOperand(0)
   7455                                                   : V2Pack.getOperand(1)));
   7456 
   7457   // Try to use shift instructions.
   7458   if (SDValue Shift =
   7459           lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
   7460     return Shift;
   7461 
   7462   // When loading a scalar and then shuffling it into a vector we can often do
   7463   // the insertion cheaply.
   7464   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   7465           DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
   7466     return Insertion;
   7467   // Try inverting the insertion since for v2 masks it is easy to do and we
   7468   // can't reliably sort the mask one way or the other.
   7469   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
   7470   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   7471           DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
   7472     return Insertion;
   7473 
   7474   // We have different paths for blend lowering, but they all must use the
   7475   // *exact* same predicate.
   7476   bool IsBlendSupported = Subtarget->hasSSE41();
   7477   if (IsBlendSupported)
   7478     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
   7479                                                   Subtarget, DAG))
   7480       return Blend;
   7481 
   7482   // Use dedicated unpack instructions for masks that match their pattern.
   7483   if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
   7484     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
   7485   if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
   7486     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
   7487 
   7488   // Try to use byte rotation instructions.
   7489   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
   7490   if (Subtarget->hasSSSE3())
   7491     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   7492             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
   7493       return Rotate;
   7494 
   7495   // If we have direct support for blends, we should lower by decomposing into
   7496   // a permute. That will be faster than the domain cross.
   7497   if (IsBlendSupported)
   7498     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
   7499                                                       Mask, DAG);
   7500 
   7501   // We implement this with SHUFPD which is pretty lame because it will likely
   7502   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   7503   // However, all the alternatives are still more cycles and newer chips don't
   7504   // have this problem. It would be really nice if x86 had better shuffles here.
   7505   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
   7506   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
   7507   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
   7508                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
   7509 }
   7510 
   7511 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
   7512 ///
   7513 /// This is used to disable more specialized lowerings when the shufps lowering
   7514 /// will happen to be efficient.
   7515 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
   7516   // This routine only handles 128-bit shufps.
   7517   assert(Mask.size() == 4 && "Unsupported mask size!");
   7518 
   7519   // To lower with a single SHUFPS we need to have the low half and high half
   7520   // each requiring a single input.
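          // For example, <0, 1, 6, 7> can be done with a single SHUFPS, while
          // <0, 4, 1, 5> cannot because both halves mix the two inputs.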
   7521   if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
   7522     return false;
   7523   if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
   7524     return false;
   7525 
   7526   return true;
   7527 }
   7528 
   7529 /// \brief Lower a vector shuffle using the SHUFPS instruction.
   7530 ///
   7531 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
   7532 /// It makes no assumptions about whether this is the *best* lowering, it simply
   7533 /// uses it.
   7534 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
   7535                                             ArrayRef<int> Mask, SDValue V1,
   7536                                             SDValue V2, SelectionDAG &DAG) {
   7537   SDValue LowV = V1, HighV = V2;
   7538   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
   7539 
   7540   int NumV2Elements =
   7541       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
   7542 
   7543   if (NumV2Elements == 1) {
   7544     int V2Index =
   7545         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
   7546         Mask.begin();
   7547 
   7548     // Compute the index adjacent to V2Index and in the same half by toggling
   7549     // the low bit.
   7550     int V2AdjIndex = V2Index ^ 1;
   7551 
   7552     if (Mask[V2AdjIndex] == -1) {
   7553       // Handles all the cases where we have a single V2 element and an undef.
   7554       // This will only ever happen in the high lanes because we commute the
   7555       // vector otherwise.
   7556       if (V2Index < 2)
   7557         std::swap(LowV, HighV);
   7558       NewMask[V2Index] -= 4;
   7559     } else {
   7560       // Handle the case where the V2 element ends up adjacent to a V1 element.
   7561       // To make this work, blend them together as the first step.
   7562       int V1Index = V2AdjIndex;
   7563       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
   7564       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
   7565                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
   7566 
   7567       // Now proceed to reconstruct the final blend as we have the necessary
   7568       // high or low half formed.
   7569       if (V2Index < 2) {
   7570         LowV = V2;
   7571         HighV = V1;
   7572       } else {
   7573         HighV = V2;
   7574       }
   7575       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
   7576       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
   7577     }
   7578   } else if (NumV2Elements == 2) {
   7579     if (Mask[0] < 4 && Mask[1] < 4) {
   7580       // Handle the easy case where we have V1 in the low lanes and V2 in the
   7581       // high lanes.
   7582       NewMask[2] -= 4;
   7583       NewMask[3] -= 4;
   7584     } else if (Mask[2] < 4 && Mask[3] < 4) {
   7585       // We also handle the reversed case because this utility may get called
   7586       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
   7587       // arrange things in the right direction.
   7588       NewMask[0] -= 4;
   7589       NewMask[1] -= 4;
   7590       HighV = V1;
   7591       LowV = V2;
   7592     } else {
   7593       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
   7594       // trying to place elements directly, just blend them and set up the final
   7595       // shuffle to place them.
   7596 
   7597       // The first two blend mask elements are for V1, the second two are for
   7598       // V2.
   7599       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
   7600                           Mask[2] < 4 ? Mask[2] : Mask[3],
   7601                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
   7602                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
   7603       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
   7604                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
   7605 
   7606       // Now we do a normal shuffle of V1 by giving V1 as both operands to
   7607       // a blend.
   7608       LowV = HighV = V1;
   7609       NewMask[0] = Mask[0] < 4 ? 0 : 2;
   7610       NewMask[1] = Mask[0] < 4 ? 2 : 0;
   7611       NewMask[2] = Mask[2] < 4 ? 1 : 3;
   7612       NewMask[3] = Mask[2] < 4 ? 3 : 1;
   7613     }
   7614   }
   7615   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
   7616                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
   7617 }
   7618 
   7619 /// \brief Lower 4-lane 32-bit floating point shuffles.
   7620 ///
   7621 /// Uses instructions exclusively from the floating point unit to minimize
   7622 /// domain crossing penalties, as these are sufficient to implement all v4f32
   7623 /// shuffles.
   7624 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   7625                                        const X86Subtarget *Subtarget,
   7626                                        SelectionDAG &DAG) {
   7627   SDLoc DL(Op);
   7628   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
   7629   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   7630   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   7631   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   7632   ArrayRef<int> Mask = SVOp->getMask();
   7633   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   7634 
   7635   int NumV2Elements =
   7636       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
   7637 
   7638   if (NumV2Elements == 0) {
   7639     // Check for being able to broadcast a single element.
   7640     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
   7641                                                           Mask, Subtarget, DAG))
   7642       return Broadcast;
   7643 
   7644     // Use even/odd duplicate instructions for masks that match their pattern.
   7645     if (Subtarget->hasSSE3()) {
   7646       if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
   7647         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
   7648       if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
   7649         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
   7650     }
   7651 
   7652     if (Subtarget->hasAVX()) {
   7653       // If we have AVX, we can use VPERMILPS which will allow folding a load
   7654       // into the shuffle.
   7655       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
   7656                          getV4X86ShuffleImm8ForMask(Mask, DAG));
   7657     }
   7658 
   7659     // Otherwise, use a straight shuffle of a single input vector. We pass the
   7660     // input vector to both operands to simulate this with a SHUFPS.
   7661     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
   7662                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   7663   }
   7664 
   7665   // There are special ways we can lower some single-element blends. However, we
   7666   // have custom lowerings below for more complex single-element blends and defer
   7667   // to those if both this and BLENDPS fail to match. So restrict this to the
   7668   // case where the V2 input targets element 0 of the mask -- that is the fast
   7669   // case here.
   7670   if (NumV2Elements == 1 && Mask[0] >= 4)
   7671     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
   7672                                                          Mask, Subtarget, DAG))
   7673       return V;
   7674 
   7675   if (Subtarget->hasSSE41()) {
   7676     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
   7677                                                   Subtarget, DAG))
   7678       return Blend;
   7679 
   7680     // Use INSERTPS if we can complete the shuffle efficiently.
   7681     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
   7682       return V;
   7683 
   7684     if (!isSingleSHUFPSMask(Mask))
   7685       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
   7686               DL, MVT::v4f32, V1, V2, Mask, DAG))
   7687         return BlendPerm;
   7688   }
   7689 
   7690   // Use dedicated unpack instructions for masks that match their pattern.
   7691   if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
   7692     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
   7693   if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
   7694     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
   7695   if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
   7696     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
   7697   if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
   7698     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
   7699 
   7700   // Otherwise fall back to a SHUFPS lowering strategy.
   7701   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
   7702 }
   7703 
   7704 /// \brief Lower 4-lane i32 vector shuffles.
   7705 ///
   7706 /// We try to handle these with integer-domain shuffles where we can, but for
   7707 /// blends we use the floating point domain blend instructions.
   7708 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   7709                                        const X86Subtarget *Subtarget,
   7710                                        SelectionDAG &DAG) {
   7711   SDLoc DL(Op);
   7712   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
   7713   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   7714   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   7715   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   7716   ArrayRef<int> Mask = SVOp->getMask();
   7717   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   7718 
   7719   // Whenever we can lower this as a zext, that instruction is strictly faster
   7720   // than any alternative. It also allows us to fold memory operands into the
   7721   // shuffle in many cases.
   7722   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
   7723                                                          Mask, Subtarget, DAG))
   7724     return ZExt;
   7725 
   7726   int NumV2Elements =
   7727       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
   7728 
   7729   if (NumV2Elements == 0) {
   7730     // Check for being able to broadcast a single element.
   7731     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
   7732                                                           Mask, Subtarget, DAG))
   7733       return Broadcast;
   7734 
   7735     // Straight shuffle of a single input vector. For everything from SSE2
   7736     // onward this has a single fast instruction with no scary immediates.
   7737     // We coerce the shuffle pattern to be compatible with UNPCK instructions
   7738     // but we aren't actually going to use the UNPCK instruction because doing
   7739     // so prevents folding a load into this instruction or making a copy.
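            // For example, the coerced mask {0, 0, 1, 1} becomes a PSHUFD with
            // immediate 0x50, duplicating each of the two low elements.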
   7740     const int UnpackLoMask[] = {0, 0, 1, 1};
   7741     const int UnpackHiMask[] = {2, 2, 3, 3};
   7742     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
   7743       Mask = UnpackLoMask;
   7744     else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
   7745       Mask = UnpackHiMask;
   7746 
   7747     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
   7748                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   7749   }
   7750 
   7751   // Try to use shift instructions.
   7752   if (SDValue Shift =
   7753           lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
   7754     return Shift;
   7755 
   7756   // There are special ways we can lower some single-element blends.
   7757   if (NumV2Elements == 1)
   7758     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
   7759                                                          Mask, Subtarget, DAG))
   7760       return V;
   7761 
   7762   // We have different paths for blend lowering, but they all must use the
   7763   // *exact* same predicate.
   7764   bool IsBlendSupported = Subtarget->hasSSE41();
   7765   if (IsBlendSupported)
   7766     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
   7767                                                   Subtarget, DAG))
   7768       return Blend;
   7769 
   7770   if (SDValue Masked =
   7771           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
   7772     return Masked;
   7773 
   7774   // Use dedicated unpack instructions for masks that match their pattern.
   7775   if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
   7776     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
   7777   if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
   7778     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
   7779   if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
   7780     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
   7781   if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
   7782     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
   7783 
   7784   // Try to use byte rotation instructions.
   7785   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
   7786   if (Subtarget->hasSSSE3())
   7787     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   7788             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
   7789       return Rotate;
   7790 
   7791   // If we have direct support for blends, we should lower by decomposing into
   7792   // a permute. That will be faster than the domain cross.
   7793   if (IsBlendSupported)
   7794     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
   7795                                                       Mask, DAG);
   7796 
   7797   // Try to lower by permuting the inputs into an unpack instruction.
   7798   if (SDValue Unpack =
   7799           lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
   7800     return Unpack;
   7801 
   7802   // We implement this with SHUFPS because it can blend from two vectors.
   7803   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
   7804   // up the inputs, bypassing domain shift penalties that we would incur if we
   7805   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
   7806   // relevant.
   7807   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
   7808                      DAG.getVectorShuffle(
   7809                          MVT::v4f32, DL,
   7810                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
   7811                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
   7812 }
   7813 
   7814 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
   7815 /// shuffle lowering, and the most complex part.
   7816 ///
   7817 /// The lowering strategy is to try to form pairs of input lanes which are
   7818 /// targeted at the same half of the final vector, and then use a dword shuffle
   7819 /// to place them onto the right half, and finally unpack the paired lanes into
   7820 /// their final position.
   7821 ///
   7822 /// The exact breakdown of how to form these dword pairs and align them on the
   7823 /// correct sides is really tricky. See the comments within the function for
   7824 /// more of the details.
   7825 ///
   7826 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
   7827 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
   7828 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
   7829 /// vector, form the analogous 128-bit 8-element Mask.
   7830 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
   7831     SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
   7832     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   7833   assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
   7834   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
   7835 
   7836   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
   7837   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
   7838   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
   7839 
   7840   SmallVector<int, 4> LoInputs;
   7841   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
   7842                [](int M) { return M >= 0; });
   7843   std::sort(LoInputs.begin(), LoInputs.end());
   7844   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
   7845   SmallVector<int, 4> HiInputs;
   7846   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
   7847                [](int M) { return M >= 0; });
   7848   std::sort(HiInputs.begin(), HiInputs.end());
   7849   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
   7850   int NumLToL =
   7851       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
   7852   int NumHToL = LoInputs.size() - NumLToL;
   7853   int NumLToH =
   7854       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
   7855   int NumHToH = HiInputs.size() - NumLToH;
   7856   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
   7857   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
   7858   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   7859   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
   7860 
   7861   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
   7862   // such inputs we can swap two of the dwords across the half mark and end up
   7863   // with <=2 inputs to each half from each source half. Once there, we can
   7864   // fall through to the generic code below. For example:
   7865   //
   7866   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   7867   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
   7868   //
   7869   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
   7870   // and an existing 2-into-2 on the other half. In this case we may have to
   7871   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
   7872   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
   7873   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
   7874   // because any other situation (including a 3-into-1 or 1-into-3 in the other
   7875   // half than the one we target for fixing) will be fixed when we re-enter this
   7876   // path. We will also combine away any sequence of PSHUFD instructions,
   7877   // folding them into a single instruction. Here is an example of the tricky case:
   7878   //
   7879   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   7880   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
   7881   //
   7882   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
   7883   //
   7884   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
   7885   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
   7886   //
   7887   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
   7888   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
   7889   //
   7890   // The result is fine to be handled by the generic logic.
   7891   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
   7892                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
   7893                           int AOffset, int BOffset) {
   7894     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
   7895            "Must call this with A having 3 or 1 inputs from the A half.");
   7896     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
   7897            "Must call this with B having 1 or 3 inputs from the B half.");
   7898     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
   7899            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
   7900 
   7901     // Compute the index of the dword with only one word among the three inputs
   7902     // in a half by taking the sum of the half with three inputs and subtracting
   7903     // the sum of the actual three inputs. The difference is the remaining
   7904     // slot.
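            // For example, if the A (low) half contributes inputs {0, 1, 3} to itself
            // (AOffset == 0), TripleInputSum is 6 and the inputs sum to 4, so the
            // remaining slot is word 2 and the triple dword is 1.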
   7905     int ADWord, BDWord;
   7906     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
   7907     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
   7908     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
   7909     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
   7910     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
   7911     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
   7912     int TripleNonInputIdx =
   7913         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
   7914     TripleDWord = TripleNonInputIdx / 2;
   7915 
   7916     // We use xor with one to compute the adjacent DWord to whichever one the
   7917     // OneInput is in.
   7918     OneInputDWord = (OneInput / 2) ^ 1;
   7919 
   7920     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
   7921     // and BToA inputs. If there is also such a problem with the BToB and AToB
   7922     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
   7923     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
   7924     // is essential that we don't *create* a 3<-1 as then we might oscillate.
   7925     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
   7926       // Compute how many inputs will be flipped by swapping these DWords. We
   7927       // need to balance this to ensure we don't form a 3-1 shuffle in the
   7928       // other half.
   7930       int NumFlippedAToBInputs =
   7931           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
   7932           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
   7933       int NumFlippedBToBInputs =
   7934           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
   7935           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
   7936       if ((NumFlippedAToBInputs == 1 &&
   7937            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
   7938           (NumFlippedBToBInputs == 1 &&
   7939            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
   7940         // We choose whether to fix the A half or B half based on whether that
   7941         // half has zero flipped inputs. At zero, we may not be able to fix it
   7942         // with that half. We also bias towards fixing the B half because that
   7943         // will more commonly be the high half, and we have to bias one way.
   7944         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
   7945                                                        ArrayRef<int> Inputs) {
   7946           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
   7947           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
   7948                                          PinnedIdx ^ 1) != Inputs.end();
   7949           // Determine whether the free index is in the flipped dword or the
   7950           // unflipped dword based on where the pinned index is. We use this bit
   7951           // in an xor to conditionally select the adjacent dword.
   7952           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
   7953           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
   7954                                              FixFreeIdx) != Inputs.end();
   7955           if (IsFixIdxInput == IsFixFreeIdxInput)
   7956             FixFreeIdx += 1;
   7957           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
   7958                                         FixFreeIdx) != Inputs.end();
   7959           assert(IsFixIdxInput != IsFixFreeIdxInput &&
   7960                  "We need to be changing the number of flipped inputs!");
   7961           int PSHUFHalfMask[] = {0, 1, 2, 3};
   7962           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
   7963           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
   7964                           MVT::v8i16, V,
   7965                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
   7966 
   7967           for (int &M : Mask)
   7968             if (M != -1 && M == FixIdx)
   7969               M = FixFreeIdx;
   7970             else if (M != -1 && M == FixFreeIdx)
   7971               M = FixIdx;
   7972         };
   7973         if (NumFlippedBToBInputs != 0) {
   7974           int BPinnedIdx =
   7975               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
   7976           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
   7977         } else {
   7978           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
   7979           int APinnedIdx =
   7980               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
   7981           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
   7982         }
   7983       }
   7984     }
   7985 
   7986     int PSHUFDMask[] = {0, 1, 2, 3};
   7987     PSHUFDMask[ADWord] = BDWord;
   7988     PSHUFDMask[BDWord] = ADWord;
   7989     V = DAG.getNode(ISD::BITCAST, DL, VT,
   7990                     DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
   7991                                 DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
   7992                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
   7993 
   7994     // Adjust the mask to match the new locations of A and B.
   7995     for (int &M : Mask)
   7996       if (M != -1 && M/2 == ADWord)
   7997         M = 2 * BDWord + M % 2;
   7998       else if (M != -1 && M/2 == BDWord)
   7999         M = 2 * ADWord + M % 2;
   8000 
   8001     // Recurse back into this routine to re-compute state now that this isn't
   8002     // a 3 and 1 problem.
   8003     return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
   8004                                                      DAG);
   8005   };
   8006   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
   8007     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
   8008   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
   8009     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
   8010 
   8011   // At this point there are at most two inputs to the low and high halves from
   8012   // each half. That means the inputs can always be grouped into dwords and
   8013   // those dwords can then be moved to the correct half with a dword shuffle.
   8014   // We use at most one low and one high word shuffle to collect these paired
   8015   // inputs into dwords, and finally a dword shuffle to place them.
   8016   int PSHUFLMask[4] = {-1, -1, -1, -1};
   8017   int PSHUFHMask[4] = {-1, -1, -1, -1};
   8018   int PSHUFDMask[4] = {-1, -1, -1, -1};
   8019 
   8020   // First fix the masks for all the inputs that are staying in their
   8021   // original halves. This will then dictate the targets of the cross-half
   8022   // shuffles.
   8023   auto fixInPlaceInputs =
   8024       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
   8025                     MutableArrayRef<int> SourceHalfMask,
   8026                     MutableArrayRef<int> HalfMask, int HalfOffset) {
   8027     if (InPlaceInputs.empty())
   8028       return;
   8029     if (InPlaceInputs.size() == 1) {
   8030       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   8031           InPlaceInputs[0] - HalfOffset;
   8032       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
   8033       return;
   8034     }
   8035     if (IncomingInputs.empty()) {
   8036       // Just fix all of the in place inputs.
   8037       for (int Input : InPlaceInputs) {
   8038         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
   8039         PSHUFDMask[Input / 2] = Input / 2;
   8040       }
   8041       return;
   8042     }
   8043 
   8044     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
   8045     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   8046         InPlaceInputs[0] - HalfOffset;
   8047     // Put the second input next to the first so that they are packed into
   8048     // a dword. We find the adjacent index by toggling the low bit.
   8049     int AdjIndex = InPlaceInputs[0] ^ 1;
   8050     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
   8051     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
   8052     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
   8053   };
   8054   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
   8055   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
   8056 
   8057   // Now gather the cross-half inputs and place them into a free dword of
   8058   // their target half.
   8059   // FIXME: This operation could almost certainly be simplified dramatically to
   8060   // look more like the 3-1 fixing operation.
   8061   auto moveInputsToRightHalf = [&PSHUFDMask](
   8062       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
   8063       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
   8064       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
   8065       int DestOffset) {
   8066     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
   8067       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
   8068     };
   8069     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
   8070                                                int Word) {
   8071       int LowWord = Word & ~1;
   8072       int HighWord = Word | 1;
   8073       return isWordClobbered(SourceHalfMask, LowWord) ||
   8074              isWordClobbered(SourceHalfMask, HighWord);
   8075     };
   8076 
   8077     if (IncomingInputs.empty())
   8078       return;
   8079 
   8080     if (ExistingInputs.empty()) {
   8081       // Map any dwords with inputs from them into the right half.
   8082       for (int Input : IncomingInputs) {
   8083         // If the source half mask maps over the inputs, turn those into
   8084         // swaps and use the swapped lane.
   8085         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
   8086           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
   8087             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
   8088                 Input - SourceOffset;
   8089             // We have to swap the uses in our half mask in one sweep.
   8090             for (int &M : HalfMask)
   8091               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
   8092                 M = Input;
   8093               else if (M == Input)
   8094                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   8095           } else {
   8096             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
   8097                        Input - SourceOffset &&
   8098                    "Previous placement doesn't match!");
   8099           }
   8100           // Note that this correctly re-maps both when we do a swap and when
   8101           // we observe the other side of the swap above. We rely on that to
   8102           // avoid swapping the members of the input list directly.
   8103           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   8104         }
   8105 
   8106         // Map the input's dword into the correct half.
   8107         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
   8108           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
   8109         else
   8110           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
   8111                      Input / 2 &&
   8112                  "Previous placement doesn't match!");
   8113       }
   8114 
   8115       // And just directly shift any other-half mask elements to be same-half
   8116       // as we will have mirrored the dword containing the element into the
   8117       // same position within that half.
   8118       for (int &M : HalfMask)
   8119         if (M >= SourceOffset && M < SourceOffset + 4) {
   8120           M = M - SourceOffset + DestOffset;
   8121           assert(M >= 0 && "This should never wrap below zero!");
   8122         }
   8123       return;
   8124     }
   8125 
   8126     // Ensure we have the input in a viable dword of its current half. This
   8127     // is particularly tricky because the original position may be clobbered
   8128     // by inputs being moved and *staying* in that half.
   8129     if (IncomingInputs.size() == 1) {
   8130       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   8131         int InputFixed = std::find(std::begin(SourceHalfMask),
   8132                                    std::end(SourceHalfMask), -1) -
   8133                          std::begin(SourceHalfMask) + SourceOffset;
   8134         SourceHalfMask[InputFixed - SourceOffset] =
   8135             IncomingInputs[0] - SourceOffset;
   8136         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
   8137                      InputFixed);
   8138         IncomingInputs[0] = InputFixed;
   8139       }
   8140     } else if (IncomingInputs.size() == 2) {
   8141       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
   8142           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   8143         // We have two non-adjacent or clobbered inputs we need to extract from
   8144         // the source half. To do this, we need to map them into some adjacent
   8145         // dword slot in the source mask.
   8146         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
   8147                               IncomingInputs[1] - SourceOffset};
   8148 
   8149         // If there is a free slot in the source half mask adjacent to one of
   8150         // the inputs, place the other input in it. We use (Index XOR 1) to
   8151         // compute an adjacent index.
   8152         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
   8153             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
   8154           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
   8155           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
   8156           InputsFixed[1] = InputsFixed[0] ^ 1;
   8157         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
   8158                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
   8159           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
   8160           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
   8161           InputsFixed[0] = InputsFixed[1] ^ 1;
   8162         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
   8163                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
   8164           // The two inputs are in the same DWord but it is clobbered and the
   8165           // adjacent DWord isn't used at all. Move both inputs to the free
   8166           // slot.
   8167           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
   8168           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
   8169           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
   8170           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
   8171         } else {
   8172           // The only way we hit this point is if there is no clobbering
   8173           // (because there are no off-half inputs to this half) and there is no
   8174           // free slot adjacent to one of the inputs. In this case, we have to
   8175           // swap an input with a non-input.
   8176           for (int i = 0; i < 4; ++i)
   8177             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
   8178                    "We can't handle any clobbers here!");
   8179           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
   8180                  "Cannot have adjacent inputs here!");
   8181 
   8182           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
   8183           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
   8184 
   8185           // We also have to update the final source mask in this case because
   8186           // it may need to undo the above swap.
   8187           for (int &M : FinalSourceHalfMask)
   8188             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
   8189               M = InputsFixed[1] + SourceOffset;
   8190             else if (M == InputsFixed[1] + SourceOffset)
   8191               M = (InputsFixed[0] ^ 1) + SourceOffset;
   8192 
   8193           InputsFixed[1] = InputsFixed[0] ^ 1;
   8194         }
   8195 
   8196         // Point everything at the fixed inputs.
   8197         for (int &M : HalfMask)
   8198           if (M == IncomingInputs[0])
   8199             M = InputsFixed[0] + SourceOffset;
   8200           else if (M == IncomingInputs[1])
   8201             M = InputsFixed[1] + SourceOffset;
   8202 
   8203         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
   8204         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
   8205       }
   8206     } else {
   8207       llvm_unreachable("Unhandled input size!");
   8208     }
   8209 
   8210     // Now hoist the DWord down to the right half.
   8211     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
   8212     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
   8213     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
   8214     for (int &M : HalfMask)
   8215       for (int Input : IncomingInputs)
   8216         if (M == Input)
   8217           M = FreeDWord * 2 + Input % 2;
   8218   };
   8219   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
   8220                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
   8221   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
   8222                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
   8223 
   8224   // Now enact all the shuffles we've computed to move the inputs into their
   8225   // target half.
   8226   if (!isNoopShuffleMask(PSHUFLMask))
   8227     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
   8228                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
   8229   if (!isNoopShuffleMask(PSHUFHMask))
   8230     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
   8231                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
   8232   if (!isNoopShuffleMask(PSHUFDMask))
   8233     V = DAG.getNode(ISD::BITCAST, DL, VT,
   8234                     DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
   8235                                 DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
   8236                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
   8237 
   8238   // At this point, each half should contain all its inputs, and we can then
   8239   // just shuffle them into their final position.
   8240   assert(std::count_if(LoMask.begin(), LoMask.end(),
   8241                        [](int M) { return M >= 4; }) == 0 &&
   8242          "Failed to lift all the high half inputs to the low mask!");
   8243   assert(std::count_if(HiMask.begin(), HiMask.end(),
   8244                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
   8245          "Failed to lift all the low half inputs to the high mask!");
   8246 
   8247   // Do a half shuffle for the low mask.
   8248   if (!isNoopShuffleMask(LoMask))
   8249     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
   8250                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
   8251 
   8252   // Do a half shuffle with the high mask after shifting its values down.
   8253   for (int &M : HiMask)
   8254     if (M >= 0)
   8255       M -= 4;
   8256   if (!isNoopShuffleMask(HiMask))
   8257     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
   8258                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
   8259 
   8260   return V;
   8261 }
   8262 
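// Illustrative sketch, not part of the lowering itself: a plain-C++ model of
// the three hardware shuffles whose masks the routine above computes. The
// model* helper names are hypothetical and undef (-1) lanes are ignored; the
// point is simply that PSHUFLW touches only the low four words, PSHUFHW only
// the high four, and PSHUFD moves whole dwords (adjacent word pairs), which is
// why the code above first packs inputs into dwords and only then moves them
// across halves.
static void modelPSHUFLW(const int In[8], const int M[4], int Out[8]) {
  for (int i = 0; i < 8; ++i)
    Out[i] = In[i];        // Copy everything; the high quad word is untouched.
  for (int i = 0; i < 4; ++i)
    Out[i] = In[M[i]];     // Permute the low four words by the 2-bit fields.
}
static void modelPSHUFHW(const int In[8], const int M[4], int Out[8]) {
  for (int i = 0; i < 8; ++i)
    Out[i] = In[i];        // Copy everything; the low quad word is untouched.
  for (int i = 0; i < 4; ++i)
    Out[4 + i] = In[4 + M[i]];
}
static void modelPSHUFD(const int In[8], const int M[4], int Out[8]) {
  for (int i = 0; i < 4; ++i) {
    Out[2 * i] = In[2 * M[i]];         // A dword shuffle moves word pairs
    Out[2 * i + 1] = In[2 * M[i] + 1]; // together, so paired inputs stay
  }                                    // adjacent across the move.
}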
   8263 /// \brief Helper to form a PSHUFB-based shuffle+blend.
   8264 static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
   8265                                           SDValue V2, ArrayRef<int> Mask,
   8266                                           SelectionDAG &DAG, bool &V1InUse,
   8267                                           bool &V2InUse) {
   8268   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   8269   SDValue V1Mask[16];
   8270   SDValue V2Mask[16];
   8271   V1InUse = false;
   8272   V2InUse = false;
   8273 
   8274   int Size = Mask.size();
   8275   int Scale = 16 / Size;
   8276   for (int i = 0; i < 16; ++i) {
   8277     if (Mask[i / Scale] == -1) {
   8278       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
   8279     } else {
   8280       const int ZeroMask = 0x80;
   8281       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
   8282                                           : ZeroMask;
   8283       int V2Idx = Mask[i / Scale] < Size
   8284                       ? ZeroMask
   8285                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
   8286       if (Zeroable[i / Scale])
   8287         V1Idx = V2Idx = ZeroMask;
   8288       V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
   8289       V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
   8290       V1InUse |= (ZeroMask != V1Idx);
   8291       V2InUse |= (ZeroMask != V2Idx);
   8292     }
   8293   }
   8294 
   8295   if (V1InUse)
   8296     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
   8297                      DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1),
   8298                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
   8299   if (V2InUse)
   8300     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
   8301                      DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2),
   8302                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
   8303 
   8304   // If we need shuffled inputs from both, blend the two.
   8305   SDValue V;
   8306   if (V1InUse && V2InUse)
   8307     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
   8308   else
   8309     V = V1InUse ? V1 : V2;
   8310 
   8311   // Cast the result back to the correct type.
   8312   return DAG.getNode(ISD::BITCAST, DL, VT, V);
   8313 }
   8314 
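// Illustrative sketch, standalone and with a hypothetical name: how the
// per-byte PSHUFB control values used above are derived when the shuffle mask
// has wider-than-byte elements. A control byte with the high bit set (0x80)
// makes PSHUFB write a zero, which is what allows the two shuffled inputs to
// be combined with a plain OR. The real routine additionally folds zeroable
// lanes into those 0x80 entries and builds SDValue constants instead.
static void modelPSHUFBMasks(const int *Mask, int Size, unsigned char V1Ctl[16],
                             unsigned char V2Ctl[16]) {
  const unsigned char ZeroByte = 0x80;
  int Scale = 16 / Size;               // Bytes per shuffle-mask element.
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / Scale];
    if (M < 0) {                       // Undef lane: any control value works.
      V1Ctl[i] = V2Ctl[i] = 0;
    } else if (M < Size) {             // Element comes from V1.
      V1Ctl[i] = (unsigned char)(M * Scale + i % Scale);
      V2Ctl[i] = ZeroByte;
    } else {                           // Element comes from V2.
      V1Ctl[i] = ZeroByte;
      V2Ctl[i] = (unsigned char)((M - Size) * Scale + i % Scale);
    }
  }
}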
   8315 /// \brief Generic lowering of 8-lane i16 shuffles.
   8316 ///
   8317 /// This handles both single-input shuffles and combined shuffle/blends with
   8318 /// two inputs. The single input shuffles are immediately delegated to
   8319 /// a dedicated lowering routine.
   8320 ///
   8321 /// The blends are lowered in one of three fundamental ways. If there are few
   8322 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
   8323 /// of the input is significantly cheaper when lowered as an interleaving of
   8324 /// the two inputs, try to interleave them. Otherwise, blend the low and high
   8325 /// halves of the inputs separately (making them have relatively few inputs)
   8326 /// and then concatenate them.
   8327 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   8328                                        const X86Subtarget *Subtarget,
   8329                                        SelectionDAG &DAG) {
   8330   SDLoc DL(Op);
   8331   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
   8332   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   8333   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   8334   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   8335   ArrayRef<int> OrigMask = SVOp->getMask();
   8336   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
   8337                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
   8338   MutableArrayRef<int> Mask(MaskStorage);
   8339 
   8340   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   8341 
   8342   // Whenever we can lower this as a zext, that instruction is strictly faster
   8343   // than any alternative.
   8344   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   8345           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
   8346     return ZExt;
   8347 
   8348   auto isV1 = [](int M) { return M >= 0 && M < 8; };
   8349   (void)isV1;
   8350   auto isV2 = [](int M) { return M >= 8; };
   8351 
   8352   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
   8353 
   8354   if (NumV2Inputs == 0) {
   8355     // Check for being able to broadcast a single element.
   8356     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
   8357                                                           Mask, Subtarget, DAG))
   8358       return Broadcast;
   8359 
   8360     // Try to use shift instructions.
   8361     if (SDValue Shift =
   8362             lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
   8363       return Shift;
   8364 
   8365     // Use dedicated unpack instructions for masks that match their pattern.
   8366     if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
   8367       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
   8368     if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
   8369       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
   8370 
   8371     // Try to use byte rotation instructions.
   8372     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
   8373                                                         Mask, Subtarget, DAG))
   8374       return Rotate;
   8375 
   8376     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
   8377                                                      Subtarget, DAG);
   8378   }
   8379 
   8380   assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
   8381          "All single-input shuffles should be canonicalized to be V1-input "
   8382          "shuffles.");
   8383 
   8384   // Try to use shift instructions.
   8385   if (SDValue Shift =
   8386           lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
   8387     return Shift;
   8388 
   8389   // There are special ways we can lower some single-element blends.
   8390   if (NumV2Inputs == 1)
   8391     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
   8392                                                          Mask, Subtarget, DAG))
   8393       return V;
   8394 
   8395   // We have different paths for blend lowering, but they all must use the
   8396   // *exact* same predicate.
   8397   bool IsBlendSupported = Subtarget->hasSSE41();
   8398   if (IsBlendSupported)
   8399     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
   8400                                                   Subtarget, DAG))
   8401       return Blend;
   8402 
   8403   if (SDValue Masked =
   8404           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
   8405     return Masked;
   8406 
   8407   // Use dedicated unpack instructions for masks that match their pattern.
   8408   if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
   8409     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
   8410   if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
   8411     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
   8412 
   8413   // Try to use byte rotation instructions.
   8414   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   8415           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
   8416     return Rotate;
   8417 
   8418   if (SDValue BitBlend =
   8419           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
   8420     return BitBlend;
   8421 
   8422   if (SDValue Unpack =
   8423           lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG))
   8424     return Unpack;
   8425 
   8426   // If we can't directly blend but can use PSHUFB, that will be better as it
   8427   // can both shuffle and set up the inefficient blend.
   8428   if (!IsBlendSupported && Subtarget->hasSSSE3()) {
   8429     bool V1InUse, V2InUse;
   8430     return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
   8431                                       V1InUse, V2InUse);
   8432   }
   8433 
   8434   // We can always bit-blend if we have to so the fallback strategy is to
   8435   // We can always bit-blend if we have to, so the fallback strategy is to
   8436   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
   8437                                                       Mask, DAG);
   8438 }
   8439 
   8440 /// \brief Check whether a compaction lowering can be done by dropping even
   8441 /// elements and compute how many times even elements must be dropped.
   8442 ///
   8443 /// This handles shuffles which take every Nth element where N is a power of
   8444 /// two. Example shuffle masks:
   8445 ///
   8446 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
   8447 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
   8448 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
   8449 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
   8450 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
   8451 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
   8452 ///
   8453 /// Any of these lanes can of course be undef.
   8454 ///
   8455 /// This routine only supports N <= 3.
   8456 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
   8457 /// for larger N.
   8458 ///
   8459 /// \returns N above, or the number of times even elements must be dropped if
   8460 /// there is such a number. Otherwise returns zero.
   8461 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
   8462   // Figure out whether we're looping over two inputs or just one.
   8463   bool IsSingleInput = isSingleInputShuffleMask(Mask);
   8464 
   8465   // The modulus for the shuffle vector entries is based on whether this is
   8466   // a single input or not.
   8467   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
   8468   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
   8469          "We should only be called with masks with a power-of-2 size!");
   8470 
   8471   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
   8472 
   8473   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
   8474   // and 2^3 simultaneously. This is because we may have ambiguity with
   8475   // partially undef inputs.
   8476   bool ViableForN[3] = {true, true, true};
   8477 
   8478   for (int i = 0, e = Mask.size(); i < e; ++i) {
   8479     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
   8480     // want.
   8481     if (Mask[i] == -1)
   8482       continue;
   8483 
   8484     bool IsAnyViable = false;
   8485     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
   8486       if (ViableForN[j]) {
   8487         uint64_t N = j + 1;
   8488 
   8489         // The shuffle mask must be equal to (i * 2^N) % ShuffleModulus.
   8490         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
   8491           IsAnyViable = true;
   8492         else
   8493           ViableForN[j] = false;
   8494       }
   8495     // Early exit if we exhaust the possible powers of two.
   8496     if (!IsAnyViable)
   8497       break;
   8498   }
   8499 
   8500   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
   8501     if (ViableForN[j])
   8502       return j + 1;
   8503 
   8504   // Return 0 as there is no viable power of two.
   8505   return 0;
   8506 }
   8507 
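// Illustrative sketch, standalone and with a hypothetical name: the same
// stride test the routine above performs, written against a plain array and
// checking each stride separately rather than all at once. For the first
// documented single-input example {0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
// 10, 12, 14}, ShuffleModulus is 16, only N == 1 survives the scan, and the
// function returns 1.
static int modelEvenDrops(const int *Mask, int Size, bool IsSingleInput) {
  int ShuffleModulus = Size * (IsSingleInput ? 1 : 2);
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;                      // Undef lanes match any stride.
      if (Mask[i] != ((i << N) & (ShuffleModulus - 1))) {
        Viable = false;
        break;
      }
    }
    if (Viable)
      return N;                        // Smallest viable power-of-2 stride.
  }
  return 0;                            // No power-of-two stride fits.
}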
   8508 /// \brief Generic lowering of v16i8 shuffles.
   8509 ///
   8510 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
   8511 /// detect any complexity reducing interleaving. If that doesn't help, it uses
   8512 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
   8513 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
   8514 /// back together.
   8515 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   8516                                        const X86Subtarget *Subtarget,
   8517                                        SelectionDAG &DAG) {
   8518   SDLoc DL(Op);
   8519   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
   8520   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   8521   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   8522   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   8523   ArrayRef<int> Mask = SVOp->getMask();
   8524   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   8525 
   8526   // Try to use shift instructions.
   8527   if (SDValue Shift =
   8528           lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
   8529     return Shift;
   8530 
   8531   // Try to use byte rotation instructions.
   8532   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   8533           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
   8534     return Rotate;
   8535 
   8536   // Try to use a zext lowering.
   8537   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   8538           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
   8539     return ZExt;
   8540 
   8541   int NumV2Elements =
   8542       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
   8543 
   8544   // For single-input shuffles, there are some nicer lowering tricks we can use.
   8545   if (NumV2Elements == 0) {
   8546     // Check for being able to broadcast a single element.
   8547     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
   8548                                                           Mask, Subtarget, DAG))
   8549       return Broadcast;
   8550 
   8551     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
   8552     // Notably, this handles splat and partial-splat shuffles more efficiently.
   8553     // However, it only makes sense if the pre-duplication shuffle simplifies
   8554     // things significantly. Currently, this means we need to be able to
   8555     // express the pre-duplication shuffle as an i16 shuffle.
   8556     //
   8557     // FIXME: We should check for other patterns which can be widened into an
   8558     // i16 shuffle as well.
   8559     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
   8560       for (int i = 0; i < 16; i += 2)
   8561         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
   8562           return false;
   8563 
   8564       return true;
   8565     };
   8566     auto tryToWidenViaDuplication = [&]() -> SDValue {
   8567       if (!canWidenViaDuplication(Mask))
   8568         return SDValue();
   8569       SmallVector<int, 4> LoInputs;
   8570       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
   8571                    [](int M) { return M >= 0 && M < 8; });
   8572       std::sort(LoInputs.begin(), LoInputs.end());
   8573       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
   8574                      LoInputs.end());
   8575       SmallVector<int, 4> HiInputs;
   8576       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
   8577                    [](int M) { return M >= 8; });
   8578       std::sort(HiInputs.begin(), HiInputs.end());
   8579       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
   8580                      HiInputs.end());
   8581 
   8582       bool TargetLo = LoInputs.size() >= HiInputs.size();
   8583       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
   8584       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
   8585 
   8586       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
   8587       SmallDenseMap<int, int, 8> LaneMap;
   8588       for (int I : InPlaceInputs) {
   8589         PreDupI16Shuffle[I/2] = I/2;
   8590         LaneMap[I] = I;
   8591       }
   8592       int j = TargetLo ? 0 : 4, je = j + 4;
   8593       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
   8594         // Check if j is already a shuffle of this input. This happens when
   8595         // there are two adjacent bytes after we move the low one.
   8596         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
   8597           // If we haven't yet mapped the input, search for a slot into which
   8598           // we can map it.
   8599           while (j < je && PreDupI16Shuffle[j] != -1)
   8600             ++j;
   8601 
   8602           if (j == je)
   8603             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
   8604             return SDValue();
   8605 
   8606           // Map this input with the i16 shuffle.
   8607           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
   8608         }
   8609 
   8610         // Update the lane map based on the mapping we ended up with.
   8611         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
   8612       }
   8613       V1 = DAG.getNode(
   8614           ISD::BITCAST, DL, MVT::v16i8,
   8615           DAG.getVectorShuffle(MVT::v8i16, DL,
   8616                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
   8617                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
   8618 
   8619       // Unpack the bytes to form the i16s that will be shuffled into place.
   8620       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
   8621                        MVT::v16i8, V1, V1);
   8622 
   8623       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   8624       for (int i = 0; i < 16; ++i)
   8625         if (Mask[i] != -1) {
   8626           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
   8627           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
   8628           if (PostDupI16Shuffle[i / 2] == -1)
   8629             PostDupI16Shuffle[i / 2] = MappedMask;
   8630           else
   8631             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
   8632                    "Conflicting entries in the original shuffle!");
   8633         }
   8634       return DAG.getNode(
   8635           ISD::BITCAST, DL, MVT::v16i8,
   8636           DAG.getVectorShuffle(MVT::v8i16, DL,
   8637                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
   8638                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
   8639     };
   8640     if (SDValue V = tryToWidenViaDuplication())
   8641       return V;
   8642   }
   8643 
   8644   // Use dedicated unpack instructions for masks that match their pattern.
   8645   if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
   8646                                          0, 16, 1, 17, 2, 18, 3, 19,
   8647                                          // High half.
   8648                                          4, 20, 5, 21, 6, 22, 7, 23}))
   8649     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
   8650   if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
   8651                                          8, 24, 9, 25, 10, 26, 11, 27,
   8652                                          // High half.
   8653                                          12, 28, 13, 29, 14, 30, 15, 31}))
   8654     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
   8655 
   8656   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
   8657   // with PSHUFB. It is important to do this before we attempt to generate any
   8658   // blends but after all of the single-input lowerings. If the single input
   8659   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
   8660   // want to preserve that and we can DAG combine any longer sequences into
   8661   // a PSHUFB in the end. But once we start blending from multiple inputs,
   8662   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
   8663   // and there are *very* few patterns that would actually be faster than the
   8664   // PSHUFB approach because of its ability to zero lanes.
   8665   //
   8666   // FIXME: The only exceptions to the above are blends which are exact
   8667   // interleavings with direct instructions supporting them. We currently don't
   8668   // handle those well here.
   8669   if (Subtarget->hasSSSE3()) {
   8670     bool V1InUse = false;
   8671     bool V2InUse = false;
   8672 
   8673     SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
   8674                                                 DAG, V1InUse, V2InUse);
   8675 
   8676     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
   8677     // do so. This avoids using them to handle blends-with-zero which is
   8678     // important as a single pshufb is significantly faster for that.
   8679     if (V1InUse && V2InUse) {
   8680       if (Subtarget->hasSSE41())
   8681         if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
   8682                                                       Mask, Subtarget, DAG))
   8683           return Blend;
   8684 
   8685       // We can use an unpack to do the blending rather than an or in some
   8686       // cases. Even though the or may be (very slightly) more efficient, we
   8687       // prefer this lowering because there are common cases where part of
   8688       // the complexity of the shuffles goes away when we do the final blend as
   8689       // an unpack.
   8690       // FIXME: It might be worth trying to detect if the unpack-feeding
   8691       // shuffles will both be pshufb, in which case we shouldn't bother with
   8692       // this.
   8693       if (SDValue Unpack =
   8694               lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG))
   8695         return Unpack;
   8696     }
   8697 
   8698     return PSHUFB;
   8699   }
   8700 
   8701   // There are special ways we can lower some single-element blends.
   8702   if (NumV2Elements == 1)
   8703     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
   8704                                                          Mask, Subtarget, DAG))
   8705       return V;
   8706 
   8707   if (SDValue BitBlend =
   8708           lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
   8709     return BitBlend;
   8710 
   8711   // Check whether a compaction lowering can be done. This handles shuffles
   8712   // which take every Nth element, where N is a power of two. See the helper
   8713   // function for details.
   8714   //
   8715   // We special case these as they can be particularly efficiently handled with
   8716   // the PACKUSWB instruction on x86 and they show up in common patterns of
   8717   // rearranging bytes to truncate wide elements.
   8718   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
   8719     // NumEvenDrops is the log2 of the stride of the elements. Another way of
   8720     // thinking about it is that we need to drop the even elements this many
   8721     // times to get from the original input to the desired result.
   8722     bool IsSingleInput = isSingleInputShuffleMask(Mask);
   8723 
   8724     // First we need to zero all the dropped bytes.
   8725     assert(NumEvenDrops <= 3 &&
   8726            "No support for dropping even elements more than 3 times.");
   8727     // We use the mask type to pick which bytes are preserved based on how many
   8728     // elements are dropped.
   8729     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
   8730     SDValue ByteClearMask =
   8731         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
   8732                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
   8733     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
   8734     if (!IsSingleInput)
   8735       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
   8736 
   8737     // Now pack things back together.
   8738     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
   8739     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
   8740     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
   8741     for (int i = 1; i < NumEvenDrops; ++i) {
   8742       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
   8743       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
   8744     }
   8745 
   8746     return Result;
   8747   }
   8748 
   8749   // Handle multi-input cases by blending single-input shuffles.
   8750   if (NumV2Elements > 0)
   8751     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
   8752                                                       Mask, DAG);
   8753 
   8754   // The fallback path for single-input shuffles widens this into two v8i16
   8755   // vectors with unpacks, shuffles those, and then pulls them back together
   8756   // with a pack.
   8757   SDValue V = V1;
   8758 
   8759   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   8760   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   8761   for (int i = 0; i < 16; ++i)
   8762     if (Mask[i] >= 0)
   8763       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
   8764 
   8765   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
   8766 
   8767   SDValue VLoHalf, VHiHalf;
   8768   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
   8769   // them out and avoid using UNPCK{L,H} to extract the elements of V as
   8770   // i16s.
   8771   if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
   8772                    [](int M) { return M >= 0 && M % 2 == 1; }) &&
   8773       std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
   8774                    [](int M) { return M >= 0 && M % 2 == 1; })) {
   8775     // Use a mask to drop the high bytes.
   8776     VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
   8777     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
   8778                      DAG.getConstant(0x00FF, MVT::v8i16));
   8779 
   8780     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
   8781     VHiHalf = DAG.getUNDEF(MVT::v8i16);
   8782 
   8783     // Squash the masks to point directly into VLoHalf.
   8784     for (int &M : LoBlendMask)
   8785       if (M >= 0)
   8786         M /= 2;
   8787     for (int &M : HiBlendMask)
   8788       if (M >= 0)
   8789         M /= 2;
   8790   } else {
   8791     // Otherwise just unpack the low half of V into VLoHalf and the high half into
   8792     // VHiHalf so that we can blend them as i16s.
   8793     VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
   8794                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
   8795     VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
   8796                      DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
   8797   }
   8798 
   8799   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
   8800   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
   8801 
   8802   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
   8803 }
   8804 
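// Illustrative sketch, standalone and with a hypothetical name: why the
// AND-with-0x00FF plus PACKUS sequence in the compaction path above selects
// every other byte. With the high byte of each 16-bit word cleared, unsigned
// saturation never triggers, so PACKUSWB simply keeps the even-indexed
// (little-endian low) byte of every word of its two operands.
static void modelPackEvenBytes(const unsigned char A[16],
                               const unsigned char B[16],
                               unsigned char Out[16]) {
  for (int i = 0; i < 8; ++i) {
    Out[i] = A[2 * i];      // Low result half: even bytes of the first input.
    Out[8 + i] = B[2 * i];  // High result half: even bytes of the second.
  }
}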
   8805 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
   8806 ///
   8807 /// This routine breaks down the specific type of 128-bit shuffle and
   8808 /// dispatches to the lowering routines accordingly.
   8809 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   8810                                         MVT VT, const X86Subtarget *Subtarget,
   8811                                         SelectionDAG &DAG) {
   8812   switch (VT.SimpleTy) {
   8813   case MVT::v2i64:
   8814     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   8815   case MVT::v2f64:
   8816     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   8817   case MVT::v4i32:
   8818     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   8819   case MVT::v4f32:
   8820     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   8821   case MVT::v8i16:
   8822     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
   8823   case MVT::v16i8:
   8824     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
   8825 
   8826   default:
   8827     llvm_unreachable("Unimplemented!");
   8828   }
   8829 }
   8830 
   8831 /// \brief Helper function to test whether a shuffle mask could be
   8832 /// simplified by widening the elements being shuffled.
   8833 ///
   8834 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
   8835 /// leaves it in an unspecified state.
   8836 ///
   8837 /// NOTE: This must handle normal vector shuffle masks and *target* vector
   8838 /// shuffle masks. The latter have the special property of a '-2' representing
   8839 /// a zero-ed lane of a vector.
   8840 static bool canWidenShuffleElements(ArrayRef<int> Mask,
   8841                                     SmallVectorImpl<int> &WidenedMask) {
   8842   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
   8843     // If both elements are undef, its trivial.
   8844     // If both elements are undef, it's trivial.
   8845       WidenedMask.push_back(SM_SentinelUndef);
   8846       continue;
   8847     }
   8848 
   8849     // Check for an undef mask and a mask value properly aligned to fit with
   8850     // a pair of values. If we find such a case, use the non-undef mask's value.
   8851     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
   8852       WidenedMask.push_back(Mask[i + 1] / 2);
   8853       continue;
   8854     }
   8855     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
   8856       WidenedMask.push_back(Mask[i] / 2);
   8857       continue;
   8858     }
   8859 
   8860     // When zeroing, we need to spread the zeroing across both lanes to widen.
   8861     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
   8862       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
   8863           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
   8864         WidenedMask.push_back(SM_SentinelZero);
   8865         continue;
   8866       }
   8867       return false;
   8868     }
   8869 
   8870     // Finally check if the two mask values are adjacent and aligned with
   8871     // a pair.
   8872     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
   8873       WidenedMask.push_back(Mask[i] / 2);
   8874       continue;
   8875     }
   8876 
   8877     // Otherwise we can't safely widen the elements used in this shuffle.
   8878     return false;
   8879   }
   8880   assert(WidenedMask.size() == Mask.size() / 2 &&
   8881          "Incorrect size of mask after widening the elements!");
   8882 
   8883   return true;
   8884 }
   8885 
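// Illustrative sketch, standalone, with a hypothetical name, and simplified to
// only the -1/undef sentinel (the real routine above also handles the -2/zero
// sentinel of target shuffle masks): widening a mask to half as many
// double-width elements. For example, {0, 1, 6, 7, -1, 5, 2, 3} widens to
// {0, 3, 2, 1}; the {-1, 5} pair resolves to wide element 2 because its
// defined half is properly aligned, whereas a pair such as {0, 2} would make
// the whole mask unwidenable.
static bool modelWidenMask(const int *Mask, int Size, int *Widened) {
  for (int i = 0; i < Size; i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0)
      Widened[i / 2] = -1;                   // Both undef: stays undef.
    else if (Lo < 0 && Hi % 2 == 1)
      Widened[i / 2] = Hi / 2;               // Undef low, aligned high half.
    else if (Hi < 0 && Lo % 2 == 0)
      Widened[i / 2] = Lo / 2;               // Aligned low half, undef high.
    else if (Lo >= 0 && Lo % 2 == 0 && Lo + 1 == Hi)
      Widened[i / 2] = Lo / 2;               // Adjacent, properly aligned pair.
    else
      return false;                          // This pair cannot be widened.
  }
  return true;
}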
   8886 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
   8887 ///
   8888 /// This routine just extracts two subvectors, shuffles them independently, and
   8889 /// then concatenates them back together. This should work effectively with all
   8890 /// AVX vector shuffle types.
   8891 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
   8892                                           SDValue V2, ArrayRef<int> Mask,
   8893                                           SelectionDAG &DAG) {
   8894   assert(VT.getSizeInBits() >= 256 &&
   8895          "Only for 256-bit or wider vector shuffles!");
   8896   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
   8897   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
   8898 
   8899   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
   8900   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
   8901 
   8902   int NumElements = VT.getVectorNumElements();
   8903   int SplitNumElements = NumElements / 2;
   8904   MVT ScalarVT = VT.getScalarType();
   8905   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
   8906 
   8907   // Rather than splitting build-vectors, just build two narrower build
   8908   // vectors. This helps shuffling with splats and zeros.
   8909   auto SplitVector = [&](SDValue V) {
   8910     while (V.getOpcode() == ISD::BITCAST)
   8911       V = V->getOperand(0);
   8912 
   8913     MVT OrigVT = V.getSimpleValueType();
   8914     int OrigNumElements = OrigVT.getVectorNumElements();
   8915     int OrigSplitNumElements = OrigNumElements / 2;
   8916     MVT OrigScalarVT = OrigVT.getScalarType();
   8917     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
   8918 
   8919     SDValue LoV, HiV;
   8920 
   8921     auto *BV = dyn_cast<BuildVectorSDNode>(V);
   8922     if (!BV) {
   8923       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
   8924                         DAG.getIntPtrConstant(0));
   8925       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
   8926                         DAG.getIntPtrConstant(OrigSplitNumElements));
   8927     } else {
   8928 
   8929       SmallVector<SDValue, 16> LoOps, HiOps;
   8930       for (int i = 0; i < OrigSplitNumElements; ++i) {
   8931         LoOps.push_back(BV->getOperand(i));
   8932         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
   8933       }
   8934       LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
   8935       HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
   8936     }
   8937     return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
   8938                           DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
   8939   };
   8940 
   8941   SDValue LoV1, HiV1, LoV2, HiV2;
   8942   std::tie(LoV1, HiV1) = SplitVector(V1);
   8943   std::tie(LoV2, HiV2) = SplitVector(V2);
   8944 
   8945   // Now create two 4-way blends of these half-width vectors.
   8946   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
   8947     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
   8948     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
   8949     for (int i = 0; i < SplitNumElements; ++i) {
   8950       int M = HalfMask[i];
   8951       if (M >= NumElements) {
   8952         if (M >= NumElements + SplitNumElements)
   8953           UseHiV2 = true;
   8954         else
   8955           UseLoV2 = true;
   8956         V2BlendMask.push_back(M - NumElements);
   8957         V1BlendMask.push_back(-1);
   8958         BlendMask.push_back(SplitNumElements + i);
   8959       } else if (M >= 0) {
   8960         if (M >= SplitNumElements)
   8961           UseHiV1 = true;
   8962         else
   8963           UseLoV1 = true;
   8964         V2BlendMask.push_back(-1);
   8965         V1BlendMask.push_back(M);
   8966         BlendMask.push_back(i);
   8967       } else {
   8968         V2BlendMask.push_back(-1);
   8969         V1BlendMask.push_back(-1);
   8970         BlendMask.push_back(-1);
   8971       }
   8972     }
   8973 
   8974     // Because the lowering happens after all combining takes place, we need to
   8975     // manually combine these blend masks as much as possible so that we create
   8976     // a minimal number of high-level vector shuffle nodes.
   8977 
   8978     // First try just blending the halves of V1 or V2.
   8979     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
   8980       return DAG.getUNDEF(SplitVT);
   8981     if (!UseLoV2 && !UseHiV2)
   8982       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
   8983     if (!UseLoV1 && !UseHiV1)
   8984       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
   8985 
   8986     SDValue V1Blend, V2Blend;
   8987     if (UseLoV1 && UseHiV1) {
   8988       V1Blend =
   8989         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
   8990     } else {
   8991       // We only use half of V1 so map the usage down into the final blend mask.
   8992       V1Blend = UseLoV1 ? LoV1 : HiV1;
   8993       for (int i = 0; i < SplitNumElements; ++i)
   8994         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
   8995           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
   8996     }
   8997     if (UseLoV2 && UseHiV2) {
   8998       V2Blend =
   8999         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
   9000     } else {
   9001       // We only use half of V2 so map the usage down into the final blend mask.
   9002       V2Blend = UseLoV2 ? LoV2 : HiV2;
   9003       for (int i = 0; i < SplitNumElements; ++i)
   9004         if (BlendMask[i] >= SplitNumElements)
   9005           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
   9006     }
   9007     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
   9008   };
   9009   SDValue Lo = HalfBlend(LoMask);
   9010   SDValue Hi = HalfBlend(HiMask);
   9011   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
   9012 }
   9013 
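// Illustrative sketch, standalone and with a hypothetical name: how one half
// of a wide shuffle mask is rewritten into the three narrower masks used in
// HalfBlend above. Indices below NumElements refer to V1 and are routed
// through the first half-width blend; indices of NumElements or more refer to
// V2 and are routed through the second. The real code then further collapses
// the cases where only one half of V1 or V2 is actually used.
static void modelHalfBlendMasks(const int *HalfMask, int SplitNumElements,
                                int NumElements, int *V1BlendMask,
                                int *V2BlendMask, int *BlendMask) {
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {
      V1BlendMask[i] = -1;
      V2BlendMask[i] = M - NumElements;     // Index into the split V2 halves.
      BlendMask[i] = SplitNumElements + i;  // Take lane i from the V2 blend.
    } else if (M >= 0) {
      V1BlendMask[i] = M;                   // Index into the split V1 halves.
      V2BlendMask[i] = -1;
      BlendMask[i] = i;                     // Take lane i from the V1 blend.
    } else {
      V1BlendMask[i] = V2BlendMask[i] = BlendMask[i] = -1;
    }
  }
}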
   9014 /// \brief Either split a vector in halves or decompose the shuffles and the
   9015 /// blend.
   9016 ///
   9017 /// This is provided as a good fallback for many lowerings of non-single-input
   9018 /// shuffles with more than one 128-bit lane. In those cases, we want to select
   9019 /// between splitting the shuffle into 128-bit components and stitching those
   9020 /// back together vs. extracting the single-input shuffles and blending those
   9021 /// results.
   9022 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
   9023                                                 SDValue V2, ArrayRef<int> Mask,
   9024                                                 SelectionDAG &DAG) {
   9025   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
   9026                                             "lower single-input shuffles as it "
   9027                                             "could then recurse on itself.");
   9028   int Size = Mask.size();
   9029 
   9030   // If this can be modeled as a broadcast of two elements followed by a blend,
   9031   // prefer that lowering. This is especially important because broadcasts can
   9032   // often fold with memory operands.
   9033   auto DoBothBroadcast = [&] {
   9034     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
   9035     for (int M : Mask)
   9036       if (M >= Size) {
   9037         if (V2BroadcastIdx == -1)
   9038           V2BroadcastIdx = M - Size;
   9039         else if (M - Size != V2BroadcastIdx)
   9040           return false;
   9041       } else if (M >= 0) {
   9042         if (V1BroadcastIdx == -1)
   9043           V1BroadcastIdx = M;
   9044         else if (M != V1BroadcastIdx)
   9045           return false;
   9046       }
   9047     return true;
   9048   };
   9049   if (DoBothBroadcast())
   9050     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
   9051                                                       DAG);
   9052 
   9053   // If the inputs all stem from a single 128-bit lane of each input, then we
   9054   // split them rather than blending because the split will decompose to
   9055   // unusually few instructions.
   9056   int LaneCount = VT.getSizeInBits() / 128;
   9057   int LaneSize = Size / LaneCount;
   9058   SmallBitVector LaneInputs[2];
   9059   LaneInputs[0].resize(LaneCount, false);
   9060   LaneInputs[1].resize(LaneCount, false);
   9061   for (int i = 0; i < Size; ++i)
   9062     if (Mask[i] >= 0)
   9063       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
   9064   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
   9065     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   9066 
   9067   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
   9068   // that the decomposed single-input shuffles don't end up here.
   9069   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
   9070 }
   9071 
   9072 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
   9073 /// a permutation and blend of those lanes.
   9074 ///
   9075 /// This essentially blends the out-of-lane inputs to each lane into the lane
   9076 /// from a permuted copy of the vector. This lowering strategy results in four
   9077 /// instructions in the worst case for a single-input cross lane shuffle which
   9078 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
   9079 /// of. Special cases for each particular shuffle pattern should be handled
   9080 /// prior to trying this lowering.
   9081 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
   9082                                                        SDValue V1, SDValue V2,
   9083                                                        ArrayRef<int> Mask,
   9084                                                        SelectionDAG &DAG) {
   9085   // FIXME: This should probably be generalized for 512-bit vectors as well.
   9086   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
   9087   int LaneSize = Mask.size() / 2;
   9088 
   9089   // If there are only inputs from one 128-bit lane, splitting will in fact be
   9090   // less expensive. The flags track whether the given lane contains an element
   9091   // that crosses to another lane.
   9092   bool LaneCrossing[2] = {false, false};
   9093   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   9094     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
   9095       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
   9096   if (!LaneCrossing[0] || !LaneCrossing[1])
   9097     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   9098 
   9099   if (isSingleInputShuffleMask(Mask)) {
   9100     SmallVector<int, 32> FlippedBlendMask;
   9101     for (int i = 0, Size = Mask.size(); i < Size; ++i)
   9102       FlippedBlendMask.push_back(
   9103           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
   9104                                   ? Mask[i]
   9105                                   : Mask[i] % LaneSize +
   9106                                         (i / LaneSize) * LaneSize + Size));
   9107 
    9108     // Flip the vector, and blend the results which should now be in-lane. The
    9109     // VPERM2X128 mask uses the low 2 bits to select the source for the low half
    9110     // of the destination, and bits 4 and 5 for the high half. The value 3
    9111     // selects the high half of source 2 and the value 2 selects the low half of
    9112     // source 2. We only use source 2 to allow folding it into a memory operand.
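             // For illustration: with PERMMask = 0x23 below, Flipped holds V1's two
             // 128-bit lanes swapped (high lane first), so every element of the
             // original mask can then be taken in-lane from either V1 or Flipped.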
   9113     unsigned PERMMask = 3 | 2 << 4;
   9114     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
   9115                                   V1, DAG.getConstant(PERMMask, MVT::i8));
   9116     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
   9117   }
   9118 
   9119   // This now reduces to two single-input shuffles of V1 and V2 which at worst
   9120   // will be handled by the above logic and a blend of the results, much like
   9121   // other patterns in AVX.
   9122   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
   9123 }
   9124 
   9125 /// \brief Handle lowering 2-lane 128-bit shuffles.
   9126 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
   9127                                         SDValue V2, ArrayRef<int> Mask,
   9128                                         const X86Subtarget *Subtarget,
   9129                                         SelectionDAG &DAG) {
   9130   // TODO: If minimizing size and one of the inputs is a zero vector and the
    9131   // zero vector has only one use, we could use a VPERM2X128 to save the
   9132   // instruction bytes needed to explicitly generate the zero vector.
   9133 
   9134   // Blends are faster and handle all the non-lane-crossing cases.
   9135   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
   9136                                                 Subtarget, DAG))
   9137     return Blend;
   9138 
   9139   bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
   9140   bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
   9141 
   9142   // If either input operand is a zero vector, use VPERM2X128 because its mask
   9143   // allows us to replace the zero input with an implicit zero.
   9144   if (!IsV1Zero && !IsV2Zero) {
   9145     // Check for patterns which can be matched with a single insert of a 128-bit
   9146     // subvector.
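             // For illustration: a mask equivalent to {0, 1, 4, 5} is just
             // <V1.lo, V2.lo>, i.e. a single 128-bit subvector insertion into V1,
             // while {0, 1, 0, 1} needs only V1's low half in both positions.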
   9147     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
   9148     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
   9149       MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
   9150                                    VT.getVectorNumElements() / 2);
   9151       SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
   9152                                 DAG.getIntPtrConstant(0));
   9153       SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
   9154                                 OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0));
   9155       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
   9156     }
   9157   }
   9158 
   9159   // Otherwise form a 128-bit permutation. After accounting for undefs,
   9160   // convert the 64-bit shuffle mask selection values into 128-bit
   9161   // selection bits by dividing the indexes by 2 and shifting into positions
   9162   // defined by a vperm2*128 instruction's immediate control byte.
   9163 
   9164   // The immediate permute control byte looks like this:
   9165   //    [1:0] - select 128 bits from sources for low half of destination
   9166   //    [2]   - ignore
   9167   //    [3]   - zero low half of destination
   9168   //    [5:4] - select 128 bits from sources for high half of destination
   9169   //    [6]   - ignore
   9170   //    [7]   - zero high half of destination
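           // For illustration: a mask of {2, 3, 4, 5} gives MaskLO = 2 and MaskHI = 4
           // below, so PermMask = (2 / 2) | (4 / 2) << 4 = 0x21, selecting V1's high
           // half for the low half of the destination and V2's low half for the high
           // half.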
   9171 
   9172   int MaskLO = Mask[0];
   9173   if (MaskLO == SM_SentinelUndef)
   9174     MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
   9175 
   9176   int MaskHI = Mask[2];
   9177   if (MaskHI == SM_SentinelUndef)
   9178     MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
   9179 
   9180   unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
   9181 
   9182   // If either input is a zero vector, replace it with an undef input.
   9183   // Shuffle mask values <  4 are selecting elements of V1.
   9184   // Shuffle mask values >= 4 are selecting elements of V2.
   9185   // Adjust each half of the permute mask by clearing the half that was
   9186   // selecting the zero vector and setting the zero mask bit.
   9187   if (IsV1Zero) {
   9188     V1 = DAG.getUNDEF(VT);
   9189     if (MaskLO < 4)
   9190       PermMask = (PermMask & 0xf0) | 0x08;
   9191     if (MaskHI < 4)
   9192       PermMask = (PermMask & 0x0f) | 0x80;
   9193   }
   9194   if (IsV2Zero) {
   9195     V2 = DAG.getUNDEF(VT);
   9196     if (MaskLO >= 4)
   9197       PermMask = (PermMask & 0xf0) | 0x08;
   9198     if (MaskHI >= 4)
   9199       PermMask = (PermMask & 0x0f) | 0x80;
   9200   }
   9201 
   9202   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
   9203                      DAG.getConstant(PermMask, MVT::i8));
   9204 }
   9205 
   9206 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
   9207 /// shuffling each lane.
   9208 ///
   9209 /// This will only succeed when the result of fixing the 128-bit lanes results
   9210 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
   9211 /// each 128-bit lanes. This handles many cases where we can quickly blend away
   9212 /// the lane crosses early and then use simpler shuffles within each lane.
   9213 ///
   9214 /// FIXME: It might be worthwhile at some point to support this without
   9215 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
   9216 /// in x86 only floating point has interesting non-repeating shuffles, and even
   9217 /// those are still *marginally* more expensive.
   9218 static SDValue lowerVectorShuffleByMerging128BitLanes(
   9219     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   9220     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   9221   assert(!isSingleInputShuffleMask(Mask) &&
   9222          "This is only useful with multiple inputs.");
   9223 
   9224   int Size = Mask.size();
   9225   int LaneSize = 128 / VT.getScalarSizeInBits();
   9226   int NumLanes = Size / LaneSize;
   9227   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
   9228 
   9229   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
   9230   // check whether the in-128-bit lane shuffles share a repeating pattern.
   9231   SmallVector<int, 4> Lanes;
   9232   Lanes.resize(NumLanes, -1);
   9233   SmallVector<int, 4> InLaneMask;
   9234   InLaneMask.resize(LaneSize, -1);
   9235   for (int i = 0; i < Size; ++i) {
   9236     if (Mask[i] < 0)
   9237       continue;
   9238 
   9239     int j = i / LaneSize;
   9240 
   9241     if (Lanes[j] < 0) {
   9242       // First entry we've seen for this lane.
   9243       Lanes[j] = Mask[i] / LaneSize;
   9244     } else if (Lanes[j] != Mask[i] / LaneSize) {
   9245       // This doesn't match the lane selected previously!
   9246       return SDValue();
   9247     }
   9248 
   9249     // Check that within each lane we have a consistent shuffle mask.
   9250     int k = i % LaneSize;
   9251     if (InLaneMask[k] < 0) {
   9252       InLaneMask[k] = Mask[i] % LaneSize;
   9253     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
   9254       // This doesn't fit a repeating in-lane mask.
   9255       return SDValue();
   9256     }
   9257   }
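           // For illustration: a v8f32 mask of {9, 8, 11, 10, 1, 0, 3, 2} selects lane
           // 0 of V2 for the low half and lane 0 of V1 for the high half, with the
           // repeated in-lane mask {1, 0, 3, 2}; the lane shuffle below builds that
           // lane arrangement and the final shuffle applies {1, 0, 3, 2, 5, 4, 7, 6}
           // within the lanes.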
   9258 
   9259   // First shuffle the lanes into place.
   9260   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
   9261                                 VT.getSizeInBits() / 64);
   9262   SmallVector<int, 8> LaneMask;
   9263   LaneMask.resize(NumLanes * 2, -1);
   9264   for (int i = 0; i < NumLanes; ++i)
   9265     if (Lanes[i] >= 0) {
    9266       LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
    9267       LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
   9268     }
   9269 
   9270   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
   9271   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
   9272   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
   9273 
   9274   // Cast it back to the type we actually want.
   9275   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
   9276 
   9277   // Now do a simple shuffle that isn't lane crossing.
   9278   SmallVector<int, 8> NewMask;
   9279   NewMask.resize(Size, -1);
   9280   for (int i = 0; i < Size; ++i)
   9281     if (Mask[i] >= 0)
   9282       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
   9283   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
   9284          "Must not introduce lane crosses at this point!");
   9285 
   9286   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
   9287 }
   9288 
   9289 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
   9290 /// given mask.
   9291 ///
   9292 /// This returns true if the elements from a particular input are already in the
   9293 /// slot required by the given mask and require no permutation.
   9294 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
   9295   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
   9296   int Size = Mask.size();
   9297   for (int i = 0; i < Size; ++i)
   9298     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
   9299       return false;
   9300 
   9301   return true;
   9302 }
   9303 
   9304 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
   9305 ///
   9306 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
   9307 /// isn't available.
   9308 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9309                                        const X86Subtarget *Subtarget,
   9310                                        SelectionDAG &DAG) {
   9311   SDLoc DL(Op);
   9312   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
   9313   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
   9314   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9315   ArrayRef<int> Mask = SVOp->getMask();
   9316   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   9317 
   9318   SmallVector<int, 4> WidenedMask;
   9319   if (canWidenShuffleElements(Mask, WidenedMask))
   9320     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
   9321                                     DAG);
   9322 
   9323   if (isSingleInputShuffleMask(Mask)) {
   9324     // Check for being able to broadcast a single element.
   9325     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
   9326                                                           Mask, Subtarget, DAG))
   9327       return Broadcast;
   9328 
   9329     // Use low duplicate instructions for masks that match their pattern.
   9330     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
   9331       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
   9332 
   9333     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
    9334       // Non-half-crossing single input shuffles can be lowered with an
   9335       // interleaved permutation.
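               // For illustration: Mask = {1, 0, 3, 2} produces VPERMILPMask = 0b0101,
               // which swaps the two doubles within each 128-bit lane.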
   9336       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
   9337                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
   9338       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
   9339                          DAG.getConstant(VPERMILPMask, MVT::i8));
   9340     }
   9341 
   9342     // With AVX2 we have direct support for this permutation.
   9343     if (Subtarget->hasAVX2())
   9344       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
   9345                          getV4X86ShuffleImm8ForMask(Mask, DAG));
   9346 
   9347     // Otherwise, fall back.
   9348     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
   9349                                                    DAG);
   9350   }
   9351 
   9352   // X86 has dedicated unpack instructions that can handle specific blend
   9353   // operations: UNPCKH and UNPCKL.
   9354   if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
   9355     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
   9356   if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
   9357     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
   9358   if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
   9359     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
   9360   if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
   9361     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
   9362 
   9363   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
   9364                                                 Subtarget, DAG))
   9365     return Blend;
   9366 
   9367   // Check if the blend happens to exactly fit that of SHUFPD.
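           // For illustration: a mask of {1, 4, 3, 6} fits the first pattern below and
           // encodes as SHUFPDMask = 0b0101, i.e. SHUFPD picking V1[1], V2[0], V1[3],
           // and V2[2].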
   9368   if ((Mask[0] == -1 || Mask[0] < 2) &&
   9369       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
   9370       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
   9371       (Mask[3] == -1 || Mask[3] >= 6)) {
   9372     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
   9373                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
   9374     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
   9375                        DAG.getConstant(SHUFPDMask, MVT::i8));
   9376   }
   9377   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
   9378       (Mask[1] == -1 || Mask[1] < 2) &&
   9379       (Mask[2] == -1 || Mask[2] >= 6) &&
   9380       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
   9381     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
   9382                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
   9383     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
   9384                        DAG.getConstant(SHUFPDMask, MVT::i8));
   9385   }
   9386 
   9387   // Try to simplify this by merging 128-bit lanes to enable a lane-based
    9388   // shuffle. However, if we have AVX2 and either input is already in place, we
    9389   // will be able to shuffle the other input across lanes in a single
    9390   // instruction, so skip this pattern.
   9391   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
   9392                                  isShuffleMaskInputInPlace(1, Mask))))
   9393     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   9394             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
   9395       return Result;
   9396 
    9397   // If we have AVX2 then we always want to lower with a blend because at v4 we
   9398   // can fully permute the elements.
   9399   if (Subtarget->hasAVX2())
   9400     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
   9401                                                       Mask, DAG);
   9402 
   9403   // Otherwise fall back on generic lowering.
   9404   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
   9405 }
   9406 
   9407 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
   9408 ///
   9409 /// This routine is only called when we have AVX2 and thus a reasonable
    9410 /// instruction set for v4i64 shuffling.
   9411 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9412                                        const X86Subtarget *Subtarget,
   9413                                        SelectionDAG &DAG) {
   9414   SDLoc DL(Op);
   9415   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
   9416   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
   9417   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9418   ArrayRef<int> Mask = SVOp->getMask();
   9419   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   9420   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
   9421 
   9422   SmallVector<int, 4> WidenedMask;
   9423   if (canWidenShuffleElements(Mask, WidenedMask))
   9424     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
   9425                                     DAG);
   9426 
   9427   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
   9428                                                 Subtarget, DAG))
   9429     return Blend;
   9430 
   9431   // Check for being able to broadcast a single element.
   9432   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
   9433                                                         Mask, Subtarget, DAG))
   9434     return Broadcast;
   9435 
   9436   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
   9437   // use lower latency instructions that will operate on both 128-bit lanes.
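           // For illustration: a single-input v4i64 mask of {1, 0, 3, 2} repeats as
           // {1, 0} in each lane and is lowered below as the v8i32 PSHUFD mask
           // {2, 3, 0, 1}, swapping the two 64-bit halves of each 128-bit lane.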
   9438   SmallVector<int, 2> RepeatedMask;
   9439   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
   9440     if (isSingleInputShuffleMask(Mask)) {
   9441       int PSHUFDMask[] = {-1, -1, -1, -1};
   9442       for (int i = 0; i < 2; ++i)
   9443         if (RepeatedMask[i] >= 0) {
   9444           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
   9445           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
   9446         }
   9447       return DAG.getNode(
   9448           ISD::BITCAST, DL, MVT::v4i64,
   9449           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
   9450                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
   9451                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
   9452     }
   9453   }
   9454 
   9455   // AVX2 provides a direct instruction for permuting a single input across
   9456   // lanes.
   9457   if (isSingleInputShuffleMask(Mask))
   9458     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
   9459                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   9460 
   9461   // Try to use shift instructions.
   9462   if (SDValue Shift =
   9463           lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
   9464     return Shift;
   9465 
   9466   // Use dedicated unpack instructions for masks that match their pattern.
   9467   if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
   9468     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
   9469   if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
   9470     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
   9471   if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
   9472     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
   9473   if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
   9474     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
   9475 
   9476   // Try to simplify this by merging 128-bit lanes to enable a lane-based
    9477   // shuffle. However, if we have AVX2 and either input is already in place, we
    9478   // will be able to shuffle the other input across lanes in a single
    9479   // instruction, so skip this pattern.
   9480   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
   9481                                  isShuffleMaskInputInPlace(1, Mask))))
   9482     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   9483             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
   9484       return Result;
   9485 
   9486   // Otherwise fall back on generic blend lowering.
   9487   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
   9488                                                     Mask, DAG);
   9489 }
   9490 
   9491 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
   9492 ///
   9493 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
   9494 /// isn't available.
   9495 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9496                                        const X86Subtarget *Subtarget,
   9497                                        SelectionDAG &DAG) {
   9498   SDLoc DL(Op);
   9499   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
   9500   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
   9501   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9502   ArrayRef<int> Mask = SVOp->getMask();
   9503   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   9504 
   9505   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
   9506                                                 Subtarget, DAG))
   9507     return Blend;
   9508 
   9509   // Check for being able to broadcast a single element.
   9510   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
   9511                                                         Mask, Subtarget, DAG))
   9512     return Broadcast;
   9513 
   9514   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   9515   // options to efficiently lower the shuffle.
   9516   SmallVector<int, 4> RepeatedMask;
   9517   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
   9518     assert(RepeatedMask.size() == 4 &&
   9519            "Repeated masks must be half the mask width!");
   9520 
   9521     // Use even/odd duplicate instructions for masks that match their pattern.
   9522     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
   9523       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
   9524     if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
   9525       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
   9526 
   9527     if (isSingleInputShuffleMask(Mask))
   9528       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
   9529                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
   9530 
   9531     // Use dedicated unpack instructions for masks that match their pattern.
   9532     if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
   9533       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
   9534     if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
   9535       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
   9536     if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
   9537       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
   9538     if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
   9539       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
   9540 
   9541     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
   9542     // have already handled any direct blends. We also need to squash the
   9543     // repeated mask into a simulated v4f32 mask.
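             // For illustration: a v8f32 mask of {1, 0, 10, 10, 5, 4, 14, 14} has the
             // repeated mask {1, 0, 10, 10}, which is squashed to the two-input v4f32
             // mask {1, 0, 6, 6} before being handed to the SHUFPS lowering.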
   9544     for (int i = 0; i < 4; ++i)
   9545       if (RepeatedMask[i] >= 8)
   9546         RepeatedMask[i] -= 4;
   9547     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
   9548   }
   9549 
    9550   // If we have a single-input shuffle with different shuffle patterns in the
    9551   // two 128-bit lanes, use a variable shuffle mask with VPERMILPS.
   9552   if (isSingleInputShuffleMask(Mask)) {
   9553     SDValue VPermMask[8];
   9554     for (int i = 0; i < 8; ++i)
   9555       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
   9556                                  : DAG.getConstant(Mask[i], MVT::i32);
   9557     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
   9558       return DAG.getNode(
   9559           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
   9560           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
   9561 
   9562     if (Subtarget->hasAVX2())
   9563       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
   9564                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
   9565                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
   9566                                                  MVT::v8i32, VPermMask)),
   9567                          V1);
   9568 
   9569     // Otherwise, fall back.
   9570     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
   9571                                                    DAG);
   9572   }
   9573 
   9574   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   9575   // shuffle.
   9576   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   9577           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
   9578     return Result;
   9579 
   9580   // If we have AVX2 then we always want to lower with a blend because at v8 we
   9581   // can fully permute the elements.
   9582   if (Subtarget->hasAVX2())
   9583     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
   9584                                                       Mask, DAG);
   9585 
   9586   // Otherwise fall back on generic lowering.
   9587   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
   9588 }
   9589 
   9590 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
   9591 ///
   9592 /// This routine is only called when we have AVX2 and thus a reasonable
    9593 /// instruction set for v8i32 shuffling.
   9594 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9595                                        const X86Subtarget *Subtarget,
   9596                                        SelectionDAG &DAG) {
   9597   SDLoc DL(Op);
   9598   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
   9599   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
   9600   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9601   ArrayRef<int> Mask = SVOp->getMask();
   9602   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   9603   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
   9604 
   9605   // Whenever we can lower this as a zext, that instruction is strictly faster
   9606   // than any alternative. It also allows us to fold memory operands into the
   9607   // shuffle in many cases.
   9608   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
   9609                                                          Mask, Subtarget, DAG))
   9610     return ZExt;
   9611 
   9612   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
   9613                                                 Subtarget, DAG))
   9614     return Blend;
   9615 
   9616   // Check for being able to broadcast a single element.
   9617   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
   9618                                                         Mask, Subtarget, DAG))
   9619     return Broadcast;
   9620 
   9621   // If the shuffle mask is repeated in each 128-bit lane we can use more
   9622   // efficient instructions that mirror the shuffles across the two 128-bit
   9623   // lanes.
   9624   SmallVector<int, 4> RepeatedMask;
   9625   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
   9626     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
   9627     if (isSingleInputShuffleMask(Mask))
   9628       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
   9629                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
   9630 
   9631     // Use dedicated unpack instructions for masks that match their pattern.
   9632     if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
   9633       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
   9634     if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
   9635       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
   9636     if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
   9637       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
   9638     if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
   9639       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
   9640   }
   9641 
   9642   // Try to use shift instructions.
   9643   if (SDValue Shift =
   9644           lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
   9645     return Shift;
   9646 
   9647   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   9648           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   9649     return Rotate;
   9650 
   9651   // If the shuffle patterns aren't repeated but it is a single input, directly
   9652   // generate a cross-lane VPERMD instruction.
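           // For illustration: a single-input mask such as {3, 7, 1, 5, 0, 4, 2, 6} is
           // emitted directly as a VPERMD whose index vector holds those eight values
           // (note that the index vector is the first operand of the node).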
   9653   if (isSingleInputShuffleMask(Mask)) {
   9654     SDValue VPermMask[8];
   9655     for (int i = 0; i < 8; ++i)
   9656       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
   9657                                  : DAG.getConstant(Mask[i], MVT::i32);
   9658     return DAG.getNode(
   9659         X86ISD::VPERMV, DL, MVT::v8i32,
   9660         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
   9661   }
   9662 
   9663   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   9664   // shuffle.
   9665   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   9666           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   9667     return Result;
   9668 
   9669   // Otherwise fall back on generic blend lowering.
   9670   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
   9671                                                     Mask, DAG);
   9672 }
   9673 
   9674 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
   9675 ///
   9676 /// This routine is only called when we have AVX2 and thus a reasonable
    9677 /// instruction set for v16i16 shuffling.
   9678 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9679                                         const X86Subtarget *Subtarget,
   9680                                         SelectionDAG &DAG) {
   9681   SDLoc DL(Op);
   9682   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
   9683   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
   9684   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9685   ArrayRef<int> Mask = SVOp->getMask();
   9686   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   9687   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
   9688 
   9689   // Whenever we can lower this as a zext, that instruction is strictly faster
   9690   // than any alternative. It also allows us to fold memory operands into the
   9691   // shuffle in many cases.
   9692   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
   9693                                                          Mask, Subtarget, DAG))
   9694     return ZExt;
   9695 
   9696   // Check for being able to broadcast a single element.
   9697   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
   9698                                                         Mask, Subtarget, DAG))
   9699     return Broadcast;
   9700 
   9701   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
   9702                                                 Subtarget, DAG))
   9703     return Blend;
   9704 
   9705   // Use dedicated unpack instructions for masks that match their pattern.
   9706   if (isShuffleEquivalent(V1, V2, Mask,
   9707                           {// First 128-bit lane:
   9708                            0, 16, 1, 17, 2, 18, 3, 19,
   9709                            // Second 128-bit lane:
   9710                            8, 24, 9, 25, 10, 26, 11, 27}))
   9711     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
   9712   if (isShuffleEquivalent(V1, V2, Mask,
   9713                           {// First 128-bit lane:
   9714                            4, 20, 5, 21, 6, 22, 7, 23,
   9715                            // Second 128-bit lane:
   9716                            12, 28, 13, 29, 14, 30, 15, 31}))
   9717     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
   9718 
   9719   // Try to use shift instructions.
   9720   if (SDValue Shift =
   9721           lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
   9722     return Shift;
   9723 
   9724   // Try to use byte rotation instructions.
   9725   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   9726           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   9727     return Rotate;
   9728 
   9729   if (isSingleInputShuffleMask(Mask)) {
   9730     // There are no generalized cross-lane shuffle operations available on i16
   9731     // element types.
   9732     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
   9733       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
   9734                                                      Mask, DAG);
   9735 
   9736     SmallVector<int, 8> RepeatedMask;
   9737     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
   9738       // As this is a single-input shuffle, the repeated mask should be
   9739       // a strictly valid v8i16 mask that we can pass through to the v8i16
   9740       // lowering to handle even the v16 case.
   9741       return lowerV8I16GeneralSingleInputVectorShuffle(
   9742           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
   9743     }
   9744 
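             // Lower the remaining single-input cases with a per-byte PSHUFB: each
             // lane-local word index M expands to the byte indices 2 * M and 2 * M + 1
             // (for example, a lane-local mask entry of 5 selects bytes 10 and 11 of
             // its 128-bit lane).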
   9745     SDValue PSHUFBMask[32];
   9746     for (int i = 0; i < 16; ++i) {
   9747       if (Mask[i] == -1) {
   9748         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
   9749         continue;
   9750       }
   9751 
   9752       int M = i < 8 ? Mask[i] : Mask[i] - 8;
   9753       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
   9754       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
   9755       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
   9756     }
   9757     return DAG.getNode(
   9758         ISD::BITCAST, DL, MVT::v16i16,
   9759         DAG.getNode(
   9760             X86ISD::PSHUFB, DL, MVT::v32i8,
   9761             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
   9762             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
   9763   }
   9764 
   9765   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   9766   // shuffle.
   9767   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   9768           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   9769     return Result;
   9770 
   9771   // Otherwise fall back on generic lowering.
   9772   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
   9773 }
   9774 
   9775 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
   9776 ///
   9777 /// This routine is only called when we have AVX2 and thus a reasonable
    9778 /// instruction set for v32i8 shuffling.
   9779 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9780                                        const X86Subtarget *Subtarget,
   9781                                        SelectionDAG &DAG) {
   9782   SDLoc DL(Op);
   9783   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
   9784   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
   9785   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9786   ArrayRef<int> Mask = SVOp->getMask();
   9787   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   9788   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
   9789 
   9790   // Whenever we can lower this as a zext, that instruction is strictly faster
   9791   // than any alternative. It also allows us to fold memory operands into the
   9792   // shuffle in many cases.
   9793   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
   9794                                                          Mask, Subtarget, DAG))
   9795     return ZExt;
   9796 
   9797   // Check for being able to broadcast a single element.
   9798   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
   9799                                                         Mask, Subtarget, DAG))
   9800     return Broadcast;
   9801 
   9802   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
   9803                                                 Subtarget, DAG))
   9804     return Blend;
   9805 
   9806   // Use dedicated unpack instructions for masks that match their pattern.
   9807   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
   9808   // 256-bit lanes.
   9809   if (isShuffleEquivalent(
   9810           V1, V2, Mask,
   9811           {// First 128-bit lane:
   9812            0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
   9813            // Second 128-bit lane:
   9814            16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55}))
   9815     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
   9816   if (isShuffleEquivalent(
   9817           V1, V2, Mask,
   9818           {// First 128-bit lane:
   9819            8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
   9820            // Second 128-bit lane:
   9821            24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63}))
   9822     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
   9823 
   9824   // Try to use shift instructions.
   9825   if (SDValue Shift =
   9826           lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
   9827     return Shift;
   9828 
   9829   // Try to use byte rotation instructions.
   9830   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   9831           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   9832     return Rotate;
   9833 
   9834   if (isSingleInputShuffleMask(Mask)) {
   9835     // There are no generalized cross-lane shuffle operations available on i8
   9836     // element types.
   9837     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
   9838       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
   9839                                                      Mask, DAG);
   9840 
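             // Lower the remaining single-input cases with a per-byte PSHUFB, using
             // the lane-local byte index (Mask[i] modulo 16) for each element; undef
             // elements stay undef.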
   9841     SDValue PSHUFBMask[32];
   9842     for (int i = 0; i < 32; ++i)
   9843       PSHUFBMask[i] =
   9844           Mask[i] < 0
   9845               ? DAG.getUNDEF(MVT::i8)
   9846               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
   9847 
   9848     return DAG.getNode(
   9849         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
   9850         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
   9851   }
   9852 
   9853   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   9854   // shuffle.
   9855   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   9856           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   9857     return Result;
   9858 
   9859   // Otherwise fall back on generic lowering.
   9860   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
   9861 }
   9862 
   9863 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
   9864 ///
   9865 /// This routine either breaks down the specific type of a 256-bit x86 vector
   9866 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
   9867 /// together based on the available instructions.
   9868 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9869                                         MVT VT, const X86Subtarget *Subtarget,
   9870                                         SelectionDAG &DAG) {
   9871   SDLoc DL(Op);
   9872   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9873   ArrayRef<int> Mask = SVOp->getMask();
   9874 
    9875   // If exactly one element comes from V2 and it lands in the zero element of
    9876   // the result, try to insert it into V1 if we can do so cheaply.
   9877   int NumElts = VT.getVectorNumElements();
   9878   int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
   9879     return M >= NumElts;
   9880   });
   9881 
   9882   if (NumV2Elements == 1 && Mask[0] >= NumElts)
   9883     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   9884                               DL, VT, V1, V2, Mask, Subtarget, DAG))
   9885       return Insertion;
   9886 
    9887   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
    9888   // can check for those subtargets here and avoid much of the subtarget
    9889   // querying in the per-vector-type lowering routines. With AVX1 we have
    9890   // essentially *zero* ability to manipulate a 256-bit vector with integer
    9891   // types. Since we'll use floating point types there eventually, just
    9892   // immediately cast everything to a float and operate entirely in that domain.
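           // For illustration: an AVX1 v8i32 shuffle is bitcast to v8f32, shuffled in
           // the floating point domain, and bitcast back, while v16i16 and v32i8
           // shuffles are simply split into 128-bit halves.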
   9893   if (VT.isInteger() && !Subtarget->hasAVX2()) {
   9894     int ElementBits = VT.getScalarSizeInBits();
   9895     if (ElementBits < 32)
   9896       // No floating point type available, decompose into 128-bit vectors.
   9897       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   9898 
   9899     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
   9900                                 VT.getVectorNumElements());
   9901     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
   9902     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
   9903     return DAG.getNode(ISD::BITCAST, DL, VT,
   9904                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
   9905   }
   9906 
   9907   switch (VT.SimpleTy) {
   9908   case MVT::v4f64:
   9909     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9910   case MVT::v4i64:
   9911     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9912   case MVT::v8f32:
   9913     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9914   case MVT::v8i32:
   9915     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9916   case MVT::v16i16:
   9917     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9918   case MVT::v32i8:
   9919     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
   9920 
   9921   default:
   9922     llvm_unreachable("Not a valid 256-bit x86 vector type!");
   9923   }
   9924 }
   9925 
   9926 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
   9927 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9928                                        const X86Subtarget *Subtarget,
   9929                                        SelectionDAG &DAG) {
   9930   SDLoc DL(Op);
   9931   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
   9932   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
   9933   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9934   ArrayRef<int> Mask = SVOp->getMask();
   9935   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   9936 
   9937   // X86 has dedicated unpack instructions that can handle specific blend
   9938   // operations: UNPCKH and UNPCKL.
   9939   if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
   9940     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
   9941   if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
   9942     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
   9943 
   9944   // FIXME: Implement direct support for this type!
   9945   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
   9946 }
   9947 
   9948 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
   9949 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9950                                        const X86Subtarget *Subtarget,
   9951                                        SelectionDAG &DAG) {
   9952   SDLoc DL(Op);
   9953   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
   9954   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
   9955   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9956   ArrayRef<int> Mask = SVOp->getMask();
   9957   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   9958 
   9959   // Use dedicated unpack instructions for masks that match their pattern.
   9960   if (isShuffleEquivalent(V1, V2, Mask,
   9961                           {// First 128-bit lane.
   9962                            0, 16, 1, 17, 4, 20, 5, 21,
   9963                            // Second 128-bit lane.
   9964                            8, 24, 9, 25, 12, 28, 13, 29}))
   9965     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
   9966   if (isShuffleEquivalent(V1, V2, Mask,
   9967                           {// First 128-bit lane.
   9968                            2, 18, 3, 19, 6, 22, 7, 23,
   9969                            // Second 128-bit lane.
   9970                            10, 26, 11, 27, 14, 30, 15, 31}))
   9971     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
   9972 
   9973   // FIXME: Implement direct support for this type!
   9974   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
   9975 }
   9976 
   9977 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
   9978 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   9979                                        const X86Subtarget *Subtarget,
   9980                                        SelectionDAG &DAG) {
   9981   SDLoc DL(Op);
   9982   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   9983   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   9984   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   9985   ArrayRef<int> Mask = SVOp->getMask();
   9986   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   9987 
   9988   // X86 has dedicated unpack instructions that can handle specific blend
   9989   // operations: UNPCKH and UNPCKL.
   9990   if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
   9991     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
   9992   if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
   9993     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
   9994 
   9995   // FIXME: Implement direct support for this type!
   9996   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
   9997 }
   9998 
   9999 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
   10000 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10001                                        const X86Subtarget *Subtarget,
   10002                                        SelectionDAG &DAG) {
   10003   SDLoc DL(Op);
   10004   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
   10005   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
   10006   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10007   ArrayRef<int> Mask = SVOp->getMask();
   10008   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   10009 
   10010   // Use dedicated unpack instructions for masks that match their pattern.
   10011   if (isShuffleEquivalent(V1, V2, Mask,
   10012                           {// First 128-bit lane.
   10013                            0, 16, 1, 17, 4, 20, 5, 21,
   10014                            // Second 128-bit lane.
   10015                            8, 24, 9, 25, 12, 28, 13, 29}))
   10016     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
   10017   if (isShuffleEquivalent(V1, V2, Mask,
   10018                           {// First 128-bit lane.
   10019                            2, 18, 3, 19, 6, 22, 7, 23,
   10020                            // Second 128-bit lane.
   10021                            10, 26, 11, 27, 14, 30, 15, 31}))
   10022     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
   10023 
   10024   // FIXME: Implement direct support for this type!
   10025   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
   10026 }
   10027 
   10028 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
   10029 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10030                                         const X86Subtarget *Subtarget,
   10031                                         SelectionDAG &DAG) {
   10032   SDLoc DL(Op);
   10033   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
   10034   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
   10035   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10036   ArrayRef<int> Mask = SVOp->getMask();
   10037   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   10038   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
   10039 
   10040   // FIXME: Implement direct support for this type!
   10041   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
   10042 }
   10043 
   10044 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
   10045 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10046                                        const X86Subtarget *Subtarget,
   10047                                        SelectionDAG &DAG) {
   10048   SDLoc DL(Op);
   10049   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
   10050   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
   10051   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10052   ArrayRef<int> Mask = SVOp->getMask();
   10053   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
   10054   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
   10055 
   10056   // FIXME: Implement direct support for this type!
   10057   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
   10058 }
   10059 
   10060 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
   10061 ///
   10062 /// This routine either breaks down the specific type of a 512-bit x86 vector
   10063 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
   10064 /// together based on the available instructions.
   10065 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   10066                                         MVT VT, const X86Subtarget *Subtarget,
   10067                                         SelectionDAG &DAG) {
   10068   SDLoc DL(Op);
   10069   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10070   ArrayRef<int> Mask = SVOp->getMask();
   10071   assert(Subtarget->hasAVX512() &&
   10072          "Cannot lower 512-bit vectors w/ basic ISA!");
   10073 
   10074   // Check for being able to broadcast a single element.
   10075   if (SDValue Broadcast =
   10076           lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
   10077     return Broadcast;
   10078 
    10079   // Dispatch to each element type for lowering. If we don't have support for
   10080   // specific element type shuffles at 512 bits, immediately split them and
   10081   // lower them. Each lowering routine of a given type is allowed to assume that
   10082   // the requisite ISA extensions for that element type are available.
   10083   switch (VT.SimpleTy) {
   10084   case MVT::v8f64:
   10085     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10086   case MVT::v16f32:
   10087     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10088   case MVT::v8i64:
   10089     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10090   case MVT::v16i32:
   10091     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10092   case MVT::v32i16:
   10093     if (Subtarget->hasBWI())
   10094       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10095     break;
   10096   case MVT::v64i8:
   10097     if (Subtarget->hasBWI())
   10098       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
   10099     break;
   10100 
   10101   default:
   10102     llvm_unreachable("Not a valid 512-bit x86 vector type!");
   10103   }
   10104 
   10105   // Otherwise fall back on splitting.
   10106   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   10107 }
   10108 
   10109 /// \brief Top-level lowering for x86 vector shuffles.
   10110 ///
   10111 /// This handles decomposition, canonicalization, and lowering of all x86
   10112 /// vector shuffles. Most of the specific lowering strategies are encapsulated
   10113 /// above in helper routines. The canonicalization attempts to widen shuffles
   10114 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
   10115 /// s.t. only one of the two inputs needs to be tested, etc.
   10116 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   10117                                   SelectionDAG &DAG) {
   10118   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   10119   ArrayRef<int> Mask = SVOp->getMask();
   10120   SDValue V1 = Op.getOperand(0);
   10121   SDValue V2 = Op.getOperand(1);
   10122   MVT VT = Op.getSimpleValueType();
   10123   int NumElements = VT.getVectorNumElements();
   10124   SDLoc dl(Op);
   10125 
   10126   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
   10127 
   10128   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   10129   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   10130   if (V1IsUndef && V2IsUndef)
   10131     return DAG.getUNDEF(VT);
   10132 
    10133   // When we create a shuffle node we put the UNDEF node as the second operand,
   10134   // but in some cases the first operand may be transformed to UNDEF.
   10135   // In this case we should just commute the node.
   10136   if (V1IsUndef)
   10137     return DAG.getCommutedVectorShuffle(*SVOp);
   10138 
   10139   // Check for non-undef masks pointing at an undef vector and make the masks
   10140   // undef as well. This makes it easier to match the shuffle based solely on
   10141   // the mask.
   10142   if (V2IsUndef)
   10143     for (int M : Mask)
   10144       if (M >= NumElements) {
   10145         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
   10146         for (int &M : NewMask)
   10147           if (M >= NumElements)
   10148             M = -1;
   10149         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
   10150       }
   10151 
   10152   // We actually see shuffles that are entirely re-arrangements of a set of
   10153   // zero inputs. This mostly happens while decomposing complex shuffles into
   10154   // simple ones. Directly lower these as a buildvector of zeros.
   10155   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   10156   if (Zeroable.all())
   10157     return getZeroVector(VT, Subtarget, DAG, dl);
   10158 
   10159   // Try to collapse shuffles into using a vector type with fewer elements but
   10160   // wider element types. We cap this to not form integers or floating point
   10161   // elements wider than 64 bits, but it might be interesting to form i128
   10162   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
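            // For illustration: a v4i32 mask of {2, 3, 6, 7} widens to the v2i64 mask
            // {1, 3}, letting the 64-bit element lowering handle it directly.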
   10163   SmallVector<int, 16> WidenedMask;
   10164   if (VT.getScalarSizeInBits() < 64 &&
   10165       canWidenShuffleElements(Mask, WidenedMask)) {
   10166     MVT NewEltVT = VT.isFloatingPoint()
   10167                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
   10168                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
   10169     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
   10170     // Make sure that the new vector type is legal. For example, v2f64 isn't
   10171     // legal on SSE1.
   10172     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
   10173       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
   10174       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
   10175       return DAG.getNode(ISD::BITCAST, dl, VT,
   10176                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
   10177     }
   10178   }
   10179 
   10180   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
   10181   for (int M : SVOp->getMask())
   10182     if (M < 0)
   10183       ++NumUndefElements;
   10184     else if (M < NumElements)
   10185       ++NumV1Elements;
   10186     else
   10187       ++NumV2Elements;
   10188 
   10189   // Commute the shuffle as needed such that more elements come from V1 than
   10190   // V2. This allows us to match the shuffle pattern strictly on how many
   10191   // elements come from V1 without handling the symmetric cases.
   10192   if (NumV2Elements > NumV1Elements)
   10193     return DAG.getCommutedVectorShuffle(*SVOp);
   10194 
   10195   // When the number of V1 and V2 elements are the same, try to minimize the
   10196   // number of uses of V2 in the low half of the vector. When that is tied,
    10197   // ensure that the sum of indices for V1 is equal to or lower than the sum of
    10198   // indices for V2. When those are equal, try to ensure that the number of odd
   10199   // indices for V1 is lower than the number of odd indices for V2.
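            // For example, a v4i32 shuffle with mask <4, 5, 0, 1> draws two elements from
            // each input, but its low half uses only V2, so we commute it and match the
            // V1-heavy form instead.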
   10200   if (NumV1Elements == NumV2Elements) {
   10201     int LowV1Elements = 0, LowV2Elements = 0;
   10202     for (int M : SVOp->getMask().slice(0, NumElements / 2))
   10203       if (M >= NumElements)
   10204         ++LowV2Elements;
   10205       else if (M >= 0)
   10206         ++LowV1Elements;
   10207     if (LowV2Elements > LowV1Elements) {
   10208       return DAG.getCommutedVectorShuffle(*SVOp);
   10209     } else if (LowV2Elements == LowV1Elements) {
   10210       int SumV1Indices = 0, SumV2Indices = 0;
   10211       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
   10212         if (SVOp->getMask()[i] >= NumElements)
   10213           SumV2Indices += i;
   10214         else if (SVOp->getMask()[i] >= 0)
   10215           SumV1Indices += i;
   10216       if (SumV2Indices < SumV1Indices) {
   10217         return DAG.getCommutedVectorShuffle(*SVOp);
   10218       } else if (SumV2Indices == SumV1Indices) {
   10219         int NumV1OddIndices = 0, NumV2OddIndices = 0;
   10220         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
   10221           if (SVOp->getMask()[i] >= NumElements)
   10222             NumV2OddIndices += i % 2;
   10223           else if (SVOp->getMask()[i] >= 0)
   10224             NumV1OddIndices += i % 2;
   10225         if (NumV2OddIndices < NumV1OddIndices)
   10226           return DAG.getCommutedVectorShuffle(*SVOp);
   10227       }
   10228     }
   10229   }
   10230 
   10231   // For each vector width, delegate to a specialized lowering routine.
   10232   if (VT.getSizeInBits() == 128)
   10233     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
   10234 
   10235   if (VT.getSizeInBits() == 256)
   10236     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
   10237 
   10238   // Force AVX-512 vectors to be scalarized for now.
   10239   // FIXME: Implement AVX-512 support!
   10240   if (VT.getSizeInBits() == 512)
   10241     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
   10242 
   10243   llvm_unreachable("Unimplemented!");
   10244 }
   10245 
   10246 // This function assumes its argument is a BUILD_VECTOR of constants or
   10247 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
   10248 // true.
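          // For example, a v4i32 condition of <-1, 0, -1, 0> yields MaskValue == 0b1010:
          // a set (all-ones) condition element selects the first blend operand and is
          // encoded as 0, while a zero element selects the second and is encoded as 1.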
   10249 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
   10250                                     unsigned &MaskValue) {
   10251   MaskValue = 0;
   10252   unsigned NumElems = BuildVector->getNumOperands();
   10253   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
   10254   unsigned NumLanes = (NumElems - 1) / 8 + 1;
   10255   unsigned NumElemsInLane = NumElems / NumLanes;
   10256 
    10257   // The blend for v16i16 should be symmetric for both lanes.
   10258   for (unsigned i = 0; i < NumElemsInLane; ++i) {
   10259     SDValue EltCond = BuildVector->getOperand(i);
   10260     SDValue SndLaneEltCond =
   10261         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
   10262 
   10263     int Lane1Cond = -1, Lane2Cond = -1;
   10264     if (isa<ConstantSDNode>(EltCond))
   10265       Lane1Cond = !isZero(EltCond);
   10266     if (isa<ConstantSDNode>(SndLaneEltCond))
   10267       Lane2Cond = !isZero(SndLaneEltCond);
   10268 
   10269     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
   10270       // Lane1Cond != 0, means we want the first argument.
   10271       // Lane1Cond == 0, means we want the second argument.
   10272       // The encoding of this argument is 0 for the first argument, 1
   10273       // for the second. Therefore, invert the condition.
   10274       MaskValue |= !Lane1Cond << i;
   10275     else if (Lane1Cond < 0)
   10276       MaskValue |= !Lane2Cond << i;
   10277     else
   10278       return false;
   10279   }
   10280   return true;
   10281 }
   10282 
   10283 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
   10284 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
   10285                                            const X86Subtarget *Subtarget,
   10286                                            SelectionDAG &DAG) {
   10287   SDValue Cond = Op.getOperand(0);
   10288   SDValue LHS = Op.getOperand(1);
   10289   SDValue RHS = Op.getOperand(2);
   10290   SDLoc dl(Op);
   10291   MVT VT = Op.getSimpleValueType();
   10292 
   10293   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
   10294     return SDValue();
   10295   auto *CondBV = cast<BuildVectorSDNode>(Cond);
   10296 
    10297   // Only non-legal VSELECTs reach this lowering; convert those into generic
   10298   // shuffles and re-use the shuffle lowering path for blends.
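            // For example, a v4i32 VSELECT whose condition is <-1, 0, -1, undef> becomes
            // the shuffle mask <0, 5, 2, -1>: true lanes pick from LHS (indices 0..3),
            // false lanes pick from RHS (indices 4..7), and non-constant lanes are undef.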
   10299   SmallVector<int, 32> Mask;
   10300   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
   10301     SDValue CondElt = CondBV->getOperand(i);
   10302     Mask.push_back(
   10303         isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
   10304   }
   10305   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
   10306 }
   10307 
   10308 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   10309   // A vselect where all conditions and data are constants can be optimized into
   10310   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   10311   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
   10312       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
   10313       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
   10314     return SDValue();
   10315 
   10316   // Try to lower this to a blend-style vector shuffle. This can handle all
   10317   // constant condition cases.
   10318   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
   10319     return BlendOp;
   10320 
   10321   // Variable blends are only legal from SSE4.1 onward.
   10322   if (!Subtarget->hasSSE41())
   10323     return SDValue();
   10324 
   10325   // Only some types will be legal on some subtargets. If we can emit a legal
    10326   // VSELECT-matching blend, return Op, but if we need to expand, return
   10327   // a null value.
   10328   switch (Op.getSimpleValueType().SimpleTy) {
   10329   default:
   10330     // Most of the vector types have blends past SSE4.1.
   10331     return Op;
   10332 
   10333   case MVT::v32i8:
   10334     // The byte blends for AVX vectors were introduced only in AVX2.
   10335     if (Subtarget->hasAVX2())
   10336       return Op;
   10337 
   10338     return SDValue();
   10339 
   10340   case MVT::v8i16:
   10341   case MVT::v16i16:
   10342     // AVX-512 BWI and VLX features support VSELECT with i16 elements.
   10343     if (Subtarget->hasBWI() && Subtarget->hasVLX())
   10344       return Op;
   10345 
   10346     // FIXME: We should custom lower this by fixing the condition and using i8
   10347     // blends.
   10348     return SDValue();
   10349   }
   10350 }
   10351 
   10352 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   10353   MVT VT = Op.getSimpleValueType();
   10354   SDLoc dl(Op);
   10355 
   10356   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
   10357     return SDValue();
   10358 
   10359   if (VT.getSizeInBits() == 8) {
   10360     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
   10361                                   Op.getOperand(0), Op.getOperand(1));
   10362     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   10363                                   DAG.getValueType(VT));
   10364     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   10365   }
   10366 
   10367   if (VT.getSizeInBits() == 16) {
   10368     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   10369     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
   10370     if (Idx == 0)
   10371       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   10372                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   10373                                      DAG.getNode(ISD::BITCAST, dl,
   10374                                                  MVT::v4i32,
   10375                                                  Op.getOperand(0)),
   10376                                      Op.getOperand(1)));
   10377     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
   10378                                   Op.getOperand(0), Op.getOperand(1));
   10379     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
   10380                                   DAG.getValueType(VT));
   10381     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   10382   }
   10383 
   10384   if (VT == MVT::f32) {
   10385     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    10386     // the result back to an FR32 register. It's only worth matching if the
   10387     // result has a single use which is a store or a bitcast to i32.  And in
   10388     // the case of a store, it's not worth it if the index is a constant 0,
   10389     // because a MOVSSmr can be used instead, which is smaller and faster.
   10390     if (!Op.hasOneUse())
   10391       return SDValue();
   10392     SDNode *User = *Op.getNode()->use_begin();
   10393     if ((User->getOpcode() != ISD::STORE ||
   10394          (isa<ConstantSDNode>(Op.getOperand(1)) &&
   10395           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
   10396         (User->getOpcode() != ISD::BITCAST ||
   10397          User->getValueType(0) != MVT::i32))
   10398       return SDValue();
   10399     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   10400                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
   10401                                               Op.getOperand(0)),
   10402                                               Op.getOperand(1));
   10403     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
   10404   }
   10405 
   10406   if (VT == MVT::i32 || VT == MVT::i64) {
    10407     // ExtractPS/pextrq work with a constant index.
   10408     if (isa<ConstantSDNode>(Op.getOperand(1)))
   10409       return Op;
   10410   }
   10411   return SDValue();
   10412 }
   10413 
   10414 /// Extract one bit from mask vector, like v16i1 or v8i1.
   10415 /// AVX-512 feature.
   10416 SDValue
   10417 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
   10418   SDValue Vec = Op.getOperand(0);
   10419   SDLoc dl(Vec);
   10420   MVT VecVT = Vec.getSimpleValueType();
   10421   SDValue Idx = Op.getOperand(1);
   10422   MVT EltVT = Op.getSimpleValueType();
   10423 
   10424   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
   10425   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
   10426          "Unexpected vector type in ExtractBitFromMaskVector");
   10427 
    10428   // A variable index can't be handled in mask registers;
    10429   // extend the vector to VR512.
   10430   if (!isa<ConstantSDNode>(Idx)) {
   10431     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
   10432     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
   10433     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   10434                               ExtVT.getVectorElementType(), Ext, Idx);
   10435     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
   10436   }
   10437 
   10438   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   10439   const TargetRegisterClass* rc = getRegClassFor(VecVT);
   10440   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
   10441     rc = getRegClassFor(MVT::v16i1);
   10442   unsigned MaxSift = rc->getSize()*8 - 1;
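            // Isolate the requested bit: shift it left so it becomes the most significant
            // bit of the mask register (a shift by MaxSift - IdxVal), then shift it
            // logically right by MaxSift so it ends up in bit 0.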
   10443   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
   10444                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
   10445   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
   10446                     DAG.getConstant(MaxSift, MVT::i8));
   10447   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
   10448                        DAG.getIntPtrConstant(0));
   10449 }
   10450 
   10451 SDValue
   10452 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   10453                                            SelectionDAG &DAG) const {
   10454   SDLoc dl(Op);
   10455   SDValue Vec = Op.getOperand(0);
   10456   MVT VecVT = Vec.getSimpleValueType();
   10457   SDValue Idx = Op.getOperand(1);
   10458 
   10459   if (Op.getSimpleValueType() == MVT::i1)
   10460     return ExtractBitFromMaskVector(Op, DAG);
   10461 
   10462   if (!isa<ConstantSDNode>(Idx)) {
   10463     if (VecVT.is512BitVector() ||
   10464         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
   10465          VecVT.getVectorElementType().getSizeInBits() == 32)) {
   10466 
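                // A variable index on a wide vector: build a permute mask that holds the
                // index in lane 0, use VPERMV to move the requested element into lane 0 of
                // the result, and then extract lane 0 with a constant index.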
   10467       MVT MaskEltVT =
   10468         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
   10469       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
   10470                                     MaskEltVT.getSizeInBits());
   10471 
   10472       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
   10473       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
   10474                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
   10475                                 Idx, DAG.getConstant(0, getPointerTy()));
   10476       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
   10477       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
   10478                         Perm, DAG.getConstant(0, getPointerTy()));
   10479     }
   10480     return SDValue();
   10481   }
   10482 
   10483   // If this is a 256-bit vector result, first extract the 128-bit vector and
   10484   // then extract the element from the 128-bit vector.
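            // For example, when extracting element 9 of a v16i16, the upper 128-bit half
            // holds elements 8..15, so we extract that half and then take its element 1.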
   10485   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
   10486 
   10487     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   10488     // Get the 128-bit vector.
   10489     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
   10490     MVT EltVT = VecVT.getVectorElementType();
   10491 
   10492     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
   10493 
    10494     // Reduce IdxVal to the index of the element within its 128-bit chunk.
   10496     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
   10497     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
   10498                        DAG.getConstant(IdxVal, MVT::i32));
   10499   }
   10500 
   10501   assert(VecVT.is128BitVector() && "Unexpected vector length");
   10502 
   10503   if (Subtarget->hasSSE41()) {
   10504     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
   10505     if (Res.getNode())
   10506       return Res;
   10507   }
   10508 
   10509   MVT VT = Op.getSimpleValueType();
   10510   // TODO: handle v16i8.
   10511   if (VT.getSizeInBits() == 16) {
   10512     SDValue Vec = Op.getOperand(0);
   10513     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   10514     if (Idx == 0)
   10515       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   10516                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   10517                                      DAG.getNode(ISD::BITCAST, dl,
   10518                                                  MVT::v4i32, Vec),
   10519                                      Op.getOperand(1)));
    10520     // Transform it so it matches pextrw, which produces a 32-bit result.
   10521     MVT EltVT = MVT::i32;
   10522     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
   10523                                   Op.getOperand(0), Op.getOperand(1));
   10524     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
   10525                                   DAG.getValueType(VT));
   10526     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
   10527   }
   10528 
   10529   if (VT.getSizeInBits() == 32) {
   10530     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   10531     if (Idx == 0)
   10532       return Op;
   10533 
   10534     // SHUFPS the element to the lowest double word, then movss.
   10535     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
   10536     MVT VVT = Op.getOperand(0).getSimpleValueType();
   10537     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   10538                                        DAG.getUNDEF(VVT), Mask);
   10539     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   10540                        DAG.getIntPtrConstant(0));
   10541   }
   10542 
   10543   if (VT.getSizeInBits() == 64) {
   10544     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
   10545     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
   10546     //        to match extract_elt for f64.
   10547     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   10548     if (Idx == 0)
   10549       return Op;
   10550 
   10551     // UNPCKHPD the element to the lowest double word, then movsd.
    10552     // Note that if the lower 64 bits of the result of the UNPCKHPD are then
    10553     // stored to an f64mem, the whole operation is folded into a single MOVHPDmr.
   10554     int Mask[2] = { 1, -1 };
   10555     MVT VVT = Op.getOperand(0).getSimpleValueType();
   10556     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
   10557                                        DAG.getUNDEF(VVT), Mask);
   10558     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   10559                        DAG.getIntPtrConstant(0));
   10560   }
   10561 
   10562   return SDValue();
   10563 }
   10564 
   10565 /// Insert one bit to mask vector, like v16i1 or v8i1.
   10566 /// AVX-512 feature.
   10567 SDValue
   10568 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
   10569   SDLoc dl(Op);
   10570   SDValue Vec = Op.getOperand(0);
   10571   SDValue Elt = Op.getOperand(1);
   10572   SDValue Idx = Op.getOperand(2);
   10573   MVT VecVT = Vec.getSimpleValueType();
   10574 
   10575   if (!isa<ConstantSDNode>(Idx)) {
    10576     // Non-constant index: extend the source and destination,
    10577     // insert the element, and then truncate the result.
   10578     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
   10579     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
   10580     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
   10581       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
   10582       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
   10583     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
   10584   }
   10585 
   10586   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   10587   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
   10588   if (Vec.getOpcode() == ISD::UNDEF)
   10589     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
   10590                        DAG.getConstant(IdxVal, MVT::i8));
   10591   const TargetRegisterClass* rc = getRegClassFor(VecVT);
   10592   unsigned MaxSift = rc->getSize()*8 - 1;
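            // Move the inserted bit into position IdxVal while clearing all other lanes:
            // shift it up to the top bit, shift it back down by MaxSift - IdxVal, and then
            // OR the result into the existing vector.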
   10593   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
   10594                     DAG.getConstant(MaxSift, MVT::i8));
   10595   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
   10596                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
   10597   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
   10598 }
   10599 
   10600 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   10601                                                   SelectionDAG &DAG) const {
   10602   MVT VT = Op.getSimpleValueType();
   10603   MVT EltVT = VT.getVectorElementType();
   10604 
   10605   if (EltVT == MVT::i1)
   10606     return InsertBitToMaskVector(Op, DAG);
   10607 
   10608   SDLoc dl(Op);
   10609   SDValue N0 = Op.getOperand(0);
   10610   SDValue N1 = Op.getOperand(1);
   10611   SDValue N2 = Op.getOperand(2);
   10612   if (!isa<ConstantSDNode>(N2))
   10613     return SDValue();
   10614   auto *N2C = cast<ConstantSDNode>(N2);
   10615   unsigned IdxVal = N2C->getZExtValue();
   10616 
   10617   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
   10618   // into that, and then insert the subvector back into the result.
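            // For example, inserting into element 5 of a v8i32: extract the upper 128-bit
            // chunk (elements 4..7), insert at index 1 within that chunk, and then insert
            // the chunk back into the full vector.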
   10619   if (VT.is256BitVector() || VT.is512BitVector()) {
   10620     // With a 256-bit vector, we can insert into the zero element efficiently
   10621     // using a blend if we have AVX or AVX2 and the right data type.
   10622     if (VT.is256BitVector() && IdxVal == 0) {
   10623       // TODO: It is worthwhile to cast integer to floating point and back
   10624       // and incur a domain crossing penalty if that's what we'll end up
   10625       // doing anyway after extracting to a 128-bit vector.
   10626       if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
   10627           (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
   10628         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
   10629         N2 = DAG.getIntPtrConstant(1);
   10630         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
   10631       }
   10632     }
   10633 
   10634     // Get the desired 128-bit vector chunk.
   10635     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
   10636 
   10637     // Insert the element into the desired chunk.
   10638     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
   10639     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
   10640 
   10641     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
   10642                     DAG.getConstant(IdxIn128, MVT::i32));
   10643 
   10644     // Insert the changed part back into the bigger vector
   10645     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
   10646   }
   10647   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
   10648 
   10649   if (Subtarget->hasSSE41()) {
   10650     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
   10651       unsigned Opc;
   10652       if (VT == MVT::v8i16) {
   10653         Opc = X86ISD::PINSRW;
   10654       } else {
   10655         assert(VT == MVT::v16i8);
   10656         Opc = X86ISD::PINSRB;
   10657       }
   10658 
    10659       // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
    10660       // second argument.
   10661       if (N1.getValueType() != MVT::i32)
   10662         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   10663       if (N2.getValueType() != MVT::i32)
   10664         N2 = DAG.getIntPtrConstant(IdxVal);
   10665       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
   10666     }
   10667 
   10668     if (EltVT == MVT::f32) {
   10669       // Bits [7:6] of the constant are the source select. This will always be
   10670       //   zero here. The DAG Combiner may combine an extract_elt index into
   10671       //   these bits. For example (insert (extract, 3), 2) could be matched by
   10672       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
   10673       // Bits [5:4] of the constant are the destination select. This is the
   10674       //   value of the incoming immediate.
   10675       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
   10676       //   combine either bitwise AND or insert of float 0.0 to set these bits.
   10677 
   10678       const Function *F = DAG.getMachineFunction().getFunction();
   10679       bool MinSize = F->hasFnAttribute(Attribute::MinSize);
   10680       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
   10681         // If this is an insertion of 32-bits into the low 32-bits of
   10682         // a vector, we prefer to generate a blend with immediate rather
   10683         // than an insertps. Blends are simpler operations in hardware and so
   10684         // will always have equal or better performance than insertps.
   10685         // But if optimizing for size and there's a load folding opportunity,
   10686         // generate insertps because blendps does not have a 32-bit memory
   10687         // operand form.
   10688         N2 = DAG.getIntPtrConstant(1);
   10689         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   10690         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
   10691       }
   10692       N2 = DAG.getIntPtrConstant(IdxVal << 4);
    10693       // Create this as a scalar to vector.
   10694       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   10695       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
   10696     }
   10697 
   10698     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
   10699       // PINSR* works with constant index.
   10700       return Op;
   10701     }
   10702   }
   10703 
   10704   if (EltVT == MVT::i8)
   10705     return SDValue();
   10706 
   10707   if (EltVT.getSizeInBits() == 16) {
    10708     // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
    10709     // as its second argument.
   10710     if (N1.getValueType() != MVT::i32)
   10711       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   10712     if (N2.getValueType() != MVT::i32)
   10713       N2 = DAG.getIntPtrConstant(IdxVal);
   10714     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
   10715   }
   10716   return SDValue();
   10717 }
   10718 
   10719 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
   10720   SDLoc dl(Op);
   10721   MVT OpVT = Op.getSimpleValueType();
   10722 
   10723   // If this is a 256-bit vector result, first insert into a 128-bit
   10724   // vector and then insert into the 256-bit vector.
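            // For example, a v8i32 SCALAR_TO_VECTOR is built as a v4i32 SCALAR_TO_VECTOR
            // whose result is inserted into the low 128 bits of an undef v8i32.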
   10725   if (!OpVT.is128BitVector()) {
   10726     // Insert into a 128-bit vector.
   10727     unsigned SizeFactor = OpVT.getSizeInBits()/128;
   10728     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
   10729                                  OpVT.getVectorNumElements() / SizeFactor);
   10730 
   10731     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
   10732 
   10733     // Insert the 128-bit vector.
   10734     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
   10735   }
   10736 
   10737   if (OpVT == MVT::v1i64 &&
   10738       Op.getOperand(0).getValueType() == MVT::i64)
   10739     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
   10740 
   10741   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
   10742   assert(OpVT.is128BitVector() && "Expected an SSE type!");
   10743   return DAG.getNode(ISD::BITCAST, dl, OpVT,
   10744                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
   10745 }
   10746 
   10747 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
   10748 // a simple subregister reference or explicit instructions to grab
   10749 // upper bits of a vector.
   10750 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   10751                                       SelectionDAG &DAG) {
   10752   SDLoc dl(Op);
   10753   SDValue In =  Op.getOperand(0);
   10754   SDValue Idx = Op.getOperand(1);
   10755   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   10756   MVT ResVT   = Op.getSimpleValueType();
   10757   MVT InVT    = In.getSimpleValueType();
   10758 
   10759   if (Subtarget->hasFp256()) {
   10760     if (ResVT.is128BitVector() &&
   10761         (InVT.is256BitVector() || InVT.is512BitVector()) &&
   10762         isa<ConstantSDNode>(Idx)) {
   10763       return Extract128BitVector(In, IdxVal, DAG, dl);
   10764     }
   10765     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
   10766         isa<ConstantSDNode>(Idx)) {
   10767       return Extract256BitVector(In, IdxVal, DAG, dl);
   10768     }
   10769   }
   10770   return SDValue();
   10771 }
   10772 
   10773 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
   10774 // simple superregister reference or explicit instructions to insert
   10775 // the upper bits of a vector.
   10776 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   10777                                      SelectionDAG &DAG) {
   10778   if (!Subtarget->hasAVX())
   10779     return SDValue();
   10780 
   10781   SDLoc dl(Op);
   10782   SDValue Vec = Op.getOperand(0);
   10783   SDValue SubVec = Op.getOperand(1);
   10784   SDValue Idx = Op.getOperand(2);
   10785 
   10786   if (!isa<ConstantSDNode>(Idx))
   10787     return SDValue();
   10788 
   10789   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   10790   MVT OpVT = Op.getSimpleValueType();
   10791   MVT SubVecVT = SubVec.getSimpleValueType();
   10792 
   10793   // Fold two 16-byte subvector loads into one 32-byte load:
   10794   // (insert_subvector (insert_subvector undef, (load addr), 0),
   10795   //                   (load addr + 16), Elts/2)
   10796   // --> load32 addr
   10797   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
   10798       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
   10799       OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
   10800       !Subtarget->isUnalignedMem32Slow()) {
   10801     SDValue SubVec2 = Vec.getOperand(1);
   10802     if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
   10803       if (Idx2->getZExtValue() == 0) {
   10804         SDValue Ops[] = { SubVec2, SubVec };
   10805         SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
   10806         if (LD.getNode())
   10807           return LD;
   10808       }
   10809     }
   10810   }
   10811 
   10812   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
   10813       SubVecVT.is128BitVector())
   10814     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
   10815 
   10816   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
   10817     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
   10818 
   10819   if (OpVT.getVectorElementType() == MVT::i1) {
   10820     if (IdxVal == 0  && Vec.getOpcode() == ISD::UNDEF) // the operation is legal
   10821       return Op;
   10822     SDValue ZeroIdx = DAG.getIntPtrConstant(0);
   10823     SDValue Undef = DAG.getUNDEF(OpVT);
   10824     unsigned NumElems = OpVT.getVectorNumElements();
   10825     SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
   10826 
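              // i1 subvectors are merged with whole-register shifts. For example, to
              // insert a v8i1 into the upper half of a v16i1: clear the upper 8 bits of
              // Vec (shift left then right by 8), move SubVec into the upper half (insert
              // at index 0, then shift left by 8), and OR the two halves together.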
   10827     if (IdxVal == OpVT.getVectorNumElements() / 2) {
   10828       // Zero upper bits of the Vec
   10829       Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
   10830       Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
   10831 
   10832       SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
   10833                                  SubVec, ZeroIdx);
   10834       Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
   10835       return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
   10836     }
   10837     if (IdxVal == 0) {
   10838       SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
   10839                                  SubVec, ZeroIdx);
   10840       // Zero upper bits of the Vec2
   10841       Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
   10842       Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits);
   10843       // Zero lower bits of the Vec
   10844       Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
   10845       Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
   10846       // Merge them together
   10847       return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
   10848     }
   10849   }
   10850   return SDValue();
   10851 }
   10852 
   10853 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
    10854 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
    10855 // one of the above-mentioned nodes. It has to be wrapped because otherwise
    10856 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
    10857 // be used to form an addressing mode. These wrapped nodes will be selected
   10858 // into MOV32ri.
   10859 SDValue
   10860 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   10861   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   10862 
   10863   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   10864   // global base reg.
   10865   unsigned char OpFlag = 0;
   10866   unsigned WrapperKind = X86ISD::Wrapper;
   10867   CodeModel::Model M = DAG.getTarget().getCodeModel();
   10868 
   10869   if (Subtarget->isPICStyleRIPRel() &&
   10870       (M == CodeModel::Small || M == CodeModel::Kernel))
   10871     WrapperKind = X86ISD::WrapperRIP;
   10872   else if (Subtarget->isPICStyleGOT())
   10873     OpFlag = X86II::MO_GOTOFF;
   10874   else if (Subtarget->isPICStyleStubPIC())
   10875     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   10876 
   10877   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
   10878                                              CP->getAlignment(),
   10879                                              CP->getOffset(), OpFlag);
   10880   SDLoc DL(CP);
   10881   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   10882   // With PIC, the address is actually $g + Offset.
   10883   if (OpFlag) {
   10884     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   10885                          DAG.getNode(X86ISD::GlobalBaseReg,
   10886                                      SDLoc(), getPointerTy()),
   10887                          Result);
   10888   }
   10889 
   10890   return Result;
   10891 }
   10892 
   10893 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   10894   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   10895 
   10896   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   10897   // global base reg.
   10898   unsigned char OpFlag = 0;
   10899   unsigned WrapperKind = X86ISD::Wrapper;
   10900   CodeModel::Model M = DAG.getTarget().getCodeModel();
   10901 
   10902   if (Subtarget->isPICStyleRIPRel() &&
   10903       (M == CodeModel::Small || M == CodeModel::Kernel))
   10904     WrapperKind = X86ISD::WrapperRIP;
   10905   else if (Subtarget->isPICStyleGOT())
   10906     OpFlag = X86II::MO_GOTOFF;
   10907   else if (Subtarget->isPICStyleStubPIC())
   10908     OpFlag = X86II::MO_PIC_BASE_OFFSET;
   10909 
   10910   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
   10911                                           OpFlag);
   10912   SDLoc DL(JT);
   10913   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   10914 
   10915   // With PIC, the address is actually $g + Offset.
   10916   if (OpFlag)
   10917     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   10918                          DAG.getNode(X86ISD::GlobalBaseReg,
   10919                                      SDLoc(), getPointerTy()),
   10920                          Result);
   10921 
   10922   return Result;
   10923 }
   10924 
   10925 SDValue
   10926 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   10927   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
   10928 
   10929   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   10930   // global base reg.
   10931   unsigned char OpFlag = 0;
   10932   unsigned WrapperKind = X86ISD::Wrapper;
   10933   CodeModel::Model M = DAG.getTarget().getCodeModel();
   10934 
   10935   if (Subtarget->isPICStyleRIPRel() &&
   10936       (M == CodeModel::Small || M == CodeModel::Kernel)) {
   10937     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
   10938       OpFlag = X86II::MO_GOTPCREL;
   10939     WrapperKind = X86ISD::WrapperRIP;
   10940   } else if (Subtarget->isPICStyleGOT()) {
   10941     OpFlag = X86II::MO_GOT;
   10942   } else if (Subtarget->isPICStyleStubPIC()) {
   10943     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
   10944   } else if (Subtarget->isPICStyleStubNoDynamic()) {
   10945     OpFlag = X86II::MO_DARWIN_NONLAZY;
   10946   }
   10947 
   10948   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
   10949 
   10950   SDLoc DL(Op);
   10951   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   10952 
   10953   // With PIC, the address is actually $g + Offset.
   10954   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
   10955       !Subtarget->is64Bit()) {
   10956     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   10957                          DAG.getNode(X86ISD::GlobalBaseReg,
   10958                                      SDLoc(), getPointerTy()),
   10959                          Result);
   10960   }
   10961 
   10962   // For symbols that require a load from a stub to get the address, emit the
   10963   // load.
   10964   if (isGlobalStubReference(OpFlag))
   10965     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
   10966                          MachinePointerInfo::getGOT(), false, false, false, 0);
   10967 
   10968   return Result;
   10969 }
   10970 
   10971 SDValue
   10972 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   10973   // Create the TargetBlockAddressAddress node.
   10974   unsigned char OpFlags =
   10975     Subtarget->ClassifyBlockAddressReference();
   10976   CodeModel::Model M = DAG.getTarget().getCodeModel();
   10977   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   10978   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   10979   SDLoc dl(Op);
   10980   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
   10981                                              OpFlags);
   10982 
   10983   if (Subtarget->isPICStyleRIPRel() &&
   10984       (M == CodeModel::Small || M == CodeModel::Kernel))
   10985     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
   10986   else
   10987     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   10988 
   10989   // With PIC, the address is actually $g + Offset.
   10990   if (isGlobalRelativeToPICBase(OpFlags)) {
   10991     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
   10992                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
   10993                          Result);
   10994   }
   10995 
   10996   return Result;
   10997 }
   10998 
   10999 SDValue
   11000 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
   11001                                       int64_t Offset, SelectionDAG &DAG) const {
   11002   // Create the TargetGlobalAddress node, folding in the constant
   11003   // offset if it is legal.
   11004   unsigned char OpFlags =
   11005       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
   11006   CodeModel::Model M = DAG.getTarget().getCodeModel();
   11007   SDValue Result;
   11008   if (OpFlags == X86II::MO_NO_FLAG &&
   11009       X86::isOffsetSuitableForCodeModel(Offset, M)) {
   11010     // A direct static reference to a global.
   11011     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
   11012     Offset = 0;
   11013   } else {
   11014     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
   11015   }
   11016 
   11017   if (Subtarget->isPICStyleRIPRel() &&
   11018       (M == CodeModel::Small || M == CodeModel::Kernel))
   11019     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
   11020   else
   11021     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
   11022 
   11023   // With PIC, the address is actually $g + Offset.
   11024   if (isGlobalRelativeToPICBase(OpFlags)) {
   11025     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
   11026                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
   11027                          Result);
   11028   }
   11029 
   11030   // For globals that require a load from a stub to get the address, emit the
   11031   // load.
   11032   if (isGlobalStubReference(OpFlags))
   11033     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
   11034                          MachinePointerInfo::getGOT(), false, false, false, 0);
   11035 
   11036   // If there was a non-zero offset that we didn't fold, create an explicit
   11037   // addition for it.
   11038   if (Offset != 0)
   11039     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
   11040                          DAG.getConstant(Offset, getPointerTy()));
   11041 
   11042   return Result;
   11043 }
   11044 
   11045 SDValue
   11046 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   11047   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   11048   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
   11049   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
   11050 }
   11051 
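          // Emit an X86ISD::TLSADDR (or TLSBASEADDR for local-dynamic TLS) node for GA and
          // return the value produced in ReturnReg. The node is later expanded into the
          // actual call sequence that computes the TLS address, which is why MFI is told
          // below that this function adjusts the stack and makes calls.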
   11052 static SDValue
   11053 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   11054            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
   11055            unsigned char OperandFlags, bool LocalDynamic = false) {
   11056   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   11057   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   11058   SDLoc dl(GA);
   11059   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   11060                                            GA->getValueType(0),
   11061                                            GA->getOffset(),
   11062                                            OperandFlags);
   11063 
   11064   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
   11065                                            : X86ISD::TLSADDR;
   11066 
   11067   if (InFlag) {
   11068     SDValue Ops[] = { Chain,  TGA, *InFlag };
   11069     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   11070   } else {
   11071     SDValue Ops[]  = { Chain, TGA };
   11072     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   11073   }
   11074 
    11075   // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
   11076   MFI->setAdjustsStack(true);
   11077   MFI->setHasCalls(true);
   11078 
   11079   SDValue Flag = Chain.getValue(1);
   11080   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
   11081 }
   11082 
   11083 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
   11084 static SDValue
   11085 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   11086                                 const EVT PtrVT) {
   11087   SDValue InFlag;
   11088   SDLoc dl(GA);  // ? function entry point might be better
   11089   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   11090                                    DAG.getNode(X86ISD::GlobalBaseReg,
   11091                                                SDLoc(), PtrVT), InFlag);
   11092   InFlag = Chain.getValue(1);
   11093 
   11094   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
   11095 }
   11096 
   11097 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
   11098 static SDValue
   11099 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   11100                                 const EVT PtrVT) {
   11101   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
   11102                     X86::RAX, X86II::MO_TLSGD);
   11103 }
   11104 
   11105 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
   11106                                            SelectionDAG &DAG,
   11107                                            const EVT PtrVT,
   11108                                            bool is64Bit) {
   11109   SDLoc dl(GA);
   11110 
   11111   // Get the start address of the TLS block for this module.
   11112   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
   11113       .getInfo<X86MachineFunctionInfo>();
   11114   MFI->incNumLocalDynamicTLSAccesses();
   11115 
   11116   SDValue Base;
   11117   if (is64Bit) {
   11118     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
   11119                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
   11120   } else {
   11121     SDValue InFlag;
   11122     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   11123         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
   11124     InFlag = Chain.getValue(1);
   11125     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
   11126                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
   11127   }
   11128 
   11129   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
   11130   // of Base.
   11131 
   11132   // Build x@dtpoff.
   11133   unsigned char OperandFlags = X86II::MO_DTPOFF;
   11134   unsigned WrapperKind = X86ISD::Wrapper;
   11135   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   11136                                            GA->getValueType(0),
   11137                                            GA->getOffset(), OperandFlags);
   11138   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   11139 
   11140   // Add x@dtpoff with the base.
   11141   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
   11142 }
   11143 
   11144 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
   11145 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   11146                                    const EVT PtrVT, TLSModel::Model model,
   11147                                    bool is64Bit, bool isPIC) {
   11148   SDLoc dl(GA);
   11149 
   11150   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
   11151   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
   11152                                                          is64Bit ? 257 : 256));
   11153 
   11154   SDValue ThreadPointer =
   11155       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
   11156                   MachinePointerInfo(Ptr), false, false, false, 0);
   11157 
   11158   unsigned char OperandFlags = 0;
   11159   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
   11160   // initialexec.
   11161   unsigned WrapperKind = X86ISD::Wrapper;
   11162   if (model == TLSModel::LocalExec) {
   11163     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
   11164   } else if (model == TLSModel::InitialExec) {
   11165     if (is64Bit) {
   11166       OperandFlags = X86II::MO_GOTTPOFF;
   11167       WrapperKind = X86ISD::WrapperRIP;
   11168     } else {
   11169       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
   11170     }
   11171   } else {
   11172     llvm_unreachable("Unexpected model");
   11173   }
   11174 
   11175   // emit "addl x@ntpoff,%eax" (local exec)
   11176   // or "addl x@indntpoff,%eax" (initial exec)
    11177   // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
   11178   SDValue TGA =
   11179       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
   11180                                  GA->getOffset(), OperandFlags);
   11181   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   11182 
   11183   if (model == TLSModel::InitialExec) {
   11184     if (isPIC && !is64Bit) {
   11185       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
   11186                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
   11187                            Offset);
   11188     }
   11189 
   11190     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
   11191                          MachinePointerInfo::getGOT(), false, false, false, 0);
   11192   }
   11193 
   11194   // The address of the thread local variable is the add of the thread
   11195   // pointer with the offset of the variable.
   11196   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
   11197 }
   11198 
   11199 SDValue
   11200 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   11201 
   11202   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   11203   const GlobalValue *GV = GA->getGlobal();
   11204 
   11205   if (Subtarget->isTargetELF()) {
   11206     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
   11207 
   11208     switch (model) {
   11209       case TLSModel::GeneralDynamic:
   11210         if (Subtarget->is64Bit())
   11211           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
   11212         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
   11213       case TLSModel::LocalDynamic:
   11214         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
   11215                                            Subtarget->is64Bit());
   11216       case TLSModel::InitialExec:
   11217       case TLSModel::LocalExec:
   11218         return LowerToTLSExecModel(
   11219             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
   11220             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
   11221     }
   11222     llvm_unreachable("Unknown TLS model.");
   11223   }
   11224 
   11225   if (Subtarget->isTargetDarwin()) {
   11226     // Darwin only has one model of TLS.  Lower to that.
   11227     unsigned char OpFlag = 0;
   11228     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
   11229                            X86ISD::WrapperRIP : X86ISD::Wrapper;
   11230 
   11231     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   11232     // global base reg.
   11233     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
   11234                  !Subtarget->is64Bit();
   11235     if (PIC32)
   11236       OpFlag = X86II::MO_TLVP_PIC_BASE;
   11237     else
   11238       OpFlag = X86II::MO_TLVP;
   11239     SDLoc DL(Op);
   11240     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
   11241                                                 GA->getValueType(0),
   11242                                                 GA->getOffset(), OpFlag);
   11243     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   11244 
   11245     // With PIC32, the address is actually $g + Offset.
   11246     if (PIC32)
   11247       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   11248                            DAG.getNode(X86ISD::GlobalBaseReg,
   11249                                        SDLoc(), getPointerTy()),
   11250                            Offset);
   11251 
    11252     // Lowering the machine ISD will make sure everything is in the right
    11253     // location.
   11254     SDValue Chain = DAG.getEntryNode();
   11255     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   11256     SDValue Args[] = { Chain, Offset };
   11257     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
   11258 
    11259     // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
   11260     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   11261     MFI->setAdjustsStack(true);
   11262 
   11263     // And our return value (tls address) is in the standard call return value
   11264     // location.
   11265     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
   11266     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
   11267                               Chain.getValue(1));
   11268   }
   11269 
   11270   if (Subtarget->isTargetKnownWindowsMSVC() ||
   11271       Subtarget->isTargetWindowsGNU()) {
    11272     // Just use the implicit TLS architecture.
    11273     // We need to generate something similar to:
   11274     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
   11275     //                                  ; from TEB
   11276     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
   11277     //   mov     rcx, qword [rdx+rcx*8]
   11278     //   mov     eax, .tls$:tlsvar
   11279     //   [rax+rcx] contains the address
   11280     // Windows 64bit: gs:0x58
   11281     // Windows 32bit: fs:__tls_array
   11282 
   11283     SDLoc dl(GA);
   11284     SDValue Chain = DAG.getEntryNode();
   11285 
   11286     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
   11287     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
   11288     // use its literal value of 0x2C.
   11289     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
   11290                                         ? Type::getInt8PtrTy(*DAG.getContext(),
   11291                                                              256)
   11292                                         : Type::getInt32PtrTy(*DAG.getContext(),
   11293                                                               257));
   11294 
   11295     SDValue TlsArray =
   11296         Subtarget->is64Bit()
   11297             ? DAG.getIntPtrConstant(0x58)
   11298             : (Subtarget->isTargetWindowsGNU()
   11299                    ? DAG.getIntPtrConstant(0x2C)
   11300                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
   11301 
   11302     SDValue ThreadPointer =
   11303         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
   11304                     MachinePointerInfo(Ptr), false, false, false, 0);
   11305 
   11306     // Load the _tls_index variable
   11307     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
   11308     if (Subtarget->is64Bit())
   11309       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
   11310                            IDX, MachinePointerInfo(), MVT::i32,
   11311                            false, false, false, 0);
   11312     else
   11313       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
   11314                         false, false, false, 0);
   11315 
   11316     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
   11317                                     getPointerTy());
   11318     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
   11319 
   11320     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
   11321     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
   11322                       false, false, false, 0);
   11323 
   11324     // Get the offset of start of .tls section
   11325     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   11326                                              GA->getValueType(0),
   11327                                              GA->getOffset(), X86II::MO_SECREL);
   11328     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
   11329 
   11330     // The address of the thread local variable is the add of the thread
   11331     // pointer with the offset of the variable.
   11332     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
   11333   }
   11334 
   11335   llvm_unreachable("TLS not implemented for this target.");
   11336 }
   11337 
   11338 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
   11339 /// and take a 2 x i32 value to shift plus a shift amount.
   11340 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   11341   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   11342   MVT VT = Op.getSimpleValueType();
   11343   unsigned VTBits = VT.getSizeInBits();
   11344   SDLoc dl(Op);
   11345   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   11346   SDValue ShOpLo = Op.getOperand(0);
   11347   SDValue ShOpHi = Op.getOperand(1);
   11348   SDValue ShAmt  = Op.getOperand(2);
    11349   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior, but the
    11350   // generic ISD nodes don't. Insert an AND to be safe; it is optimized away
    11351   // during isel.
   11352   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   11353                                   DAG.getConstant(VTBits - 1, MVT::i8));
   11354   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
   11355                                      DAG.getConstant(VTBits - 1, MVT::i8))
   11356                        : DAG.getConstant(0, VT);
   11357 
   11358   SDValue Tmp2, Tmp3;
   11359   if (Op.getOpcode() == ISD::SHL_PARTS) {
   11360     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
   11361     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
   11362   } else {
   11363     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
   11364     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
   11365   }
   11366 
    11367   // If the shift amount is greater than or equal to the width of a part, we
    11368   // can't rely on the results of shld/shrd. Insert a test and select the
    11369   // appropriate values for large shift amounts.
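            // For example, a 64-bit SHL by 40 done in i32 parts sets bit 5 of the
            // amount, so the CMOVs below pick Hi = Lo << (40 & 31) and Lo = 0;
            // for amounts below 32 the SHLD/SHL results are used unchanged.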
   11370   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   11371                                 DAG.getConstant(VTBits, MVT::i8));
   11372   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   11373                              AndNode, DAG.getConstant(0, MVT::i8));
   11374 
   11375   SDValue Hi, Lo;
   11376   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   11377   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
   11378   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
   11379 
   11380   if (Op.getOpcode() == ISD::SHL_PARTS) {
   11381     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
   11382     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   11383   } else {
   11384     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
   11385     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   11386   }
   11387 
   11388   SDValue Ops[2] = { Lo, Hi };
   11389   return DAG.getMergeValues(Ops, dl);
   11390 }
   11391 
   11392 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   11393                                            SelectionDAG &DAG) const {
   11394   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
   11395   SDLoc dl(Op);
   11396 
   11397   if (SrcVT.isVector()) {
   11398     if (SrcVT.getVectorElementType() == MVT::i1) {
   11399       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
   11400       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
   11401                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
   11402                                      Op.getOperand(0)));
   11403     }
   11404     return SDValue();
   11405   }
   11406 
   11407   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
   11408          "Unknown SINT_TO_FP to lower!");
   11409 
   11410   // These are really Legal; return the operand so the caller accepts it as
   11411   // Legal.
   11412   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
   11413     return Op;
   11414   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
   11415       Subtarget->is64Bit()) {
   11416     return Op;
   11417   }
   11418 
   11419   unsigned Size = SrcVT.getSizeInBits()/8;
   11420   MachineFunction &MF = DAG.getMachineFunction();
   11421   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
   11422   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   11423   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   11424                                StackSlot,
   11425                                MachinePointerInfo::getFixedStack(SSFI),
   11426                                false, false, 0);
   11427   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
   11428 }
   11429 
   11430 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
   11431                                      SDValue StackSlot,
   11432                                      SelectionDAG &DAG) const {
   11433   // Build the FILD
   11434   SDLoc DL(Op);
   11435   SDVTList Tys;
   11436   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
   11437   if (useSSE)
   11438     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
   11439   else
   11440     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
   11441 
   11442   unsigned ByteSize = SrcVT.getSizeInBits()/8;
   11443 
   11444   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
   11445   MachineMemOperand *MMO;
   11446   if (FI) {
   11447     int SSFI = FI->getIndex();
   11448     MMO =
   11449       DAG.getMachineFunction()
   11450       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   11451                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
   11452   } else {
   11453     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
   11454     StackSlot = StackSlot.getOperand(1);
   11455   }
   11456   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   11457   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
   11458                                            X86ISD::FILD, DL,
   11459                                            Tys, Ops, SrcVT, MMO);
   11460 
   11461   if (useSSE) {
   11462     Chain = Result.getValue(1);
   11463     SDValue InFlag = Result.getValue(2);
   11464 
   11465     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
   11466     // shouldn't be necessary except that RFP cannot be live across
    11467     // multiple blocks. When the stackifier is fixed, they can be uncoupled.
   11468     MachineFunction &MF = DAG.getMachineFunction();
   11469     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
   11470     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
   11471     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   11472     Tys = DAG.getVTList(MVT::Other);
   11473     SDValue Ops[] = {
   11474       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
   11475     };
   11476     MachineMemOperand *MMO =
   11477       DAG.getMachineFunction()
   11478       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   11479                             MachineMemOperand::MOStore, SSFISize, SSFISize);
   11480 
   11481     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
   11482                                     Ops, Op.getValueType(), MMO);
   11483     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
   11484                          MachinePointerInfo::getFixedStack(SSFI),
   11485                          false, false, false, 0);
   11486   }
   11487 
   11488   return Result;
   11489 }
   11490 
   11491 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
   11492 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
   11493                                                SelectionDAG &DAG) const {
    11494   // This algorithm is not obvious. Here is what we're trying to output:
   11495   /*
   11496      movq       %rax,  %xmm0
   11497      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
   11498      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
   11499      #ifdef __SSE3__
   11500        haddpd   %xmm0, %xmm0
   11501      #else
   11502        pshufd   $0x4e, %xmm0, %xmm1
   11503        addpd    %xmm1, %xmm0
   11504      #endif
   11505   */
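            // Why the constants work (a sketch): after the punpckldq, lane 0 of %xmm0
            // holds the bit pattern 0x43300000_<lo32> and lane 1 holds
            // 0x45300000_<hi32>. As doubles these are exactly 2^52 + lo32 and
            // 2^84 + hi32 * 2^32. Subtracting c1 = { 2^52, 2^84 } leaves
            // { lo32, hi32 * 2^32 }, and the horizontal add yields
            // hi32 * 2^32 + lo32, i.e. the original u64 value as a double.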
   11506 
   11507   SDLoc dl(Op);
   11508   LLVMContext *Context = DAG.getContext();
   11509 
   11510   // Build some magic constants.
   11511   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
   11512   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   11513   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
   11514 
   11515   SmallVector<Constant*,2> CV1;
   11516   CV1.push_back(
   11517     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   11518                                       APInt(64, 0x4330000000000000ULL))));
   11519   CV1.push_back(
   11520     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
   11521                                       APInt(64, 0x4530000000000000ULL))));
   11522   Constant *C1 = ConstantVector::get(CV1);
   11523   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
   11524 
   11525   // Load the 64-bit value into an XMM register.
   11526   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   11527                             Op.getOperand(0));
   11528   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
   11529                               MachinePointerInfo::getConstantPool(),
   11530                               false, false, false, 16);
   11531   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
   11532                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
   11533                               CLod0);
   11534 
   11535   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
   11536                               MachinePointerInfo::getConstantPool(),
   11537                               false, false, false, 16);
   11538   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
   11539   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   11540   SDValue Result;
   11541 
   11542   if (Subtarget->hasSSE3()) {
   11543     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
   11544     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   11545   } else {
   11546     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
   11547     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
   11548                                            S2F, 0x4E, DAG);
   11549     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
   11550                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
   11551                          Sub);
   11552   }
   11553 
   11554   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
   11555                      DAG.getIntPtrConstant(0));
   11556 }
   11557 
   11558 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
   11559 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
   11560                                                SelectionDAG &DAG) const {
   11561   SDLoc dl(Op);
   11562   // FP constant to bias correct the final result.
   11563   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
   11564                                    MVT::f64);
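            // The trick, roughly: 0x4330000000000000 is the bit pattern of 2^52. ORing
            // the zero-extended 32-bit value into the low mantissa bits produces a
            // double equal to exactly 2^52 + x, so subtracting the bias recovers x
            // without rounding.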
   11565 
   11566   // Load the 32-bit value into an XMM register.
   11567   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
   11568                              Op.getOperand(0));
   11569 
   11570   // Zero out the upper parts of the register.
   11571   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
   11572 
   11573   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   11574                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
   11575                      DAG.getIntPtrConstant(0));
   11576 
    11577   // OR the load with the bias.
   11578   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
   11579                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
   11580                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   11581                                                    MVT::v2f64, Load)),
   11582                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
   11583                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   11584                                                    MVT::v2f64, Bias)));
   11585   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   11586                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
   11587                    DAG.getIntPtrConstant(0));
   11588 
   11589   // Subtract the bias.
   11590   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
   11591 
   11592   // Handle final rounding.
   11593   EVT DestVT = Op.getValueType();
   11594 
   11595   if (DestVT.bitsLT(MVT::f64))
   11596     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
   11597                        DAG.getIntPtrConstant(0));
   11598   if (DestVT.bitsGT(MVT::f64))
   11599     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
   11600 
    11601   // The destination is f64, so no rounding is needed.
   11602   return Sub;
   11603 }
   11604 
   11605 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
   11606                                      const X86Subtarget &Subtarget) {
   11607   // The algorithm is the following:
   11608   // #ifdef __SSE4_1__
   11609   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
   11610   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
   11611   //                                 (uint4) 0x53000000, 0xaa);
   11612   // #else
   11613   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
   11614   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
   11615   // #endif
   11616   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   11617   //     return (float4) lo + fhi;
   11618 
   11619   SDLoc DL(Op);
   11620   SDValue V = Op->getOperand(0);
   11621   EVT VecIntVT = V.getValueType();
   11622   bool Is128 = VecIntVT == MVT::v4i32;
   11623   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
    11624   // If we are converting to something other than the supported type, e.g., to
    11625   // v4f64, abort early.
   11626   if (VecFloatVT != Op->getValueType(0))
   11627     return SDValue();
   11628 
   11629   unsigned NumElts = VecIntVT.getVectorNumElements();
   11630   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
   11631          "Unsupported custom type");
   11632   assert(NumElts <= 8 && "The size of the constant array must be fixed");
   11633 
    11634   // The #ifdef and #else branches have in common:
   11635   // - The vector of constants:
   11636   // -- 0x4b000000
   11637   // -- 0x53000000
   11638   // - A shift:
   11639   // -- v >> 16
   11640 
   11641   // Create the splat vector for 0x4b000000.
   11642   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
   11643   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
   11644                            CstLow, CstLow, CstLow, CstLow};
   11645   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
   11646                                   makeArrayRef(&CstLowArray[0], NumElts));
   11647   // Create the splat vector for 0x53000000.
   11648   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
   11649   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
   11650                             CstHigh, CstHigh, CstHigh, CstHigh};
   11651   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
   11652                                    makeArrayRef(&CstHighArray[0], NumElts));
   11653 
   11654   // Create the right shift.
   11655   SDValue CstShift = DAG.getConstant(16, MVT::i32);
   11656   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
   11657                              CstShift, CstShift, CstShift, CstShift};
   11658   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
   11659                                     makeArrayRef(&CstShiftArray[0], NumElts));
   11660   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
   11661 
   11662   SDValue Low, High;
   11663   if (Subtarget.hasSSE41()) {
   11664     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
   11665     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
   11666     SDValue VecCstLowBitcast =
   11667         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
   11668     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
   11669     // Low will be bitcasted right away, so do not bother bitcasting back to its
   11670     // original type.
   11671     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
   11672                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
   11673     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
   11674     //                                 (uint4) 0x53000000, 0xaa);
   11675     SDValue VecCstHighBitcast =
   11676         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
   11677     SDValue VecShiftBitcast =
   11678         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
   11679     // High will be bitcasted right away, so do not bother bitcasting back to
   11680     // its original type.
   11681     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
   11682                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
   11683   } else {
   11684     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
   11685     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
   11686                                      CstMask, CstMask, CstMask);
   11687     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
   11688     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
   11689     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
   11690 
   11691     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
   11692     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
   11693   }
   11694 
   11695   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
   11696   SDValue CstFAdd = DAG.getConstantFP(
   11697       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
   11698   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
   11699                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
   11700   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
   11701                                    makeArrayRef(&CstFAddArray[0], NumElts));
   11702 
   11703   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   11704   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
   11705   SDValue FHigh =
   11706       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
   11707   //     return (float4) lo + fhi;
   11708   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
   11709   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
   11710 }
   11711 
   11712 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
   11713                                                SelectionDAG &DAG) const {
   11714   SDValue N0 = Op.getOperand(0);
   11715   MVT SVT = N0.getSimpleValueType();
   11716   SDLoc dl(Op);
   11717 
   11718   switch (SVT.SimpleTy) {
   11719   default:
   11720     llvm_unreachable("Custom UINT_TO_FP is not supported!");
   11721   case MVT::v4i8:
   11722   case MVT::v4i16:
   11723   case MVT::v8i8:
   11724   case MVT::v8i16: {
   11725     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
   11726     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
   11727                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
   11728   }
   11729   case MVT::v4i32:
   11730   case MVT::v8i32:
   11731     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
   11732   }
   11733   llvm_unreachable(nullptr);
   11734 }
   11735 
   11736 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   11737                                            SelectionDAG &DAG) const {
   11738   SDValue N0 = Op.getOperand(0);
   11739   SDLoc dl(Op);
   11740 
   11741   if (Op.getValueType().isVector())
   11742     return lowerUINT_TO_FP_vec(Op, DAG);
   11743 
   11744   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
   11745   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
   11746   // the optimization here.
   11747   if (DAG.SignBitIsZero(N0))
   11748     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
   11749 
   11750   MVT SrcVT = N0.getSimpleValueType();
   11751   MVT DstVT = Op.getSimpleValueType();
   11752   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
   11753     return LowerUINT_TO_FP_i64(Op, DAG);
   11754   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
   11755     return LowerUINT_TO_FP_i32(Op, DAG);
   11756   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
   11757     return SDValue();
   11758 
   11759   // Make a 64-bit buffer, and use it to build an FILD.
   11760   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
   11761   if (SrcVT == MVT::i32) {
   11762     SDValue WordOff = DAG.getConstant(4, getPointerTy());
   11763     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
   11764                                      getPointerTy(), StackSlot, WordOff);
   11765     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   11766                                   StackSlot, MachinePointerInfo(),
   11767                                   false, false, 0);
   11768     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
   11769                                   OffsetSlot, MachinePointerInfo(),
   11770                                   false, false, 0);
   11771     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
   11772     return Fild;
   11773   }
   11774 
   11775   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
   11776   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   11777                                StackSlot, MachinePointerInfo(),
   11778                                false, false, 0);
    11779   // For an i64 source, we need to add the appropriate power of 2 if the input
    11780   // was negative.  This is the same as the optimization in
    11781   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
    11782   // we must be careful to do the computation in x87 extended precision, not
    11783   // in SSE. (The generic code can't know it's OK to do this, or how to.)
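            // A sketch of the fixup: the fudge constant is 2^64 as a float
            // (0x5F800000). FILD reads the stored 64 bits as signed, so an input with
            // its top bit set comes back as u - 2^64; selecting the 2^64 half of the
            // (0, FF) pair below and adding it in x87 precision recovers u. For
            // non-negative inputs the 0.0 half is selected and the add is a no-op.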
   11784   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
   11785   MachineMemOperand *MMO =
   11786     DAG.getMachineFunction()
   11787     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   11788                           MachineMemOperand::MOLoad, 8, 8);
   11789 
   11790   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
   11791   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
   11792   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
   11793                                          MVT::i64, MMO);
   11794 
   11795   APInt FF(32, 0x5F800000ULL);
   11796 
   11797   // Check whether the sign bit is set.
   11798   SDValue SignSet = DAG.getSetCC(dl,
   11799                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
   11800                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
   11801                                  ISD::SETLT);
   11802 
    11803   // Build a 64-bit pair (0, FF) in the constant pool, with FF in the low bits.
   11804   SDValue FudgePtr = DAG.getConstantPool(
   11805                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
   11806                                          getPointerTy());
   11807 
   11808   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
   11809   SDValue Zero = DAG.getIntPtrConstant(0);
   11810   SDValue Four = DAG.getIntPtrConstant(4);
   11811   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
   11812                                Zero, Four);
   11813   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
   11814 
   11815   // Load the value out, extending it from f32 to f80.
   11816   // FIXME: Avoid the extend by constructing the right constant pool?
   11817   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
   11818                                  FudgePtr, MachinePointerInfo::getConstantPool(),
   11819                                  MVT::f32, false, false, false, 4);
   11820   // Extend everything to 80 bits to force it to be done on x87.
   11821   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
   11822   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
   11823 }
   11824 
   11825 std::pair<SDValue,SDValue>
   11826 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   11827                                     bool IsSigned, bool IsReplace) const {
   11828   SDLoc DL(Op);
   11829 
   11830   EVT DstTy = Op.getValueType();
   11831 
   11832   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
   11833     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
   11834     DstTy = MVT::i64;
   11835   }
   11836 
   11837   assert(DstTy.getSimpleVT() <= MVT::i64 &&
   11838          DstTy.getSimpleVT() >= MVT::i16 &&
   11839          "Unknown FP_TO_INT to lower!");
   11840 
   11841   // These are really Legal.
   11842   if (DstTy == MVT::i32 &&
   11843       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   11844     return std::make_pair(SDValue(), SDValue());
   11845   if (Subtarget->is64Bit() &&
   11846       DstTy == MVT::i64 &&
   11847       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   11848     return std::make_pair(SDValue(), SDValue());
   11849 
   11850   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
   11851   // stack slot, or into the FTOL runtime function.
   11852   MachineFunction &MF = DAG.getMachineFunction();
   11853   unsigned MemSize = DstTy.getSizeInBits()/8;
   11854   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   11855   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   11856 
   11857   unsigned Opc;
   11858   if (!IsSigned && isIntegerTypeFTOL(DstTy))
   11859     Opc = X86ISD::WIN_FTOL;
   11860   else
   11861     switch (DstTy.getSimpleVT().SimpleTy) {
   11862     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
   11863     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
   11864     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
   11865     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
   11866     }
   11867 
   11868   SDValue Chain = DAG.getEntryNode();
   11869   SDValue Value = Op.getOperand(0);
   11870   EVT TheVT = Op.getOperand(0).getValueType();
    11871   // FIXME: This causes a redundant load/store if the SSE-class value is
    11872   // already in memory, such as when it is on the call stack.
   11873   if (isScalarFPTypeInSSEReg(TheVT)) {
   11874     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
   11875     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
   11876                          MachinePointerInfo::getFixedStack(SSFI),
   11877                          false, false, 0);
   11878     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
   11879     SDValue Ops[] = {
   11880       Chain, StackSlot, DAG.getValueType(TheVT)
   11881     };
   11882 
   11883     MachineMemOperand *MMO =
   11884       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   11885                               MachineMemOperand::MOLoad, MemSize, MemSize);
   11886     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
   11887     Chain = Value.getValue(1);
   11888     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
   11889     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   11890   }
   11891 
   11892   MachineMemOperand *MMO =
   11893     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   11894                             MachineMemOperand::MOStore, MemSize, MemSize);
   11895 
   11896   if (Opc != X86ISD::WIN_FTOL) {
   11897     // Build the FP_TO_INT*_IN_MEM
   11898     SDValue Ops[] = { Chain, Value, StackSlot };
   11899     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   11900                                            Ops, DstTy, MMO);
   11901     return std::make_pair(FIST, StackSlot);
   11902   } else {
   11903     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
   11904       DAG.getVTList(MVT::Other, MVT::Glue),
   11905       Chain, Value);
   11906     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
   11907       MVT::i32, ftol.getValue(1));
   11908     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
   11909       MVT::i32, eax.getValue(2));
   11910     SDValue Ops[] = { eax, edx };
   11911     SDValue pair = IsReplace
   11912       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
   11913       : DAG.getMergeValues(Ops, DL);
   11914     return std::make_pair(pair, SDValue());
   11915   }
   11916 }
   11917 
   11918 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   11919                               const X86Subtarget *Subtarget) {
   11920   MVT VT = Op->getSimpleValueType(0);
   11921   SDValue In = Op->getOperand(0);
   11922   MVT InVT = In.getSimpleValueType();
   11923   SDLoc dl(Op);
   11924 
   11925   // Optimize vectors in AVX mode:
   11926   //
   11927   //   v8i16 -> v8i32
   11928   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
   11929   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
   11930   //   Concat upper and lower parts.
   11931   //
   11932   //   v4i32 -> v4i64
    11933   //   Use vpunpckldq for the 2 lower elements:  v4i32 -> v2i64.
    11934   //   Use vpunpckhdq for the 2 upper elements:  v4i32 -> v2i64.
   11935   //   Concat upper and lower parts.
   11936   //
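            // For example, without AVX2 a zext of v8i16 to v8i32 is emitted below as:
            // unpack the low and high halves of the source against an all-zero vector
            // (interleaving each 16-bit element with a zero), bitcast each 128-bit
            // half to v4i32, and concatenate the halves. ANY_EXTEND interleaves with
            // undef instead of zero.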
   11937 
   11938   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
   11939       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
   11940       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
   11941     return SDValue();
   11942 
   11943   if (Subtarget->hasInt256())
   11944     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
   11945 
   11946   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
   11947   SDValue Undef = DAG.getUNDEF(InVT);
   11948   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
   11949   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   11950   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   11951 
   11952   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
   11953                              VT.getVectorNumElements()/2);
   11954 
   11955   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
   11956   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
   11957 
   11958   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   11959 }
   11960 
    11961 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
   11962                                         SelectionDAG &DAG) {
   11963   MVT VT = Op->getSimpleValueType(0);
   11964   SDValue In = Op->getOperand(0);
   11965   MVT InVT = In.getSimpleValueType();
   11966   SDLoc DL(Op);
   11967   unsigned int NumElts = VT.getVectorNumElements();
   11968   if (NumElts != 8 && NumElts != 16)
   11969     return SDValue();
   11970 
   11971   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
   11972     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
   11973 
   11974   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
   11975   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    11976   // Now only mask (i1) extension remains.
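            // The extension is implemented by broadcasting the loaded constant 1
            // under the i1 mask, so that each element becomes 1 where the
            // corresponding mask bit is set and 0 elsewhere, i.e. zext(i1) per lane.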
   11977   assert(InVT.getVectorElementType() == MVT::i1);
   11978   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
   11979   const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
   11980   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
   11981   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   11982   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
   11983                            MachinePointerInfo::getConstantPool(),
   11984                            false, false, false, Alignment);
   11985 
   11986   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
   11987   if (VT.is512BitVector())
   11988     return Brcst;
   11989   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
   11990 }
   11991 
   11992 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   11993                                SelectionDAG &DAG) {
   11994   if (Subtarget->hasFp256()) {
   11995     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
   11996     if (Res.getNode())
   11997       return Res;
   11998   }
   11999 
   12000   return SDValue();
   12001 }
   12002 
   12003 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   12004                                 SelectionDAG &DAG) {
   12005   SDLoc DL(Op);
   12006   MVT VT = Op.getSimpleValueType();
   12007   SDValue In = Op.getOperand(0);
   12008   MVT SVT = In.getSimpleValueType();
   12009 
   12010   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
   12011     return LowerZERO_EXTEND_AVX512(Op, DAG);
   12012 
   12013   if (Subtarget->hasFp256()) {
   12014     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
   12015     if (Res.getNode())
   12016       return Res;
   12017   }
   12018 
   12019   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
   12020          VT.getVectorNumElements() != SVT.getVectorNumElements());
   12021   return SDValue();
   12022 }
   12023 
   12024 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   12025   SDLoc DL(Op);
   12026   MVT VT = Op.getSimpleValueType();
   12027   SDValue In = Op.getOperand(0);
   12028   MVT InVT = In.getSimpleValueType();
   12029 
   12030   if (VT == MVT::i1) {
   12031     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
   12032            "Invalid scalar TRUNCATE operation");
   12033     if (InVT.getSizeInBits() >= 32)
   12034       return SDValue();
   12035     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
   12036     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
   12037   }
   12038   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
   12039          "Invalid TRUNCATE operation");
   12040 
   12041   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
    12042     if (VT.getVectorElementType().getSizeInBits() >= 8)
   12043       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
   12044 
   12045     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
   12046     unsigned NumElts = InVT.getVectorNumElements();
   12047     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
   12048     if (InVT.getSizeInBits() < 512) {
   12049       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
   12050       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
   12051       InVT = ExtVT;
   12052     }
   12053 
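              // The i1 truncation below keeps just the low bit of each element:
              // broadcast a splat of 1, AND it with the input, and let TESTM set
              // each mask bit to (element & 1) != 0.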
   12054     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
   12055     const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
   12056     SDValue CP = DAG.getConstantPool(C, getPointerTy());
   12057     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   12058     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
   12059                            MachinePointerInfo::getConstantPool(),
   12060                            false, false, false, Alignment);
   12061     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
   12062     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
   12063     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
   12064   }
   12065 
   12066   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
   12067     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
   12068     if (Subtarget->hasInt256()) {
   12069       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
   12070       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
   12071       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
   12072                                 ShufMask);
   12073       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
   12074                          DAG.getIntPtrConstant(0));
   12075     }
   12076 
   12077     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   12078                                DAG.getIntPtrConstant(0));
   12079     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   12080                                DAG.getIntPtrConstant(2));
   12081     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
   12082     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
   12083     static const int ShufMask[] = {0, 2, 4, 6};
   12084     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
   12085   }
   12086 
   12087   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    12088     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
   12089     if (Subtarget->hasInt256()) {
   12090       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
   12091 
   12092       SmallVector<SDValue,32> pshufbMask;
   12093       for (unsigned i = 0; i < 2; ++i) {
   12094         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
   12095         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
   12096         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
   12097         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
   12098         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
   12099         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
   12100         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
   12101         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
   12102         for (unsigned j = 0; j < 8; ++j)
   12103           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
   12104       }
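                // A note on the mask just built: within each 128-bit lane it selects
                // bytes 0,1,4,5,8,9,12,13 (the low 16 bits of each dword) into the
                // low 8 bytes and writes zeros elsewhere (index 0x80 means zero).
                // The v4i64 shuffle below then packs the two low quadwords together.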
   12105       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
   12106       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
   12107       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
   12108 
   12109       static const int ShufMask[] = {0,  2,  -1,  -1};
   12110       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
   12111                                 &ShufMask[0]);
   12112       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   12113                        DAG.getIntPtrConstant(0));
   12114       return DAG.getNode(ISD::BITCAST, DL, VT, In);
   12115     }
   12116 
   12117     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   12118                                DAG.getIntPtrConstant(0));
   12119 
   12120     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   12121                                DAG.getIntPtrConstant(4));
   12122 
   12123     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
   12124     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
   12125 
   12126     // The PSHUFB mask:
   12127     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
   12128                                    -1, -1, -1, -1, -1, -1, -1, -1};
   12129 
   12130     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
   12131     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
   12132     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
   12133 
   12134     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
   12135     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
   12136 
   12137     // The MOVLHPS Mask:
   12138     static const int ShufMask2[] = {0, 1, 4, 5};
   12139     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
   12140     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
   12141   }
   12142 
   12143   // Handle truncation of V256 to V128 using shuffles.
   12144   if (!VT.is128BitVector() || !InVT.is256BitVector())
   12145     return SDValue();
   12146 
   12147   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
   12148 
   12149   unsigned NumElems = VT.getVectorNumElements();
   12150   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
   12151 
   12152   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
   12153   // Prepare truncation shuffle mask
   12154   for (unsigned i = 0; i != NumElems; ++i)
   12155     MaskVec[i] = i * 2;
   12156   SDValue V = DAG.getVectorShuffle(NVT, DL,
   12157                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
   12158                                    DAG.getUNDEF(NVT), &MaskVec[0]);
   12159   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
   12160                      DAG.getIntPtrConstant(0));
   12161 }
   12162 
   12163 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
   12164                                            SelectionDAG &DAG) const {
   12165   assert(!Op.getSimpleValueType().isVector());
   12166 
   12167   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   12168     /*IsSigned=*/ true, /*IsReplace=*/ false);
   12169   SDValue FIST = Vals.first, StackSlot = Vals.second;
   12170   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
   12171   if (!FIST.getNode()) return Op;
   12172 
   12173   if (StackSlot.getNode())
   12174     // Load the result.
   12175     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
   12176                        FIST, StackSlot, MachinePointerInfo(),
   12177                        false, false, false, 0);
   12178 
   12179   // The node is the result.
   12180   return FIST;
   12181 }
   12182 
   12183 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
   12184                                            SelectionDAG &DAG) const {
   12185   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   12186     /*IsSigned=*/ false, /*IsReplace=*/ false);
   12187   SDValue FIST = Vals.first, StackSlot = Vals.second;
   12188   assert(FIST.getNode() && "Unexpected failure");
   12189 
   12190   if (StackSlot.getNode())
   12191     // Load the result.
   12192     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
   12193                        FIST, StackSlot, MachinePointerInfo(),
   12194                        false, false, false, 0);
   12195 
   12196   // The node is the result.
   12197   return FIST;
   12198 }
   12199 
   12200 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
   12201   SDLoc DL(Op);
   12202   MVT VT = Op.getSimpleValueType();
   12203   SDValue In = Op.getOperand(0);
   12204   MVT SVT = In.getSimpleValueType();
   12205 
   12206   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
   12207 
   12208   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
   12209                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
   12210                                  In, DAG.getUNDEF(SVT)));
   12211 }
   12212 
   12213 /// The only differences between FABS and FNEG are the mask and the logic op.
   12214 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
   12215 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
   12216   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
   12217          "Wrong opcode for lowering FABS or FNEG.");
   12218 
   12219   bool IsFABS = (Op.getOpcode() == ISD::FABS);
   12220 
   12221   // If this is a FABS and it has an FNEG user, bail out to fold the combination
   12222   // into an FNABS. We'll lower the FABS after that if it is still in use.
   12223   if (IsFABS)
   12224     for (SDNode *User : Op->uses())
   12225       if (User->getOpcode() == ISD::FNEG)
   12226         return Op;
   12227 
   12228   SDValue Op0 = Op.getOperand(0);
   12229   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
   12230 
   12231   SDLoc dl(Op);
   12232   MVT VT = Op.getSimpleValueType();
   12233   // Assume scalar op for initialization; update for vector if needed.
   12234   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
   12235   // generate a 16-byte vector constant and logic op even for the scalar case.
   12236   // Using a 16-byte mask allows folding the load of the mask with
   12237   // the logic op, so it can save (~4 bytes) on code size.
   12238   MVT EltVT = VT;
   12239   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   12240   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   12241   // decide if we should generate a 16-byte constant mask when we only need 4 or
   12242   // 8 bytes for the scalar case.
   12243   if (VT.isVector()) {
   12244     EltVT = VT.getVectorElementType();
   12245     NumElts = VT.getVectorNumElements();
   12246   }
   12247 
   12248   unsigned EltBits = EltVT.getSizeInBits();
   12249   LLVMContext *Context = DAG.getContext();
   12250   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
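            // Concretely, for f32 the splatted element is 0x7FFFFFFF for FABS (clear
            // the sign bit) or 0x80000000 for FNEG (flip it); the FNABS case below
            // ORs with the 0x80000000 mask, setting the sign bit.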
   12251   APInt MaskElt =
   12252     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
   12253   Constant *C = ConstantInt::get(*Context, MaskElt);
   12254   C = ConstantVector::getSplat(NumElts, C);
   12255   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   12256   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
   12257   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
   12258   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   12259                              MachinePointerInfo::getConstantPool(),
   12260                              false, false, false, Alignment);
   12261 
   12262   if (VT.isVector()) {
   12263     // For a vector, cast operands to a vector type, perform the logic op,
   12264     // and cast the result back to the original value type.
   12265     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
   12266     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
   12267     SDValue Operand = IsFNABS ?
   12268       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
   12269       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
   12270     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
   12271     return DAG.getNode(ISD::BITCAST, dl, VT,
   12272                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
   12273   }
   12274 
   12275   // If not vector, then scalar.
   12276   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
   12277   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
   12278   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
   12279 }
   12280 
   12281 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   12282   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   12283   LLVMContext *Context = DAG.getContext();
   12284   SDValue Op0 = Op.getOperand(0);
   12285   SDValue Op1 = Op.getOperand(1);
   12286   SDLoc dl(Op);
   12287   MVT VT = Op.getSimpleValueType();
   12288   MVT SrcVT = Op1.getSimpleValueType();
   12289 
   12290   // If second operand is smaller, extend it first.
   12291   if (SrcVT.bitsLT(VT)) {
   12292     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
   12293     SrcVT = VT;
   12294   }
   12295   // And if it is bigger, shrink it first.
   12296   if (SrcVT.bitsGT(VT)) {
   12297     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
   12298     SrcVT = VT;
   12299   }
   12300 
   12301   // At this point the operands and the result should have the same
   12302   // type, and that won't be f80 since that is not custom lowered.
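            // The lowering below computes the usual bitwise formula (illustrative):
            //   copysign(mag, sgn) = (mag & ~SignMask) | (sgn & SignMask)
            // where the AND over the magnitude is folded into the constant when Op0
            // is itself a ConstantFP.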
   12303 
   12304   const fltSemantics &Sem =
   12305       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
   12306   const unsigned SizeInBits = VT.getSizeInBits();
   12307 
   12308   SmallVector<Constant *, 4> CV(
   12309       VT == MVT::f64 ? 2 : 4,
   12310       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
   12311 
   12312   // First, clear all bits but the sign bit from the second operand (sign).
   12313   CV[0] = ConstantFP::get(*Context,
   12314                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
   12315   Constant *C = ConstantVector::get(CV);
   12316   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
   12317   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
   12318                               MachinePointerInfo::getConstantPool(),
   12319                               false, false, false, 16);
   12320   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
   12321 
   12322   // Next, clear the sign bit from the first operand (magnitude).
   12323   // If it's a constant, we can clear it here.
   12324   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
   12325     APFloat APF = Op0CN->getValueAPF();
   12326     // If the magnitude is a positive zero, the sign bit alone is enough.
   12327     if (APF.isPosZero())
   12328       return SignBit;
   12329     APF.clearSign();
   12330     CV[0] = ConstantFP::get(*Context, APF);
   12331   } else {
   12332     CV[0] = ConstantFP::get(
   12333         *Context,
   12334         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
   12335   }
   12336   C = ConstantVector::get(CV);
   12337   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
   12338   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
   12339                             MachinePointerInfo::getConstantPool(),
   12340                             false, false, false, 16);
   12341   // If the magnitude operand wasn't a constant, we need to AND out the sign.
   12342   if (!isa<ConstantFPSDNode>(Op0))
   12343     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
   12344 
   12345   // OR the magnitude value with the sign bit.
   12346   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
   12347 }
   12348 
   12349 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
   12350   SDValue N0 = Op.getOperand(0);
   12351   SDLoc dl(Op);
   12352   MVT VT = Op.getSimpleValueType();
   12353 
   12354   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
   12355   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
   12356                                   DAG.getConstant(1, VT));
   12357   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
   12358 }
   12359 
   12360 // Check whether an OR'd tree is PTEST-able.
   12361 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
   12362                                       SelectionDAG &DAG) {
   12363   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
   12364 
   12365   if (!Subtarget->hasSSE41())
   12366     return SDValue();
   12367 
   12368   if (!Op->hasOneUse())
   12369     return SDValue();
   12370 
   12371   SDNode *N = Op.getNode();
   12372   SDLoc DL(N);
   12373 
   12374   SmallVector<SDValue, 8> Opnds;
   12375   DenseMap<SDValue, unsigned> VecInMap;
   12376   SmallVector<SDValue, 8> VecIns;
   12377   EVT VT = MVT::Other;
   12378 
    12379   // Recognize a special case where a vector is cast into a wide integer to
    12380   // test for all zeros.
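            // For example, a comparison of
            //   (or (extractelt %v, 0), (extractelt %v, 1))
            // against zero, where %v is v2i64, covers every element of %v and can be
            // selected as a single (ptest %v, %v), with ZF set iff all bits of %v
            // are zero.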
   12381   Opnds.push_back(N->getOperand(0));
   12382   Opnds.push_back(N->getOperand(1));
   12383 
   12384   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
   12385     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
   12386     // BFS traverse all OR'd operands.
   12387     if (I->getOpcode() == ISD::OR) {
   12388       Opnds.push_back(I->getOperand(0));
   12389       Opnds.push_back(I->getOperand(1));
   12390       // Re-evaluate the number of nodes to be traversed.
   12391       e += 2; // 2 more nodes (LHS and RHS) are pushed.
   12392       continue;
   12393     }
   12394 
    12395     // Quit if this is not an EXTRACT_VECTOR_ELT.
   12396     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   12397       return SDValue();
   12398 
    12399     // Quit if the index is not a constant.
   12400     SDValue Idx = I->getOperand(1);
   12401     if (!isa<ConstantSDNode>(Idx))
   12402       return SDValue();
   12403 
   12404     SDValue ExtractedFromVec = I->getOperand(0);
   12405     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
   12406     if (M == VecInMap.end()) {
   12407       VT = ExtractedFromVec.getValueType();
   12408       // Quit if not 128/256-bit vector.
   12409       if (!VT.is128BitVector() && !VT.is256BitVector())
   12410         return SDValue();
   12411       // Quit if not the same type.
   12412       if (VecInMap.begin() != VecInMap.end() &&
   12413           VT != VecInMap.begin()->first.getValueType())
   12414         return SDValue();
   12415       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
   12416       VecIns.push_back(ExtractedFromVec);
   12417     }
   12418     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
   12419   }
   12420 
   12421   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   12422          "Not extracted from 128-/256-bit vector.");
   12423 
   12424   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
   12425 
   12426   for (DenseMap<SDValue, unsigned>::const_iterator
   12427         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
   12428     // Quit if not all elements are used.
   12429     if (I->second != FullMask)
   12430       return SDValue();
   12431   }
   12432 
   12433   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
   12434 
   12435   // Cast all vectors into TestVT for PTEST.
   12436   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
   12437     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
   12438 
    12439   // If more than one full vector is evaluated, OR them together before the PTEST.
   12440   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    12441     // Each iteration ORs two nodes and appends the result until only one node
    12442     // is left, i.e. the final OR'd value of all vectors.
   12443     SDValue LHS = VecIns[Slot];
   12444     SDValue RHS = VecIns[Slot + 1];
   12445     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
   12446   }
   12447 
   12448   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
   12449                      VecIns.back(), VecIns.back());
   12450 }
   12451 
    12452 /// \brief Return true if \c Op has a use that doesn't just read flags.
   12453 static bool hasNonFlagsUse(SDValue Op) {
   12454   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
   12455        ++UI) {
   12456     SDNode *User = *UI;
   12457     unsigned UOpNo = UI.getOperandNo();
   12458     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
   12459       // Look past the truncate.
   12460       UOpNo = User->use_begin().getOperandNo();
   12461       User = *User->use_begin();
   12462     }
   12463 
   12464     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
   12465         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
   12466       return true;
   12467   }
   12468   return false;
   12469 }
   12470 
   12471 /// Emit nodes that will be selected as "test Op0,Op0", or something
   12472 /// equivalent.
   12473 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
   12474                                     SelectionDAG &DAG) const {
   12475   if (Op.getValueType() == MVT::i1) {
   12476     SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
   12477     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
   12478                        DAG.getConstant(0, MVT::i8));
   12479   }
   12480   // CF and OF aren't always set the way we want. Determine which
   12481   // of these we need.
   12482   bool NeedCF = false;
   12483   bool NeedOF = false;
   12484   switch (X86CC) {
   12485   default: break;
   12486   case X86::COND_A: case X86::COND_AE:
   12487   case X86::COND_B: case X86::COND_BE:
   12488     NeedCF = true;
   12489     break;
   12490   case X86::COND_G: case X86::COND_GE:
   12491   case X86::COND_L: case X86::COND_LE:
   12492   case X86::COND_O: case X86::COND_NO: {
   12493     // Check whether we really need to set the Overflow flag. If the
   12494     // operation has the NoSignedWrap flag, the Overflow flag is not
   12495     // actually needed.
   12496     switch (Op->getOpcode()) {
   12497     case ISD::ADD:
   12498     case ISD::SUB:
   12499     case ISD::MUL:
   12500     case ISD::SHL: {
   12501       const BinaryWithFlagsSDNode *BinNode =
   12502           cast<BinaryWithFlagsSDNode>(Op.getNode());
   12503       if (BinNode->hasNoSignedWrap())
   12504         break;
   12505     }
   12506     default:
   12507       NeedOF = true;
   12508       break;
   12509     }
   12510     break;
   12511   }
   12512   }
   12513   // See if we can use the EFLAGS value from the operand instead of
   12514   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
   12515   // we prove that the arithmetic won't overflow, we can't use OF or CF.
   12516   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
   12517     // Emit a CMP with 0, which is the TEST pattern.
   12518     //if (Op.getValueType() == MVT::i1)
   12519     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
   12520     //                     DAG.getConstant(0, MVT::i1));
   12521     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   12522                        DAG.getConstant(0, Op.getValueType()));
   12523   }
   12524   unsigned Opcode = 0;
   12525   unsigned NumOperands = 0;
   12526 
   12527   // Truncate operations may prevent the merge of the SETCC instruction
   12528   // and the arithmetic instruction before it. Attempt to truncate the operands
   12529   // of the arithmetic instruction and use a reduced bit-width instruction.
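           // For example (sketch): in (seteq (trunc (and i64 %a, %b) to i32), 0),
           // the i64 AND can be rebuilt below as an i32 AND of truncated operands,
           // keeping the flag-producing op and the SETCC at the same narrow width.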
   12530   bool NeedTruncation = false;
   12531   SDValue ArithOp = Op;
   12532   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
   12533     SDValue Arith = Op->getOperand(0);
   12534     // Both the trunc and the arithmetic op need to have one user each.
   12535     if (Arith->hasOneUse())
   12536       switch (Arith.getOpcode()) {
   12537         default: break;
   12538         case ISD::ADD:
   12539         case ISD::SUB:
   12540         case ISD::AND:
   12541         case ISD::OR:
   12542         case ISD::XOR: {
   12543           NeedTruncation = true;
   12544           ArithOp = Arith;
   12545         }
   12546       }
   12547   }
   12548 
   12549   // NOTE: In the code below, ArithOp holds the arithmetic operation, which
   12550   // may sit behind a CAST (truncate). The variable 'Op' is the non-cast
   12551   // value and is the one used when checking for possible users.
   12552   switch (ArithOp.getOpcode()) {
   12553   case ISD::ADD:
   12554     // Due to an isel shortcoming, be conservative if this add is likely to be
   12555     // selected as part of a load-modify-store instruction. When the root node
   12556     // in a match is a store, isel doesn't know how to remap non-chain non-flag
   12557     // uses of other nodes in the match, such as the ADD in this case. This
   12558     // leads to the ADD being left around and reselected, with the result being
   12559     // two adds in the output.  Alas, even if none our users are stores, that
   12560     // two adds in the output.  Alas, even if none of our users are stores, that
   12561     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
   12562     // climbing the DAG back to the root, and it doesn't seem to be worth the
   12563     // effort.
   12564     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   12565          UE = Op.getNode()->use_end(); UI != UE; ++UI)
   12566       if (UI->getOpcode() != ISD::CopyToReg &&
   12567           UI->getOpcode() != ISD::SETCC &&
   12568           UI->getOpcode() != ISD::STORE)
   12569         goto default_case;
   12570 
   12571     if (ConstantSDNode *C =
   12572         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
   12573       // An add of one will be selected as an INC.
   12574       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
   12575         Opcode = X86ISD::INC;
   12576         NumOperands = 1;
   12577         break;
   12578       }
   12579 
   12580       // An add of negative one (subtract of one) will be selected as a DEC.
   12581       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
   12582         Opcode = X86ISD::DEC;
   12583         NumOperands = 1;
   12584         break;
   12585       }
   12586     }
   12587 
   12588     // Otherwise use a regular EFLAGS-setting add.
   12589     Opcode = X86ISD::ADD;
   12590     NumOperands = 2;
   12591     break;
   12592   case ISD::SHL:
   12593   case ISD::SRL:
   12594     // If we have a constant logical shift that's only used in a comparison
   12595     // against zero, turn it into an equivalent AND. This allows turning it
   12596     // into a TEST instruction later.
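             // For example (sketch): (seteq (srl X, 3), 0) on i32 becomes
             //   (seteq (and X, 0xFFFFFFF8), 0)
             // and the compare-with-zero emitted below then selects as a TEST.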
   12597     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
   12598         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
   12599       EVT VT = Op.getValueType();
   12600       unsigned BitWidth = VT.getSizeInBits();
   12601       unsigned ShAmt = Op->getConstantOperandVal(1);
   12602       if (ShAmt >= BitWidth) // Avoid undefined shifts.
   12603         break;
   12604       APInt Mask = ArithOp.getOpcode() == ISD::SRL
   12605                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
   12606                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
   12607       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
   12608         break;
   12609       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
   12610                                 DAG.getConstant(Mask, VT));
   12611       DAG.ReplaceAllUsesWith(Op, New);
   12612       Op = New;
   12613     }
   12614     break;
   12615 
   12616   case ISD::AND:
   12617     // If the primary and result isn't used, don't bother using X86ISD::AND,
   12618     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
   12619     if (!hasNonFlagsUse(Op))
   12620       break;
   12621     // FALL THROUGH
   12622   case ISD::SUB:
   12623   case ISD::OR:
   12624   case ISD::XOR:
   12625     // Due to the ISEL shortcoming noted above, be conservative if this op is
   12626     // likely to be selected as part of a load-modify-store instruction.
   12627     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   12628            UE = Op.getNode()->use_end(); UI != UE; ++UI)
   12629       if (UI->getOpcode() == ISD::STORE)
   12630         goto default_case;
   12631 
   12632     // Otherwise use a regular EFLAGS-setting instruction.
   12633     switch (ArithOp.getOpcode()) {
   12634     default: llvm_unreachable("unexpected operator!");
   12635     case ISD::SUB: Opcode = X86ISD::SUB; break;
   12636     case ISD::XOR: Opcode = X86ISD::XOR; break;
   12637     case ISD::AND: Opcode = X86ISD::AND; break;
   12638     case ISD::OR: {
   12639       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
   12640         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
   12641         if (EFLAGS.getNode())
   12642           return EFLAGS;
   12643       }
   12644       Opcode = X86ISD::OR;
   12645       break;
   12646     }
   12647     }
   12648 
   12649     NumOperands = 2;
   12650     break;
   12651   case X86ISD::ADD:
   12652   case X86ISD::SUB:
   12653   case X86ISD::INC:
   12654   case X86ISD::DEC:
   12655   case X86ISD::OR:
   12656   case X86ISD::XOR:
   12657   case X86ISD::AND:
   12658     return SDValue(Op.getNode(), 1);
   12659   default:
   12660   default_case:
   12661     break;
   12662   }
   12663 
   12664   // If we found that truncation is beneficial, perform the truncation and
   12665   // update 'Op'.
   12666   if (NeedTruncation) {
   12667     EVT VT = Op.getValueType();
   12668     SDValue WideVal = Op->getOperand(0);
   12669     EVT WideVT = WideVal.getValueType();
   12670     unsigned ConvertedOp = 0;
   12671     // Use a target machine opcode to prevent further DAGCombine
   12672     // optimizations that may separate the arithmetic operations
   12673     // from the setcc node.
   12674     switch (WideVal.getOpcode()) {
   12675       default: break;
   12676       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
   12677       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
   12678       case ISD::AND: ConvertedOp = X86ISD::AND; break;
   12679       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
   12680       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
   12681     }
   12682 
   12683     if (ConvertedOp) {
   12684       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   12685       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
   12686         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
   12687         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
   12688         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
   12689       }
   12690     }
   12691   }
   12692 
   12693   if (Opcode == 0)
   12694     // Emit a CMP with 0, which is the TEST pattern.
   12695     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   12696                        DAG.getConstant(0, Op.getValueType()));
   12697 
   12698   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   12699   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
   12700 
   12701   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
   12702   DAG.ReplaceAllUsesWith(Op, New);
   12703   return SDValue(New.getNode(), 1);
   12704 }
   12705 
   12706 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
   12707 /// equivalent.
   12708 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   12709                                    SDLoc dl, SelectionDAG &DAG) const {
   12710   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
   12711     if (C->getAPIntValue() == 0)
   12712       return EmitTest(Op0, X86CC, dl, DAG);
   12713 
   12714     if (Op0.getValueType() == MVT::i1)
   12715       llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
   12716   }
   12717 
   12718   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
   12719        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
   12720     // Do the comparison at i32 if it's smaller, except on Atom.
   12721     // This avoids subregister aliasing issues. Keep the smaller reference
   12722     // if we're optimizing for size, however, as that'll allow better folding
   12723     // of memory operations.
   12724     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
   12725         !DAG.getMachineFunction().getFunction()->hasFnAttribute(
   12726             Attribute::MinSize) &&
   12727         !Subtarget->isAtom()) {
   12728       unsigned ExtendOp =
   12729           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
   12730       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
   12731       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
   12732     }
   12733     // Use SUB instead of CMP to enable CSE between SUB and CMP.
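             // (A CMP is a SUB whose result is discarded; both set EFLAGS the same
             // way, so a later 'sub Op0, Op1' can be CSE'd with this node.)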
   12734     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
   12735     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
   12736                               Op0, Op1);
   12737     return SDValue(Sub.getNode(), 1);
   12738   }
   12739   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
   12740 }
   12741 
   12742 /// Convert a comparison if required by the subtarget.
   12743 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
   12744                                                  SelectionDAG &DAG) const {
   12745   // If the subtarget does not support the FUCOMI instruction, floating-point
   12746   // comparisons have to be converted.
   12747   if (Subtarget->hasCMov() ||
   12748       Cmp.getOpcode() != X86ISD::CMP ||
   12749       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
   12750       !Cmp.getOperand(1).getValueType().isFloatingPoint())
   12751     return Cmp;
   12752 
   12753   // The instruction selector will select an FUCOM instruction instead of
   12754   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
   12755   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
   12756   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
   12757   SDLoc dl(Cmp);
   12758   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
   12759   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
   12760   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
   12761                             DAG.getConstant(8, MVT::i8));
   12762   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
   12763   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
   12764 }
   12765 
   12766 /// The minimum architected relative accuracy is 2^-12. We need one
   12767 /// Newton-Raphson step to have a good float result (24 bits of precision).
   12768 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
   12769                                             DAGCombinerInfo &DCI,
   12770                                             unsigned &RefinementSteps,
   12771                                             bool &UseOneConstNR) const {
   12772   // FIXME: We should use instruction latency models to calculate the cost of
   12773   // each potential sequence, but this is very hard to do reliably because
   12774   // at least Intel's Core* chips have variable timing based on the number of
   12775   // significant digits in the divisor and/or sqrt operand.
   12776   if (!Subtarget->useSqrtEst())
   12777     return SDValue();
   12778 
   12779   EVT VT = Op.getValueType();
   12780 
   12781   // SSE1 has rsqrtss and rsqrtps.
   12782   // TODO: Add support for AVX512 (v16f32).
   12783   // It is likely not profitable to do this for f64 because a double-precision
   12784   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   12785   // instructions: convert to single, rsqrtss, convert back to double, refine
   12786   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   12787   // along with FMA, this could be a throughput win.
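           // One Newton-Raphson step refines an estimate x0 of 1/sqrt(a) as
           //   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
           // roughly doubling the number of correct bits (2^-12 -> about 2^-24).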
   12788   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
   12789       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
   12790     RefinementSteps = 1;
   12791     UseOneConstNR = false;
   12792     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
   12793   }
   12794   return SDValue();
   12795 }
   12796 
   12797 /// The minimum architected relative accuracy is 2^-12. We need one
   12798 /// Newton-Raphson step to have a good float result (24 bits of precision).
   12799 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
   12800                                             DAGCombinerInfo &DCI,
   12801                                             unsigned &RefinementSteps) const {
   12802   // FIXME: We should use instruction latency models to calculate the cost of
   12803   // each potential sequence, but this is very hard to do reliably because
   12804   // at least Intel's Core* chips have variable timing based on the number of
   12805   // significant digits in the divisor.
   12806   if (!Subtarget->useReciprocalEst())
   12807     return SDValue();
   12808 
   12809   EVT VT = Op.getValueType();
   12810 
   12811   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   12812   // TODO: Add support for AVX512 (v16f32).
   12813   // It is likely not profitable to do this for f64 because a double-precision
   12814   // reciprocal estimate with refinement on x86 prior to FMA requires
   12815   // 15 instructions: convert to single, rcpss, convert back to double, refine
   12816   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   12817   // along with FMA, this could be a throughput win.
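           // One Newton-Raphson step refines an estimate x0 of 1/a as
           //   x1 = x0 * (2.0 - a * x0)
           // roughly doubling the number of correct bits per step.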
   12818   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
   12819       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
   12820     RefinementSteps = ReciprocalEstimateRefinementSteps;
   12821     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
   12822   }
   12823   return SDValue();
   12824 }
   12825 
   12826 /// If we have at least two divisions that use the same divisor, convert to
   12827 /// multiplication by a reciprocal. This may need to be adjusted for a given
   12828 /// CPU if a division's cost is not at least twice the cost of a multiplication.
   12829 /// This is because we still need one division to calculate the reciprocal and
   12830 /// then we need two multiplies by that reciprocal as replacements for the
   12831 /// original divisions.
   12832 bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
   12833   return NumUsers > 1;
   12834 }
   12835 
   12836 static bool isAllOnes(SDValue V) {
   12837   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   12838   return C && C->isAllOnesValue();
   12839 }
   12840 
   12841 /// LowerToBT - The result of an 'and' is compared against zero. Turn it into
   12842 /// a BT node if possible.
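         /// For example (sketch): (seteq (and X, (shl 1, N)), 0) becomes
         /// (X86ISD::BT X, N), with the answer taken from the carry flag
         /// (COND_AE for SETEQ, COND_B for SETNE).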
   12843 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
   12844                                      SDLoc dl, SelectionDAG &DAG) const {
   12845   SDValue Op0 = And.getOperand(0);
   12846   SDValue Op1 = And.getOperand(1);
   12847   if (Op0.getOpcode() == ISD::TRUNCATE)
   12848     Op0 = Op0.getOperand(0);
   12849   if (Op1.getOpcode() == ISD::TRUNCATE)
   12850     Op1 = Op1.getOperand(0);
   12851 
   12852   SDValue LHS, RHS;
   12853   if (Op1.getOpcode() == ISD::SHL)
   12854     std::swap(Op0, Op1);
   12855   if (Op0.getOpcode() == ISD::SHL) {
   12856     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
   12857       if (And00C->getZExtValue() == 1) {
   12858         // If we looked past a truncate, check that it's only truncating away
   12859         // known zeros.
   12860         unsigned BitWidth = Op0.getValueSizeInBits();
   12861         unsigned AndBitWidth = And.getValueSizeInBits();
   12862         if (BitWidth > AndBitWidth) {
   12863           APInt Zeros, Ones;
   12864           DAG.computeKnownBits(Op0, Zeros, Ones);
   12865           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
   12866             return SDValue();
   12867         }
   12868         LHS = Op1;
   12869         RHS = Op0.getOperand(1);
   12870       }
   12871   } else if (Op1.getOpcode() == ISD::Constant) {
   12872     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
   12873     uint64_t AndRHSVal = AndRHS->getZExtValue();
   12874     SDValue AndLHS = Op0;
   12875 
   12876     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
   12877       LHS = AndLHS.getOperand(0);
   12878       RHS = AndLHS.getOperand(1);
   12879     }
   12880 
   12881     // Use BT if the immediate can't be encoded in a TEST instruction.
   12882     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
   12883       LHS = AndLHS;
   12884       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
   12885     }
   12886   }
   12887 
   12888   if (LHS.getNode()) {
   12889     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
   12890     // instruction.  Since the shift amount is in-range-or-undefined, we know
   12891     // that doing a bittest on the i32 value is ok.  We extend to i32 because
   12892     // the encoding for the i16 version is larger than the i32 version.
   12893     // Also promote i16 to i32 for performance / code size reason.
   12894     if (LHS.getValueType() == MVT::i8 ||
   12895         LHS.getValueType() == MVT::i16)
   12896       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
   12897 
   12898     // If the operand types disagree, extend the shift amount to match.  Since
   12899     // BT ignores high bits (like shifts) we can use anyextend.
   12900     if (LHS.getValueType() != RHS.getValueType())
   12901       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
   12902 
   12903     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
   12904     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
   12905     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   12906                        DAG.getConstant(Cond, MVT::i8), BT);
   12907   }
   12908 
   12909   return SDValue();
   12910 }
   12911 
   12912 /// \brief Turn an ISD::CondCode into a value suitable for SSE floating-point
   12913 /// mask CMPs.
   12914 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
   12915                               SDValue &Op1) {
   12916   unsigned SSECC;
   12917   bool Swap = false;
   12918 
   12919   // SSE Condition code mapping:
   12920   //  0 - EQ
   12921   //  1 - LT
   12922   //  2 - LE
   12923   //  3 - UNORD
   12924   //  4 - NEQ
   12925   //  5 - NLT
   12926   //  6 - NLE
   12927   //  7 - ORD
   12928   switch (SetCCOpcode) {
   12929   default: llvm_unreachable("Unexpected SETCC condition");
   12930   case ISD::SETOEQ:
   12931   case ISD::SETEQ:  SSECC = 0; break;
   12932   case ISD::SETOGT:
   12933   case ISD::SETGT:  Swap = true; // Fallthrough
   12934   case ISD::SETLT:
   12935   case ISD::SETOLT: SSECC = 1; break;
   12936   case ISD::SETOGE:
   12937   case ISD::SETGE:  Swap = true; // Fallthrough
   12938   case ISD::SETLE:
   12939   case ISD::SETOLE: SSECC = 2; break;
   12940   case ISD::SETUO:  SSECC = 3; break;
   12941   case ISD::SETUNE:
   12942   case ISD::SETNE:  SSECC = 4; break;
   12943   case ISD::SETULE: Swap = true; // Fallthrough
   12944   case ISD::SETUGE: SSECC = 5; break;
   12945   case ISD::SETULT: Swap = true; // Fallthrough
   12946   case ISD::SETUGT: SSECC = 6; break;
   12947   case ISD::SETO:   SSECC = 7; break;
   12948   case ISD::SETUEQ:
   12949   case ISD::SETONE: SSECC = 8; break;
   12950   }
   12951   if (Swap)
   12952     std::swap(Op0, Op1);
   12953 
   12954   return SSECC;
   12955 }
   12956 
   12957 // Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
   12958 // ones, and then concatenate the results back together.
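         // For example (sketch), a v8i32 compare is performed as two v4i32 compares
         // on the low and high halves and rejoined with CONCAT_VECTORS.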
   12959 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
   12960   MVT VT = Op.getSimpleValueType();
   12961 
   12962   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
   12963          "Unsupported value type for operation");
   12964 
   12965   unsigned NumElems = VT.getVectorNumElements();
   12966   SDLoc dl(Op);
   12967   SDValue CC = Op.getOperand(2);
   12968 
   12969   // Extract the LHS vectors
   12970   SDValue LHS = Op.getOperand(0);
   12971   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
   12972   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
   12973 
   12974   // Extract the RHS vectors
   12975   SDValue RHS = Op.getOperand(1);
   12976   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
   12977   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
   12978 
   12979   // Issue the operation on the smaller types and concatenate the result back
   12980   MVT EltVT = VT.getVectorElementType();
   12981   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   12982   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   12983                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
   12984                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
   12985 }
   12986 
   12987 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
   12988                                      const X86Subtarget *Subtarget) {
   12989   SDValue Op0 = Op.getOperand(0);
   12990   SDValue Op1 = Op.getOperand(1);
   12991   SDValue CC = Op.getOperand(2);
   12992   MVT VT = Op.getSimpleValueType();
   12993   SDLoc dl(Op);
   12994 
   12995   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
   12996          Op.getValueType().getScalarType() == MVT::i1 &&
   12997          "Cannot set masked compare for this operation");
   12998 
   12999   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   13000   unsigned Opc = 0;
   13001   bool Unsigned = false;
   13002   bool Swap = false;
   13003   unsigned SSECC;
   13004   switch (SetCCOpcode) {
   13005   default: llvm_unreachable("Unexpected SETCC condition");
   13006   case ISD::SETNE:  SSECC = 4; break;
   13007   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
   13008   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
   13009   case ISD::SETLT:  Swap = true; //fall-through
   13010   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
   13011   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
   13012   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
   13013   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
   13014   case ISD::SETULE: Unsigned = true; //fall-through
   13015   case ISD::SETLE:  SSECC = 2; break;
   13016   }
   13017 
   13018   if (Swap)
   13019     std::swap(Op0, Op1);
   13020   if (Opc)
   13021     return DAG.getNode(Opc, dl, VT, Op0, Op1);
   13022   Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
   13023   return DAG.getNode(Opc, dl, VT, Op0, Op1,
   13024                      DAG.getConstant(SSECC, MVT::i8));
   13025 }
   13026 
   13027 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
   13028 /// operand \p Op1.  If non-trivial (for example because it's not constant)
   13029 /// return an empty value.
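         /// For example: (setult X, <4,4,4,4>) can become (setule X, <3,3,3,3>);
         /// the rewrite is skipped if any constant element is zero (underflow).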
   13030 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
   13031 {
   13032   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
   13033   if (!BV)
   13034     return SDValue();
   13035 
   13036   MVT VT = Op1.getSimpleValueType();
   13037   MVT EVT = VT.getVectorElementType();
   13038   unsigned n = VT.getVectorNumElements();
   13039   SmallVector<SDValue, 8> ULTOp1;
   13040 
   13041   for (unsigned i = 0; i < n; ++i) {
   13042     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
   13043     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
   13044       return SDValue();
   13045 
   13046     // Avoid underflow.
   13047     APInt Val = Elt->getAPIntValue();
   13048     if (Val == 0)
   13049       return SDValue();
   13050 
   13051     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
   13052   }
   13053 
   13054   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
   13055 }
   13056 
   13057 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   13058                            SelectionDAG &DAG) {
   13059   SDValue Op0 = Op.getOperand(0);
   13060   SDValue Op1 = Op.getOperand(1);
   13061   SDValue CC = Op.getOperand(2);
   13062   MVT VT = Op.getSimpleValueType();
   13063   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   13064   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
   13065   SDLoc dl(Op);
   13066 
   13067   if (isFP) {
   13068 #ifndef NDEBUG
   13069     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
   13070     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
   13071 #endif
   13072 
   13073     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
   13074     unsigned Opc = X86ISD::CMPP;
   13075     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
   13076       assert(VT.getVectorNumElements() <= 16);
   13077       Opc = X86ISD::CMPM;
   13078     }
   13079     // In the two special cases we can't handle, emit two comparisons.
   13080     if (SSECC == 8) {
   13081       unsigned CC0, CC1;
   13082       unsigned CombineOpc;
   13083       if (SetCCOpcode == ISD::SETUEQ) {
   13084         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
   13085       } else {
   13086         assert(SetCCOpcode == ISD::SETONE);
   13087         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
   13088       }
   13089 
   13090       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   13091                                  DAG.getConstant(CC0, MVT::i8));
   13092       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   13093                                  DAG.getConstant(CC1, MVT::i8));
   13094       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
   13095     }
   13096     // Handle all other FP comparisons here.
   13097     return DAG.getNode(Opc, dl, VT, Op0, Op1,
   13098                        DAG.getConstant(SSECC, MVT::i8));
   13099   }
   13100 
   13101   // Break 256-bit integer vector compare into smaller ones.
   13102   if (VT.is256BitVector() && !Subtarget->hasInt256())
   13103     return Lower256IntVSETCC(Op, DAG);
   13104 
   13105   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
   13106   EVT OpVT = Op1.getValueType();
   13107   if (Subtarget->hasAVX512()) {
   13108     if (Op1.getValueType().is512BitVector() ||
   13109         (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
   13110         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
   13111       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
   13112 
   13113     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
   13114     // but there is no compare instruction for i8 and i16 elements on KNL.
   13115     // 512-bit operands are not a concern in this case, because those
   13116     // types are illegal.
   13117     if (MaskResult &&
   13118         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
   13119          OpVT.getVectorElementType().getSizeInBits() >= 8))
   13120       return DAG.getNode(ISD::TRUNCATE, dl, VT,
   13121                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
   13122   }
   13123 
   13124   // We are handling one of the integer comparisons here.  Since SSE only has
   13125   // GT and EQ comparisons for integers, swapping operands and multiple
   13126   // operations may be required for some comparisons.
   13127   unsigned Opc;
   13128   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
   13129   bool Subus = false;
   13130 
   13131   switch (SetCCOpcode) {
   13132   default: llvm_unreachable("Unexpected SETCC condition");
   13133   case ISD::SETNE:  Invert = true; // Fallthrough
   13134   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
   13135   case ISD::SETLT:  Swap = true; // Fallthrough
   13136   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
   13137   case ISD::SETGE:  Swap = true; // Fallthrough
   13138   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
   13139                     Invert = true; break;
   13140   case ISD::SETULT: Swap = true; // Fallthrough
   13141   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
   13142                     FlipSigns = true; break;
   13143   case ISD::SETUGE: Swap = true; // Fallthrough
   13144   case ISD::SETULE: Opc = X86ISD::PCMPGT;
   13145                     FlipSigns = true; Invert = true; break;
   13146   }
   13147 
   13148   // Special case: Use min/max operations for SETULE/SETUGE
   13149   MVT VET = VT.getVectorElementType();
   13150   bool hasMinMax =
   13151        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
   13152     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
   13153 
   13154   if (hasMinMax) {
   13155     switch (SetCCOpcode) {
   13156     default: break;
   13157     case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
   13158     case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
   13159     }
   13160 
   13161     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
   13162   }
   13163 
   13164   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
   13165   if (!MinMax && hasSubus) {
   13166     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
   13167     // Op0 u<= Op1:
   13168     //   t = psubus Op0, Op1
   13169     //   pcmpeq t, <0..0>
   13170     switch (SetCCOpcode) {
   13171     default: break;
   13172     case ISD::SETULT: {
   13173       // If the comparison is against a constant, we can turn this into a
   13174       // setule.  With psubus, setule does not require a swap.  This is
   13175       // beneficial because the constant in the register is no longer
   13176       // clobbered as the destination, so it can be hoisted out of a loop.
   13177       // Only do this pre-AVX, since with AVX vpcmp* is non-destructive.
   13178       if (Subtarget->hasAVX())
   13179         break;
   13180       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
   13181       if (ULEOp1.getNode()) {
   13182         Op1 = ULEOp1;
   13183         Subus = true; Invert = false; Swap = false;
   13184       }
   13185       break;
   13186     }
   13187     // Psubus is better than flip-sign because it requires no inversion.
   13188     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
   13189     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
   13190     }
   13191 
   13192     if (Subus) {
   13193       Opc = X86ISD::SUBUS;
   13194       FlipSigns = false;
   13195     }
   13196   }
   13197 
   13198   if (Swap)
   13199     std::swap(Op0, Op1);
   13200 
   13201   // Check that the operation in question is available (most are plain SSE2,
   13202   // but PCMPGTQ and PCMPEQQ have different requirements).
   13203   if (VT == MVT::v2i64) {
   13204     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
   13205       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
   13206 
   13207       // First cast everything to the right type.
   13208       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
   13209       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
   13210 
   13211       // Since SSE has no unsigned integer comparisons, we need to flip the sign
   13212       // bits of the inputs before performing those operations. The lower
   13213       // compare is always unsigned.
   13214       SDValue SB;
   13215       if (FlipSigns) {
   13216         SB = DAG.getConstant(0x80000000U, MVT::v4i32);
   13217       } else {
   13218         SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
   13219         SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
   13220         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
   13221                          Sign, Zero, Sign, Zero);
   13222       }
   13223       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
   13224       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
   13225 
   13226       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
   13227       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
   13228       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
   13229 
   13230       // Create masks for only the low parts/high parts of the 64-bit integers.
   13231       static const int MaskHi[] = { 1, 1, 3, 3 };
   13232       static const int MaskLo[] = { 0, 0, 2, 2 };
   13233       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
   13234       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
   13235       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
   13236 
   13237       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
   13238       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
   13239 
   13240       if (Invert)
   13241         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   13242 
   13243       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
   13244     }
   13245 
   13246     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
   13247       // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
   13248       // pcmpeqd + pshufd + pand.
   13249       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
   13250 
   13251       // First cast everything to the right type.
   13252       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
   13253       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
   13254 
   13255       // Do the compare.
   13256       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
   13257 
   13258       // Make sure the lower and upper halves are both all-ones.
   13259       static const int Mask[] = { 1, 0, 3, 2 };
   13260       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
   13261       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
   13262 
   13263       if (Invert)
   13264         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   13265 
   13266       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
   13267     }
   13268   }
   13269 
   13270   // Since SSE has no unsigned integer comparisons, we need to flip the sign
   13271   // bits of the inputs before performing those operations.
   13272   if (FlipSigns) {
   13273     EVT EltVT = VT.getVectorElementType();
   13274     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
   13275     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
   13276     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
   13277   }
   13278 
   13279   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
   13280 
   13281   // If the logical-not of the result is required, perform that now.
   13282   if (Invert)
   13283     Result = DAG.getNOT(dl, Result, VT);
   13284 
   13285   if (MinMax)
   13286     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
   13287 
   13288   if (Subus)
   13289     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
   13290                          getZeroVector(VT, Subtarget, DAG, dl));
   13291 
   13292   return Result;
   13293 }
   13294 
   13295 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   13296 
   13297   MVT VT = Op.getSimpleValueType();
   13298 
   13299   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
   13300 
   13301   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
   13302          && "SetCC type must be 8-bit or 1-bit integer");
   13303   SDValue Op0 = Op.getOperand(0);
   13304   SDValue Op1 = Op.getOperand(1);
   13305   SDLoc dl(Op);
   13306   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   13307 
   13308   // Optimize to BT if possible.
   13309   // Lower (X & (1 << N)) == 0 to BT(X, N).
   13310   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
   13311   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
   13312   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
   13313       Op1.getOpcode() == ISD::Constant &&
   13314       cast<ConstantSDNode>(Op1)->isNullValue() &&
   13315       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   13316     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
   13317     if (NewSetCC.getNode()) {
   13318       if (VT == MVT::i1)
   13319         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
   13320       return NewSetCC;
   13321     }
   13322   }
   13323 
   13324   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
   13325   // these.
   13326   if (Op1.getOpcode() == ISD::Constant &&
   13327       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
   13328        cast<ConstantSDNode>(Op1)->isNullValue()) &&
   13329       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   13330 
   13331     // If the input is a setcc, then reuse the input setcc or use a new one with
   13332     // the inverted condition.
   13333     if (Op0.getOpcode() == X86ISD::SETCC) {
   13334       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
   13335       bool Invert = (CC == ISD::SETNE) ^
   13336         cast<ConstantSDNode>(Op1)->isNullValue();
   13337       if (!Invert)
   13338         return Op0;
   13339 
   13340       CCode = X86::GetOppositeBranchCondition(CCode);
   13341       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   13342                                   DAG.getConstant(CCode, MVT::i8),
   13343                                   Op0.getOperand(1));
   13344       if (VT == MVT::i1)
   13345         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
   13346       return SetCC;
   13347     }
   13348   }
   13349   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
   13350       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
   13351       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   13352 
   13353     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
   13354     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
   13355   }
   13356 
   13357   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
   13358   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
   13359   if (X86CC == X86::COND_INVALID)
   13360     return SDValue();
   13361 
   13362   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
   13363   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
   13364   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   13365                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
   13366   if (VT == MVT::i1)
   13367     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
   13368   return SetCC;
   13369 }
   13370 
   13371 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
   13372 static bool isX86LogicalCmp(SDValue Op) {
   13373   unsigned Opc = Op.getNode()->getOpcode();
   13374   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
   13375       Opc == X86ISD::SAHF)
   13376     return true;
   13377   if (Op.getResNo() == 1 &&
   13378       (Opc == X86ISD::ADD ||
   13379        Opc == X86ISD::SUB ||
   13380        Opc == X86ISD::ADC ||
   13381        Opc == X86ISD::SBB ||
   13382        Opc == X86ISD::SMUL ||
   13383        Opc == X86ISD::UMUL ||
   13384        Opc == X86ISD::INC ||
   13385        Opc == X86ISD::DEC ||
   13386        Opc == X86ISD::OR ||
   13387        Opc == X86ISD::XOR ||
   13388        Opc == X86ISD::AND))
   13389     return true;
   13390 
   13391   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
   13392     return true;
   13393 
   13394   return false;
   13395 }
   13396 
   13397 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   13398   if (V.getOpcode() != ISD::TRUNCATE)
   13399     return false;
   13400 
   13401   SDValue VOp0 = V.getOperand(0);
   13402   unsigned InBits = VOp0.getValueSizeInBits();
   13403   unsigned Bits = V.getValueSizeInBits();
   13404   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
   13405 }
   13406 
   13407 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   13408   bool addTest = true;
   13409   SDValue Cond  = Op.getOperand(0);
   13410   SDValue Op1 = Op.getOperand(1);
   13411   SDValue Op2 = Op.getOperand(2);
   13412   SDLoc DL(Op);
   13413   EVT VT = Op1.getValueType();
   13414   SDValue CC;
   13415 
   13416   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
   13417   // are available, or into a VBLENDV if AVX is available.
   13418   // Otherwise, FP cmovs get lowered into a less efficient branch sequence later.
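           // For example (sketch), with SSE but no AVX:
           //   (select (setolt a, b), x, y)
           // becomes m = (X86ISD::FSETCC a, b, 1 /*LT*/) and then
           //   ((m & x) | (~m & y))
           // via the FAND/FANDN/FOR sequence at the end of this block.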
   13419   if (Cond.getOpcode() == ISD::SETCC &&
   13420       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
   13421        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
   13422       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
   13423     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
   13424     int SSECC = translateX86FSETCC(
   13425         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
   13426 
   13427     if (SSECC != 8) {
   13428       if (Subtarget->hasAVX512()) {
   13429         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
   13430                                   DAG.getConstant(SSECC, MVT::i8));
   13431         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
   13432       }
   13433 
   13434       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
   13435                                 DAG.getConstant(SSECC, MVT::i8));
   13436 
   13437       // If we have AVX, we can use a variable vector select (VBLENDV) instead
   13438       // of 3 logic instructions for size savings and potentially speed.
   13439       // Unfortunately, there is no scalar form of VBLENDV.
   13440 
   13441       // If either operand is a constant, don't try this. We can expect to
   13442       // optimize away at least one of the logic instructions later in that
   13443       // case, so that sequence would be faster than a variable blend.
   13444 
   13445       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
   13446       // uses XMM0 as the selection register. That may need just as many
   13447       // instructions as the AND/ANDN/OR sequence due to register moves, so
   13448       // don't bother.
   13449 
   13450       if (Subtarget->hasAVX() &&
   13451           !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
   13452 
   13453         // Convert to vectors, do a VSELECT, and convert back to scalar.
   13454         // All of the conversions should be optimized away.
   13455 
   13456         EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
   13457         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
   13458         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
   13459         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
   13460 
   13461         EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
   13462         VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
   13463 
   13464         SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
   13465 
   13466         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
   13467                            VSel, DAG.getIntPtrConstant(0));
   13468       }
   13469       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
   13470       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
   13471       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
   13472     }
   13473   }
   13474 
   13475   if (Cond.getOpcode() == ISD::SETCC) {
   13476     SDValue NewCond = LowerSETCC(Cond, DAG);
   13477     if (NewCond.getNode())
   13478       Cond = NewCond;
   13479   }
   13480 
   13481   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
   13482   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
   13483   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
   13484   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
   13485   if (Cond.getOpcode() == X86ISD::SETCC &&
   13486       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
   13487       isZero(Cond.getOperand(1).getOperand(1))) {
   13488     SDValue Cmp = Cond.getOperand(1);
   13489 
   13490     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
   13491 
   13492     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
   13493         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
   13494       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
   13495 
   13496       SDValue CmpOp0 = Cmp.getOperand(0);
   13497       // Apply further optimizations for special cases
   13498       // (select (x != 0), -1, 0) -> neg & sbb
   13499       // (select (x == 0), 0, -1) -> neg & sbb
   13500       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
   13501         if (YC->isNullValue() &&
   13502             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
   13503           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
   13504           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
   13505                                     DAG.getConstant(0, CmpOp0.getValueType()),
   13506                                     CmpOp0);
   13507           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   13508                                     DAG.getConstant(X86::COND_B, MVT::i8),
   13509                                     SDValue(Neg.getNode(), 1));
   13510           return Res;
   13511         }
   13512 
   13513       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
   13514                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
   13515       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   13516 
   13517       SDValue Res =   // Res = 0 or -1.
   13518         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   13519                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
   13520 
   13521       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
   13522         Res = DAG.getNOT(DL, Res, Res.getValueType());
   13523 
   13524       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
   13525       if (!N2C || !N2C->isNullValue())
   13526         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
   13527       return Res;
   13528     }
   13529   }
   13530 
   13531   // Look past (and (setcc_carry (cmp ...)), 1).
   13532   if (Cond.getOpcode() == ISD::AND &&
   13533       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   13534     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
   13535     if (C && C->getAPIntValue() == 1)
   13536       Cond = Cond.getOperand(0);
   13537   }
   13538 
   13539   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   13540   // setting operand in place of the X86ISD::SETCC.
   13541   unsigned CondOpcode = Cond.getOpcode();
   13542   if (CondOpcode == X86ISD::SETCC ||
   13543       CondOpcode == X86ISD::SETCC_CARRY) {
   13544     CC = Cond.getOperand(0);
   13545 
   13546     SDValue Cmp = Cond.getOperand(1);
   13547     unsigned Opc = Cmp.getOpcode();
   13548     MVT VT = Op.getSimpleValueType();
   13549 
   13550     bool IllegalFPCMov = false;
   13551     if (VT.isFloatingPoint() && !VT.isVector() &&
   13552         !isScalarFPTypeInSSEReg(VT))  // FPStack?
   13553       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
   13554 
   13555     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
   13556         Opc == X86ISD::BT) { // FIXME
   13557       Cond = Cmp;
   13558       addTest = false;
   13559     }
   13560   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   13561              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   13562              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   13563               Cond.getOperand(0).getValueType() != MVT::i8)) {
   13564     SDValue LHS = Cond.getOperand(0);
   13565     SDValue RHS = Cond.getOperand(1);
   13566     unsigned X86Opcode;
   13567     unsigned X86Cond;
   13568     SDVTList VTs;
   13569     switch (CondOpcode) {
   13570     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   13571     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   13572     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   13573     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   13574     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   13575     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   13576     default: llvm_unreachable("unexpected overflowing operator");
   13577     }
   13578     if (CondOpcode == ISD::UMULO)
   13579       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   13580                           MVT::i32);
   13581     else
   13582       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   13583 
   13584     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
   13585 
   13586     if (CondOpcode == ISD::UMULO)
   13587       Cond = X86Op.getValue(2);
   13588     else
   13589       Cond = X86Op.getValue(1);
   13590 
   13591     CC = DAG.getConstant(X86Cond, MVT::i8);
   13592     addTest = false;
   13593   }
   13594 
   13595   if (addTest) {
   13596     // Look past the truncate if the high bits are known zero.
   13597     if (isTruncWithZeroHighBitsInput(Cond, DAG))
   13598         Cond = Cond.getOperand(0);
   13599 
   13600     // We know the result of AND is compared against zero. Try to match
   13601     // it to BT.
   13602     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   13603       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
   13604       if (NewSetCC.getNode()) {
   13605         CC = NewSetCC.getOperand(0);
   13606         Cond = NewSetCC.getOperand(1);
   13607         addTest = false;
   13608       }
   13609     }
   13610   }
   13611 
   13612   if (addTest) {
   13613     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   13614     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
   13615   }
   13616 
   13617   // a <  b ? -1 :  0 -> RES = ~setcc_carry
   13618   // a <  b ?  0 : -1 -> RES = setcc_carry
   13619   // a >= b ? -1 :  0 -> RES = setcc_carry
   13620   // a >= b ?  0 : -1 -> RES = ~setcc_carry
   13621   if (Cond.getOpcode() == X86ISD::SUB) {
   13622     Cond = ConvertCmpIfNecessary(Cond, DAG);
   13623     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
   13624 
   13625     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
   13626         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
   13627       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   13628                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
   13629       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
   13630         return DAG.getNOT(DL, Res, Res.getValueType());
   13631       return Res;
   13632     }
   13633   }
   13634 
   13635   // X86 doesn't have an i8 cmov. If both operands are the result of a
   13636   // truncate, widen the cmov and push the truncate through. This avoids
   13637   // introducing a new branch during isel and doesn't add any extensions.
   13638   if (Op.getValueType() == MVT::i8 &&
   13639       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
   13640     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
   13641     if (T1.getValueType() == T2.getValueType() &&
   13642         // Blacklist CopyFromReg to avoid partial register stalls.
   13643         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
   13644       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
   13645       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
   13646       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
   13647     }
   13648   }
   13649 
   13650   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
   13651   // condition is true.
   13652   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
   13653   SDValue Ops[] = { Op2, Op1, CC, Cond };
   13654   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
   13655 }
   13656 
   13657 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
   13658                                        SelectionDAG &DAG) {
   13659   MVT VT = Op->getSimpleValueType(0);
   13660   SDValue In = Op->getOperand(0);
   13661   MVT InVT = In.getSimpleValueType();
   13662   MVT VTElt = VT.getVectorElementType();
   13663   MVT InVTElt = InVT.getVectorElementType();
   13664   SDLoc dl(Op);
   13665 
    13666   // SKX: AVX-512 BW/DQ (+VL for 128/256-bit vectors) can sign-extend an i1 mask directly.
   13667   if ((InVTElt == MVT::i1) &&
   13668       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
   13669         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
   13670 
   13671        ((Subtarget->hasBWI() && VT.is512BitVector() &&
   13672         VTElt.getSizeInBits() <= 16)) ||
   13673 
   13674        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
   13675         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
   13676 
   13677        ((Subtarget->hasDQI() && VT.is512BitVector() &&
   13678         VTElt.getSizeInBits() >= 32))))
   13679     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   13680 
   13681   unsigned int NumElts = VT.getVectorNumElements();
   13682 
   13683   if (NumElts != 8 && NumElts != 16)
   13684     return SDValue();
   13685 
   13686   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
   13687     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
   13688       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
   13689     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   13690   }
   13691 
   13692   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   13693   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
   13694 
   13695   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
   13696   Constant *C = ConstantInt::get(*DAG.getContext(),
   13697     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
   13698 
   13699   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
   13700   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   13701   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
   13702                           MachinePointerInfo::getConstantPool(),
   13703                           false, false, false, Alignment);
   13704   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
   13705   if (VT.is512BitVector())
   13706     return Brcst;
   13707   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
   13708 }
   13709 
   13710 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   13711                                 SelectionDAG &DAG) {
   13712   MVT VT = Op->getSimpleValueType(0);
   13713   SDValue In = Op->getOperand(0);
   13714   MVT InVT = In.getSimpleValueType();
   13715   SDLoc dl(Op);
   13716 
   13717   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
   13718     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
   13719 
   13720   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
   13721       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
   13722       (VT != MVT::v16i16 || InVT != MVT::v16i8))
   13723     return SDValue();
   13724 
   13725   if (Subtarget->hasInt256())
   13726     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   13727 
    13728   // Optimize vectors in AVX mode:
    13729   // sign-extend v8i16 to v8i32 and
    13730   //             v4i32 to v4i64.
    13731   //
    13732   // Divide the input vector into two parts
    13733   // (for v4i32 the shuffle masks are {0, 1, -1, -1} and {2, 3, -1, -1}),
    13734   // use a vpmovsx instruction to extend each half (v4i32 -> v2i64, v8i16 -> v4i32),
    13735   // and concatenate the results back to the original VT.
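            // For example, v8i16 -> v8i32 becomes two shuffles that split the input
            // into elements {0..3} and {4..7}, two VSEXT (vpmovsxwd) nodes producing
            // v4i32 halves, and a CONCAT_VECTORS that rebuilds the v8i32 result.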
   13736 
   13737   unsigned NumElems = InVT.getVectorNumElements();
   13738   SDValue Undef = DAG.getUNDEF(InVT);
   13739 
   13740   SmallVector<int,8> ShufMask1(NumElems, -1);
   13741   for (unsigned i = 0; i != NumElems/2; ++i)
   13742     ShufMask1[i] = i;
   13743 
   13744   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
   13745 
   13746   SmallVector<int,8> ShufMask2(NumElems, -1);
   13747   for (unsigned i = 0; i != NumElems/2; ++i)
   13748     ShufMask2[i] = i + NumElems/2;
   13749 
   13750   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
   13751 
   13752   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
   13753                                 VT.getVectorNumElements()/2);
   13754 
   13755   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
   13756   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
   13757 
   13758   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   13759 }
   13760 
   13761 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
   13762 // may emit an illegal shuffle but the expansion is still better than scalar
   13763 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
    13764 // we'll emit a shuffle and an arithmetic shift.
   13765 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
   13766 // TODO: It is possible to support ZExt by zeroing the undef values during
   13767 // the shuffle phase or after the shuffle.
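          // As a rough example, a (v4i32 sextload v4i8) without SSE4.1 can be lowered
          // to a 32-bit scalar load, a shuffle that places each loaded byte in the
          // most significant byte of its i32 lane, and a vector arithmetic shift
          // right by 24 that completes the sign extension.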
   13768 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
   13769                                  SelectionDAG &DAG) {
   13770   MVT RegVT = Op.getSimpleValueType();
   13771   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
   13772   assert(RegVT.isInteger() &&
   13773          "We only custom lower integer vector sext loads.");
   13774 
   13775   // Nothing useful we can do without SSE2 shuffles.
   13776   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
   13777 
   13778   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   13779   SDLoc dl(Ld);
   13780   EVT MemVT = Ld->getMemoryVT();
   13781   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   13782   unsigned RegSz = RegVT.getSizeInBits();
   13783 
   13784   ISD::LoadExtType Ext = Ld->getExtensionType();
   13785 
   13786   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
   13787          && "Only anyext and sext are currently implemented.");
   13788   assert(MemVT != RegVT && "Cannot extend to the same type");
   13789   assert(MemVT.isVector() && "Must load a vector from memory");
   13790 
   13791   unsigned NumElems = RegVT.getVectorNumElements();
   13792   unsigned MemSz = MemVT.getSizeInBits();
   13793   assert(RegSz > MemSz && "Register size must be greater than the mem size");
   13794 
   13795   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
   13796     // The only way in which we have a legal 256-bit vector result but not the
   13797     // integer 256-bit operations needed to directly lower a sextload is if we
   13798     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
   13799     // a 128-bit vector and a normal sign_extend to 256-bits that should get
   13800     // correctly legalized. We do this late to allow the canonical form of
   13801     // sextload to persist throughout the rest of the DAG combiner -- it wants
   13802     // to fold together any extensions it can, and so will fuse a sign_extend
   13803     // of an sextload into a sextload targeting a wider value.
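              // For example, with AVX1 a (v8i32 sextload v8i16) becomes a plain
              // v8i16 load followed by a v8i16 -> v8i32 sign_extend, while a
              // (v8i32 sextload v8i8) first sextloads to the 128-bit v8i16 type and
              // is then sign-extended to v8i32.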
   13804     SDValue Load;
   13805     if (MemSz == 128) {
   13806       // Just switch this to a normal load.
   13807       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
   13808                                        "it must be a legal 128-bit vector "
   13809                                        "type!");
   13810       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
   13811                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
   13812                   Ld->isInvariant(), Ld->getAlignment());
   13813     } else {
   13814       assert(MemSz < 128 &&
   13815              "Can't extend a type wider than 128 bits to a 256 bit vector!");
   13816       // Do an sext load to a 128-bit vector type. We want to use the same
   13817       // number of elements, but elements half as wide. This will end up being
   13818       // recursively lowered by this routine, but will succeed as we definitely
   13819       // have all the necessary features if we're using AVX1.
   13820       EVT HalfEltVT =
   13821           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
   13822       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
   13823       Load =
   13824           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
   13825                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
   13826                          Ld->isNonTemporal(), Ld->isInvariant(),
   13827                          Ld->getAlignment());
   13828     }
   13829 
   13830     // Replace chain users with the new chain.
   13831     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
   13832     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
   13833 
   13834     // Finally, do a normal sign-extend to the desired register.
   13835     return DAG.getSExtOrTrunc(Load, dl, RegVT);
   13836   }
   13837 
   13838   // All sizes must be a power of two.
   13839   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
   13840          "Non-power-of-two elements are not custom lowered!");
   13841 
   13842   // Attempt to load the original value using scalar loads.
   13843   // Find the largest scalar type that divides the total loaded size.
   13844   MVT SclrLoadTy = MVT::i8;
   13845   for (MVT Tp : MVT::integer_valuetypes()) {
   13846     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
   13847       SclrLoadTy = Tp;
   13848     }
   13849   }
   13850 
    13851   // On 32-bit targets we can't use 64-bit integer loads; try loading as f64.
   13852   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
   13853       (64 <= MemSz))
   13854     SclrLoadTy = MVT::f64;
   13855 
   13856   // Calculate the number of scalar loads that we need to perform
   13857   // in order to load our vector from memory.
   13858   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
   13859 
   13860   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
   13861          "Can only lower sext loads with a single scalar load!");
   13862 
   13863   unsigned loadRegZize = RegSz;
   13864   if (Ext == ISD::SEXTLOAD && RegSz == 256)
   13865     loadRegZize /= 2;
   13866 
   13867   // Represent our vector as a sequence of elements which are the
   13868   // largest scalar that we can load.
   13869   EVT LoadUnitVecVT = EVT::getVectorVT(
   13870       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
   13871 
   13872   // Represent the data using the same element type that is stored in
    13873   // memory. In practice, we "widen" MemVT.
   13874   EVT WideVecVT =
   13875       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   13876                        loadRegZize / MemVT.getScalarType().getSizeInBits());
   13877 
   13878   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
   13879          "Invalid vector type");
   13880 
   13881   // We can't shuffle using an illegal type.
   13882   assert(TLI.isTypeLegal(WideVecVT) &&
   13883          "We only lower types that form legal widened vector types");
   13884 
   13885   SmallVector<SDValue, 8> Chains;
   13886   SDValue Ptr = Ld->getBasePtr();
   13887   SDValue Increment =
   13888       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
   13889   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
   13890 
   13891   for (unsigned i = 0; i < NumLoads; ++i) {
   13892     // Perform a single load.
   13893     SDValue ScalarLoad =
   13894         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
   13895                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
   13896                     Ld->getAlignment());
   13897     Chains.push_back(ScalarLoad.getValue(1));
    13898     // Create the first element with SCALAR_TO_VECTOR in order to avoid
    13899     // another round of DAG combining.
   13900     if (i == 0)
   13901       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
   13902     else
   13903       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
   13904                         ScalarLoad, DAG.getIntPtrConstant(i));
   13905 
   13906     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   13907   }
   13908 
   13909   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   13910 
   13911   // Bitcast the loaded value to a vector of the original element type, in
   13912   // the size of the target vector type.
   13913   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
   13914   unsigned SizeRatio = RegSz / MemSz;
   13915 
   13916   if (Ext == ISD::SEXTLOAD) {
   13917     // If we have SSE4.1, we can directly emit a VSEXT node.
   13918     if (Subtarget->hasSSE41()) {
   13919       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
   13920       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   13921       return Sext;
   13922     }
   13923 
   13924     // Otherwise we'll shuffle the small elements in the high bits of the
   13925     // larger type and perform an arithmetic shift. If the shift is not legal
   13926     // it's better to scalarize.
   13927     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
   13928            "We can't implement a sext load without an arithmetic right shift!");
   13929 
   13930     // Redistribute the loaded elements into the different locations.
   13931     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   13932     for (unsigned i = 0; i != NumElems; ++i)
   13933       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
   13934 
   13935     SDValue Shuff = DAG.getVectorShuffle(
   13936         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
   13937 
   13938     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
   13939 
   13940     // Build the arithmetic shift.
   13941     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
   13942                    MemVT.getVectorElementType().getSizeInBits();
   13943     Shuff =
   13944         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
   13945 
   13946     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   13947     return Shuff;
   13948   }
   13949 
   13950   // Redistribute the loaded elements into the different locations.
   13951   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   13952   for (unsigned i = 0; i != NumElems; ++i)
   13953     ShuffleVec[i * SizeRatio] = i;
   13954 
   13955   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
   13956                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
   13957 
   13958   // Bitcast to the requested type.
   13959   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
   13960   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   13961   return Shuff;
   13962 }
   13963 
    13964 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
    13965 // ISD::OR of two X86ISD::SETCC nodes, each of which has no other use apart
   13966 // from the AND / OR.
   13967 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
   13968   Opc = Op.getOpcode();
   13969   if (Opc != ISD::OR && Opc != ISD::AND)
   13970     return false;
   13971   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   13972           Op.getOperand(0).hasOneUse() &&
   13973           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
   13974           Op.getOperand(1).hasOneUse());
   13975 }
   13976 
    13977 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
    13978 // 1, and that the SETCC node has a single use.
   13979 static bool isXor1OfSetCC(SDValue Op) {
   13980   if (Op.getOpcode() != ISD::XOR)
   13981     return false;
   13982   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   13983   if (N1C && N1C->getAPIntValue() == 1) {
   13984     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   13985       Op.getOperand(0).hasOneUse();
   13986   }
   13987   return false;
   13988 }
   13989 
   13990 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   13991   bool addTest = true;
   13992   SDValue Chain = Op.getOperand(0);
   13993   SDValue Cond  = Op.getOperand(1);
   13994   SDValue Dest  = Op.getOperand(2);
   13995   SDLoc dl(Op);
   13996   SDValue CC;
   13997   bool Inverted = false;
   13998 
   13999   if (Cond.getOpcode() == ISD::SETCC) {
   14000     // Check for setcc([su]{add,sub,mul}o == 0).
   14001     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
   14002         isa<ConstantSDNode>(Cond.getOperand(1)) &&
   14003         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
   14004         Cond.getOperand(0).getResNo() == 1 &&
   14005         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
   14006          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
   14007          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
   14008          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
   14009          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
   14010          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
   14011       Inverted = true;
   14012       Cond = Cond.getOperand(0);
   14013     } else {
   14014       SDValue NewCond = LowerSETCC(Cond, DAG);
   14015       if (NewCond.getNode())
   14016         Cond = NewCond;
   14017     }
   14018   }
   14019 #if 0
   14020   // FIXME: LowerXALUO doesn't handle these!!
   14021   else if (Cond.getOpcode() == X86ISD::ADD  ||
   14022            Cond.getOpcode() == X86ISD::SUB  ||
   14023            Cond.getOpcode() == X86ISD::SMUL ||
   14024            Cond.getOpcode() == X86ISD::UMUL)
   14025     Cond = LowerXALUO(Cond, DAG);
   14026 #endif
   14027 
    14028   // Look past (and (setcc_carry (cmp ...)), 1).
   14029   if (Cond.getOpcode() == ISD::AND &&
   14030       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   14031     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
   14032     if (C && C->getAPIntValue() == 1)
   14033       Cond = Cond.getOperand(0);
   14034   }
   14035 
   14036   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   14037   // setting operand in place of the X86ISD::SETCC.
   14038   unsigned CondOpcode = Cond.getOpcode();
   14039   if (CondOpcode == X86ISD::SETCC ||
   14040       CondOpcode == X86ISD::SETCC_CARRY) {
   14041     CC = Cond.getOperand(0);
   14042 
   14043     SDValue Cmp = Cond.getOperand(1);
   14044     unsigned Opc = Cmp.getOpcode();
   14045     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
   14046     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
   14047       Cond = Cmp;
   14048       addTest = false;
   14049     } else {
   14050       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
   14051       default: break;
   14052       case X86::COND_O:
   14053       case X86::COND_B:
   14054         // These can only come from an arithmetic instruction with overflow,
   14055         // e.g. SADDO, UADDO.
   14056         Cond = Cond.getNode()->getOperand(1);
   14057         addTest = false;
   14058         break;
   14059       }
   14060     }
   14061   }
   14062   CondOpcode = Cond.getOpcode();
   14063   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   14064       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   14065       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   14066        Cond.getOperand(0).getValueType() != MVT::i8)) {
   14067     SDValue LHS = Cond.getOperand(0);
   14068     SDValue RHS = Cond.getOperand(1);
   14069     unsigned X86Opcode;
   14070     unsigned X86Cond;
   14071     SDVTList VTs;
   14072     // Keep this in sync with LowerXALUO, otherwise we might create redundant
   14073     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
   14074     // X86ISD::INC).
   14075     switch (CondOpcode) {
   14076     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   14077     case ISD::SADDO:
   14078       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   14079         if (C->isOne()) {
   14080           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
   14081           break;
   14082         }
   14083       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   14084     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   14085     case ISD::SSUBO:
   14086       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   14087         if (C->isOne()) {
   14088           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
   14089           break;
   14090         }
   14091       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   14092     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   14093     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   14094     default: llvm_unreachable("unexpected overflowing operator");
   14095     }
   14096     if (Inverted)
   14097       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
   14098     if (CondOpcode == ISD::UMULO)
   14099       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   14100                           MVT::i32);
   14101     else
   14102       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   14103 
   14104     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
   14105 
   14106     if (CondOpcode == ISD::UMULO)
   14107       Cond = X86Op.getValue(2);
   14108     else
   14109       Cond = X86Op.getValue(1);
   14110 
   14111     CC = DAG.getConstant(X86Cond, MVT::i8);
   14112     addTest = false;
   14113   } else {
   14114     unsigned CondOpc;
   14115     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
   14116       SDValue Cmp = Cond.getOperand(0).getOperand(1);
   14117       if (CondOpc == ISD::OR) {
   14118         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
   14119         // two branches instead of an explicit OR instruction with a
   14120         // separate test.
   14121         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   14122             isX86LogicalCmp(Cmp)) {
   14123           CC = Cond.getOperand(0).getOperand(0);
   14124           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   14125                               Chain, Dest, CC, Cmp);
   14126           CC = Cond.getOperand(1).getOperand(0);
   14127           Cond = Cmp;
   14128           addTest = false;
   14129         }
   14130       } else { // ISD::AND
   14131         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
   14132         // two branches instead of an explicit AND instruction with a
   14133         // separate test. However, we only do this if this block doesn't
   14134         // have a fall-through edge, because this requires an explicit
   14135         // jmp when the condition is false.
   14136         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   14137             isX86LogicalCmp(Cmp) &&
   14138             Op.getNode()->hasOneUse()) {
   14139           X86::CondCode CCode =
   14140             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   14141           CCode = X86::GetOppositeBranchCondition(CCode);
   14142           CC = DAG.getConstant(CCode, MVT::i8);
   14143           SDNode *User = *Op.getNode()->use_begin();
   14144           // Look for an unconditional branch following this conditional branch.
   14145           // We need this because we need to reverse the successors in order
   14146           // to implement FCMP_OEQ.
   14147           if (User->getOpcode() == ISD::BR) {
   14148             SDValue FalseBB = User->getOperand(1);
   14149             SDNode *NewBR =
   14150               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   14151             assert(NewBR == User);
   14152             (void)NewBR;
   14153             Dest = FalseBB;
   14154 
   14155             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   14156                                 Chain, Dest, CC, Cmp);
   14157             X86::CondCode CCode =
   14158               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
   14159             CCode = X86::GetOppositeBranchCondition(CCode);
   14160             CC = DAG.getConstant(CCode, MVT::i8);
   14161             Cond = Cmp;
   14162             addTest = false;
   14163           }
   14164         }
   14165       }
   14166     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
    14167       // Recognize xorb (setcc), 1 patterns; the xor inverts the condition.
    14168       // It should be transformed by the DAG combiner except when the condition
    14169       // is set by an arithmetic-with-overflow node.
   14170       X86::CondCode CCode =
   14171         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   14172       CCode = X86::GetOppositeBranchCondition(CCode);
   14173       CC = DAG.getConstant(CCode, MVT::i8);
   14174       Cond = Cond.getOperand(0).getOperand(1);
   14175       addTest = false;
   14176     } else if (Cond.getOpcode() == ISD::SETCC &&
   14177                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
   14178       // For FCMP_OEQ, we can emit
   14179       // two branches instead of an explicit AND instruction with a
   14180       // separate test. However, we only do this if this block doesn't
   14181       // have a fall-through edge, because this requires an explicit
   14182       // jmp when the condition is false.
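                // For example, 'br (setoeq x, y), %T, %F' can be emitted, roughly,
                // as: ucomiss/ucomisd; jne %F; jp %F; jmp %T -- the parity flag
                // distinguishes the unordered (NaN) case.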
   14183       if (Op.getNode()->hasOneUse()) {
   14184         SDNode *User = *Op.getNode()->use_begin();
   14185         // Look for an unconditional branch following this conditional branch.
   14186         // We need this because we need to reverse the successors in order
   14187         // to implement FCMP_OEQ.
   14188         if (User->getOpcode() == ISD::BR) {
   14189           SDValue FalseBB = User->getOperand(1);
   14190           SDNode *NewBR =
   14191             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   14192           assert(NewBR == User);
   14193           (void)NewBR;
   14194           Dest = FalseBB;
   14195 
   14196           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   14197                                     Cond.getOperand(0), Cond.getOperand(1));
   14198           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   14199           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   14200           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   14201                               Chain, Dest, CC, Cmp);
   14202           CC = DAG.getConstant(X86::COND_P, MVT::i8);
   14203           Cond = Cmp;
   14204           addTest = false;
   14205         }
   14206       }
   14207     } else if (Cond.getOpcode() == ISD::SETCC &&
   14208                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
   14209       // For FCMP_UNE, we can emit
   14210       // two branches instead of an explicit AND instruction with a
   14211       // separate test. However, we only do this if this block doesn't
   14212       // have a fall-through edge, because this requires an explicit
   14213       // jmp when the condition is false.
   14214       if (Op.getNode()->hasOneUse()) {
   14215         SDNode *User = *Op.getNode()->use_begin();
   14216         // Look for an unconditional branch following this conditional branch.
   14217         // We need this because we need to reverse the successors in order
   14218         // to implement FCMP_UNE.
   14219         if (User->getOpcode() == ISD::BR) {
   14220           SDValue FalseBB = User->getOperand(1);
   14221           SDNode *NewBR =
   14222             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   14223           assert(NewBR == User);
   14224           (void)NewBR;
   14225 
   14226           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   14227                                     Cond.getOperand(0), Cond.getOperand(1));
   14228           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   14229           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
   14230           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   14231                               Chain, Dest, CC, Cmp);
   14232           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
   14233           Cond = Cmp;
   14234           addTest = false;
   14235           Dest = FalseBB;
   14236         }
   14237       }
   14238     }
   14239   }
   14240 
   14241   if (addTest) {
    14242     // Look past the truncate if the high bits are known zero.
   14243     if (isTruncWithZeroHighBitsInput(Cond, DAG))
   14244         Cond = Cond.getOperand(0);
   14245 
   14246     // We know the result of AND is compared against zero. Try to match
   14247     // it to BT.
   14248     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   14249       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
   14250       if (NewSetCC.getNode()) {
   14251         CC = NewSetCC.getOperand(0);
   14252         Cond = NewSetCC.getOperand(1);
   14253         addTest = false;
   14254       }
   14255     }
   14256   }
   14257 
   14258   if (addTest) {
   14259     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
   14260     CC = DAG.getConstant(X86Cond, MVT::i8);
   14261     Cond = EmitTest(Cond, X86Cond, dl, DAG);
   14262   }
   14263   Cond = ConvertCmpIfNecessary(Cond, DAG);
   14264   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   14265                      Chain, Dest, CC, Cond);
   14266 }
   14267 
   14268 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
   14269 // Calls to _alloca are needed to probe the stack when allocating more than 4k
   14270 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
   14271 // that the guard pages used by the OS virtual memory manager are allocated in
    14272 // the correct sequence.
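          // As a rough sketch, the allocation size ends up in EAX/RAX and the probe
          // routine touches each 4K page of the newly allocated region so the OS can
          // grow the stack; whether the routine itself also adjusts the stack pointer
          // varies by target.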
   14273 SDValue
   14274 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   14275                                            SelectionDAG &DAG) const {
   14276   MachineFunction &MF = DAG.getMachineFunction();
   14277   bool SplitStack = MF.shouldSplitStack();
   14278   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
   14279                SplitStack;
   14280   SDLoc dl(Op);
   14281 
   14282   if (!Lower) {
   14283     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   14284     SDNode* Node = Op.getNode();
   14285 
   14286     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
   14287     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
   14288         " not tell us which reg is the stack pointer!");
   14289     EVT VT = Node->getValueType(0);
   14290     SDValue Tmp1 = SDValue(Node, 0);
   14291     SDValue Tmp2 = SDValue(Node, 1);
   14292     SDValue Tmp3 = Node->getOperand(2);
   14293     SDValue Chain = Tmp1.getOperand(0);
   14294 
   14295     // Chain the dynamic stack allocation so that it doesn't modify the stack
   14296     // pointer when other instructions are using the stack.
   14297     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
   14298         SDLoc(Node));
   14299 
   14300     SDValue Size = Tmp2.getOperand(1);
   14301     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   14302     Chain = SP.getValue(1);
   14303     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
   14304     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   14305     unsigned StackAlign = TFI.getStackAlignment();
   14306     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
   14307     if (Align > StackAlign)
   14308       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
   14309           DAG.getConstant(-(uint64_t)Align, VT));
   14310     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
   14311 
   14312     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
   14313         DAG.getIntPtrConstant(0, true), SDValue(),
   14314         SDLoc(Node));
   14315 
   14316     SDValue Ops[2] = { Tmp1, Tmp2 };
   14317     return DAG.getMergeValues(Ops, dl);
   14318   }
   14319 
   14320   // Get the inputs.
   14321   SDValue Chain = Op.getOperand(0);
   14322   SDValue Size  = Op.getOperand(1);
   14323   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
   14324   EVT VT = Op.getNode()->getValueType(0);
   14325 
   14326   bool Is64Bit = Subtarget->is64Bit();
   14327   EVT SPTy = getPointerTy();
   14328 
   14329   if (SplitStack) {
   14330     MachineRegisterInfo &MRI = MF.getRegInfo();
   14331 
   14332     if (Is64Bit) {
    14333       // The 64-bit implementation of segmented stacks needs to clobber both r10
    14334       // and r11. This makes it impossible to use it along with nested parameters.
   14335       const Function *F = MF.getFunction();
   14336 
   14337       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
   14338            I != E; ++I)
   14339         if (I->hasNestAttr())
   14340           report_fatal_error("Cannot use segmented stacks with functions that "
   14341                              "have nested arguments.");
   14342     }
   14343 
   14344     const TargetRegisterClass *AddrRegClass =
   14345       getRegClassFor(getPointerTy());
   14346     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
   14347     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
   14348     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
   14349                                 DAG.getRegister(Vreg, SPTy));
   14350     SDValue Ops1[2] = { Value, Chain };
   14351     return DAG.getMergeValues(Ops1, dl);
   14352   } else {
   14353     SDValue Flag;
   14354     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
   14355 
   14356     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
   14357     Flag = Chain.getValue(1);
   14358     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   14359 
   14360     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
   14361 
   14362     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   14363     unsigned SPReg = RegInfo->getStackRegister();
   14364     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
   14365     Chain = SP.getValue(1);
   14366 
   14367     if (Align) {
   14368       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
   14369                        DAG.getConstant(-(uint64_t)Align, VT));
   14370       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
   14371     }
   14372 
   14373     SDValue Ops1[2] = { SP, Chain };
   14374     return DAG.getMergeValues(Ops1, dl);
   14375   }
   14376 }
   14377 
   14378 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   14379   MachineFunction &MF = DAG.getMachineFunction();
   14380   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   14381 
   14382   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   14383   SDLoc DL(Op);
   14384 
   14385   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
   14386     // vastart just stores the address of the VarArgsFrameIndex slot into the
   14387     // memory location argument.
   14388     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
   14389                                    getPointerTy());
   14390     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
   14391                         MachinePointerInfo(SV), false, false, 0);
   14392   }
   14393 
   14394   // __va_list_tag:
   14395   //   gp_offset         (0 - 6 * 8)
   14396   //   fp_offset         (48 - 48 + 8 * 16)
   14397   //   overflow_arg_area (point to parameters coming in memory).
   14398   //   reg_save_area
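            // The stores below fill that struct in order, assuming the usual SysV
            // x86-64 layout:
            //   offset 0:  i32 gp_offset
            //   offset 4:  i32 fp_offset
            //   offset 8:  i8* overflow_arg_area
            //   offset 16: i8* reg_save_area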
   14399   SmallVector<SDValue, 8> MemOps;
   14400   SDValue FIN = Op.getOperand(1);
   14401   // Store gp_offset
   14402   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
   14403                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
   14404                                                MVT::i32),
   14405                                FIN, MachinePointerInfo(SV), false, false, 0);
   14406   MemOps.push_back(Store);
   14407 
   14408   // Store fp_offset
   14409   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   14410                     FIN, DAG.getIntPtrConstant(4));
   14411   Store = DAG.getStore(Op.getOperand(0), DL,
   14412                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
   14413                                        MVT::i32),
   14414                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
   14415   MemOps.push_back(Store);
   14416 
   14417   // Store ptr to overflow_arg_area
   14418   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   14419                     FIN, DAG.getIntPtrConstant(4));
   14420   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
   14421                                     getPointerTy());
   14422   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
   14423                        MachinePointerInfo(SV, 8),
   14424                        false, false, 0);
   14425   MemOps.push_back(Store);
   14426 
   14427   // Store ptr to reg_save_area.
   14428   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
   14429                     FIN, DAG.getIntPtrConstant(8));
   14430   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   14431                                     getPointerTy());
   14432   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
   14433                        MachinePointerInfo(SV, 16), false, false, 0);
   14434   MemOps.push_back(Store);
   14435   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
   14436 }
   14437 
   14438 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   14439   assert(Subtarget->is64Bit() &&
   14440          "LowerVAARG only handles 64-bit va_arg!");
   14441   assert((Subtarget->isTargetLinux() ||
   14442           Subtarget->isTargetDarwin()) &&
   14443           "Unhandled target in LowerVAARG");
   14444   assert(Op.getNode()->getNumOperands() == 4);
   14445   SDValue Chain = Op.getOperand(0);
   14446   SDValue SrcPtr = Op.getOperand(1);
   14447   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   14448   unsigned Align = Op.getConstantOperandVal(3);
   14449   SDLoc dl(Op);
   14450 
   14451   EVT ArgVT = Op.getNode()->getValueType(0);
   14452   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   14453   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
   14454   uint8_t ArgMode;
   14455 
   14456   // Decide which area this value should be read from.
   14457   // TODO: Implement the AMD64 ABI in its entirety. This simple
   14458   // selection mechanism works only for the basic types.
   14459   if (ArgVT == MVT::f80) {
   14460     llvm_unreachable("va_arg for f80 not yet implemented");
   14461   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
   14462     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
   14463   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
   14464     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
   14465   } else {
   14466     llvm_unreachable("Unhandled argument type in LowerVAARG");
   14467   }
   14468 
   14469   if (ArgMode == 2) {
   14470     // Sanity Check: Make sure using fp_offset makes sense.
   14471     assert(!DAG.getTarget().Options.UseSoftFloat &&
   14472            !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
   14473                Attribute::NoImplicitFloat)) &&
   14474            Subtarget->hasSSE1());
   14475   }
   14476 
   14477   // Insert VAARG_64 node into the DAG
   14478   // VAARG_64 returns two values: Variable Argument Address, Chain
   14479   SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, MVT::i32),
   14480                        DAG.getConstant(ArgMode, MVT::i8),
   14481                        DAG.getConstant(Align, MVT::i32)};
   14482   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
   14483   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
   14484                                           VTs, InstOps, MVT::i64,
   14485                                           MachinePointerInfo(SV),
   14486                                           /*Align=*/0,
   14487                                           /*Volatile=*/false,
   14488                                           /*ReadMem=*/true,
   14489                                           /*WriteMem=*/true);
   14490   Chain = VAARG.getValue(1);
   14491 
   14492   // Load the next argument and return it
   14493   return DAG.getLoad(ArgVT, dl,
   14494                      Chain,
   14495                      VAARG,
   14496                      MachinePointerInfo(),
   14497                      false, false, false, 0);
   14498 }
   14499 
   14500 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
   14501                            SelectionDAG &DAG) {
   14502   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
   14503   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
   14504   SDValue Chain = Op.getOperand(0);
   14505   SDValue DstPtr = Op.getOperand(1);
   14506   SDValue SrcPtr = Op.getOperand(2);
   14507   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   14508   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   14509   SDLoc DL(Op);
   14510 
   14511   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
   14512                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
   14513                        false, false,
   14514                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
   14515 }
   14516 
   14517 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
   14518 // amount is a constant. Takes immediate version of shift as input.
   14519 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
   14520                                           SDValue SrcOp, uint64_t ShiftAmt,
   14521                                           SelectionDAG &DAG) {
   14522   MVT ElementType = VT.getVectorElementType();
   14523 
   14524   // Fold this packed shift into its first operand if ShiftAmt is 0.
   14525   if (ShiftAmt == 0)
   14526     return SrcOp;
   14527 
   14528   // Check for ShiftAmt >= element width
   14529   if (ShiftAmt >= ElementType.getSizeInBits()) {
   14530     if (Opc == X86ISD::VSRAI)
   14531       ShiftAmt = ElementType.getSizeInBits() - 1;
   14532     else
   14533       return DAG.getConstant(0, VT);
   14534   }
   14535 
   14536   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
   14537          && "Unknown target vector shift-by-constant node");
   14538 
   14539   // Fold this packed vector shift into a build vector if SrcOp is a
   14540   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
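            // For example, (VSHLI (build_vector 1, 2, undef, 8), 2) folds to
            // (build_vector 4, 8, undef, 32) and no shift instruction is emitted.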
   14541   if (VT == SrcOp.getSimpleValueType() &&
   14542       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
   14543     SmallVector<SDValue, 8> Elts;
   14544     unsigned NumElts = SrcOp->getNumOperands();
   14545     ConstantSDNode *ND;
   14546 
   14547     switch(Opc) {
   14548     default: llvm_unreachable(nullptr);
   14549     case X86ISD::VSHLI:
   14550       for (unsigned i=0; i!=NumElts; ++i) {
   14551         SDValue CurrentOp = SrcOp->getOperand(i);
   14552         if (CurrentOp->getOpcode() == ISD::UNDEF) {
   14553           Elts.push_back(CurrentOp);
   14554           continue;
   14555         }
   14556         ND = cast<ConstantSDNode>(CurrentOp);
   14557         const APInt &C = ND->getAPIntValue();
   14558         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
   14559       }
   14560       break;
   14561     case X86ISD::VSRLI:
   14562       for (unsigned i=0; i!=NumElts; ++i) {
   14563         SDValue CurrentOp = SrcOp->getOperand(i);
   14564         if (CurrentOp->getOpcode() == ISD::UNDEF) {
   14565           Elts.push_back(CurrentOp);
   14566           continue;
   14567         }
   14568         ND = cast<ConstantSDNode>(CurrentOp);
   14569         const APInt &C = ND->getAPIntValue();
   14570         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
   14571       }
   14572       break;
   14573     case X86ISD::VSRAI:
   14574       for (unsigned i=0; i!=NumElts; ++i) {
   14575         SDValue CurrentOp = SrcOp->getOperand(i);
   14576         if (CurrentOp->getOpcode() == ISD::UNDEF) {
   14577           Elts.push_back(CurrentOp);
   14578           continue;
   14579         }
   14580         ND = cast<ConstantSDNode>(CurrentOp);
   14581         const APInt &C = ND->getAPIntValue();
   14582         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
   14583       }
   14584       break;
   14585     }
   14586 
   14587     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
   14588   }
   14589 
   14590   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
   14591 }
   14592 
   14593 // getTargetVShiftNode - Handle vector element shifts where the shift amount
   14594 // may or may not be a constant. Takes immediate version of shift as input.
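          // For example, a variable i32 shift amount is moved into the low element of
          // an XMM register and the non-immediate form of the shift (e.g. VSHL rather
          // than VSHLI) is used.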
   14595 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
   14596                                    SDValue SrcOp, SDValue ShAmt,
   14597                                    SelectionDAG &DAG) {
   14598   MVT SVT = ShAmt.getSimpleValueType();
   14599   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
   14600 
   14601   // Catch shift-by-constant.
   14602   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
   14603     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
   14604                                       CShAmt->getZExtValue(), DAG);
   14605 
   14606   // Change opcode to non-immediate version
   14607   switch (Opc) {
   14608     default: llvm_unreachable("Unknown target vector shift node");
   14609     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
   14610     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
   14611     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
   14612   }
   14613 
   14614   const X86Subtarget &Subtarget =
   14615       static_cast<const X86Subtarget &>(DAG.getSubtarget());
   14616   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
   14617       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
   14618     // Let the shuffle legalizer expand this shift amount node.
   14619     SDValue Op0 = ShAmt.getOperand(0);
   14620     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
   14621     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
   14622   } else {
   14623     // Need to build a vector containing shift amount.
    14624     // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
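              // For example, an i32 amount N is built as the v4i32 vector
              // <N, 0, undef, undef>, and an i64 amount as the v2i64 vector <N, undef>.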
   14625     SmallVector<SDValue, 4> ShOps;
   14626     ShOps.push_back(ShAmt);
   14627     if (SVT == MVT::i32) {
   14628       ShOps.push_back(DAG.getConstant(0, SVT));
   14629       ShOps.push_back(DAG.getUNDEF(SVT));
   14630     }
   14631     ShOps.push_back(DAG.getUNDEF(SVT));
   14632 
   14633     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
   14634     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
   14635   }
   14636 
   14637   // The return type has to be a 128-bit type with the same element
   14638   // type as the input type.
   14639   MVT EltVT = VT.getVectorElementType();
   14640   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
   14641 
   14642   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
   14643   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
   14644 }
   14645 
   14646 /// \brief Return (and \p Op, \p Mask) for compare instructions or
   14647 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
   14648 /// necessary casting for \p Mask when lowering masking intrinsics.
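          /// For example, a masked compare such as PCMPEQM becomes
          /// (and (PCMPEQM a, b), Mask), while other operations become
          /// (vselect Mask, Op, PreservedSrc), with PreservedSrc replaced by a
          /// zero vector when the intrinsic's passthru operand is undef.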
   14649 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
   14650                                     SDValue PreservedSrc,
   14651                                     const X86Subtarget *Subtarget,
   14652                                     SelectionDAG &DAG) {
   14653     EVT VT = Op.getValueType();
   14654     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
   14655                                   MVT::i1, VT.getVectorNumElements());
   14656     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   14657                                      Mask.getValueType().getSizeInBits());
   14658     SDLoc dl(Op);
   14659 
   14660     assert(MaskVT.isSimple() && "invalid mask type");
   14661 
   14662     if (isAllOnes(Mask))
   14663       return Op;
   14664 
    14665     // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements of the bitcast
    14666     // mask are extracted with EXTRACT_SUBVECTOR.
   14667     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   14668                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
   14669                               DAG.getIntPtrConstant(0));
   14670 
   14671     switch (Op.getOpcode()) {
   14672       default: break;
   14673       case X86ISD::PCMPEQM:
   14674       case X86ISD::PCMPGTM:
   14675       case X86ISD::CMPM:
   14676       case X86ISD::CMPMU:
   14677         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
   14678     }
   14679     if (PreservedSrc.getOpcode() == ISD::UNDEF)
   14680       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   14681     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
   14682 }
   14683 
   14684 /// \brief Creates an SDNode for a predicated scalar operation.
   14685 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
    14686 /// The mask is coming in as MVT::i8 and it should be truncated
   14687 /// to MVT::i1 while lowering masking intrinsics.
   14688 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
   14689 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
   14690 /// a scalar instruction.
   14691 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
   14692                                     SDValue PreservedSrc,
   14693                                     const X86Subtarget *Subtarget,
   14694                                     SelectionDAG &DAG) {
   14695     if (isAllOnes(Mask))
   14696       return Op;
   14697 
   14698     EVT VT = Op.getValueType();
   14699     SDLoc dl(Op);
   14700     // The mask should be of type MVT::i1
   14701     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
   14702 
   14703     if (PreservedSrc.getOpcode() == ISD::UNDEF)
   14704       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   14705     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
   14706 }
   14707 
   14708 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
   14709                                        SelectionDAG &DAG) {
   14710   SDLoc dl(Op);
   14711   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   14712   EVT VT = Op.getValueType();
   14713   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
   14714   if (IntrData) {
   14715     switch(IntrData->Type) {
   14716     case INTR_TYPE_1OP:
   14717       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
   14718     case INTR_TYPE_2OP:
   14719       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   14720         Op.getOperand(2));
   14721     case INTR_TYPE_3OP:
   14722       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   14723         Op.getOperand(2), Op.getOperand(3));
   14724     case INTR_TYPE_1OP_MASK_RM: {
   14725       SDValue Src = Op.getOperand(1);
   14726       SDValue Src0 = Op.getOperand(2);
   14727       SDValue Mask = Op.getOperand(3);
   14728       SDValue RoundingMode = Op.getOperand(4);
   14729       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
   14730                                               RoundingMode),
   14731                                   Mask, Src0, Subtarget, DAG);
   14732     }
   14733     case INTR_TYPE_SCALAR_MASK_RM: {
   14734       SDValue Src1 = Op.getOperand(1);
   14735       SDValue Src2 = Op.getOperand(2);
   14736       SDValue Src0 = Op.getOperand(3);
   14737       SDValue Mask = Op.getOperand(4);
   14738       // There are 2 kinds of intrinsics in this group:
    14739       // (1) With suppress-all-exceptions (sae) - 6 operands
   14740       // (2) With rounding mode and sae - 7 operands.
   14741       if (Op.getNumOperands() == 6) {
   14742         SDValue Sae  = Op.getOperand(5);
   14743         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
   14744                                                 Sae),
   14745                                     Mask, Src0, Subtarget, DAG);
   14746       }
   14747       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
   14748       SDValue RoundingMode  = Op.getOperand(5);
   14749       SDValue Sae  = Op.getOperand(6);
   14750       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
   14751                                               RoundingMode, Sae),
   14752                                   Mask, Src0, Subtarget, DAG);
   14753     }
   14754     case INTR_TYPE_2OP_MASK: {
   14755       SDValue Src1 = Op.getOperand(1);
   14756       SDValue Src2 = Op.getOperand(2);
   14757       SDValue PassThru = Op.getOperand(3);
   14758       SDValue Mask = Op.getOperand(4);
   14759       // We specify 2 possible opcodes for intrinsics with rounding modes.
    14760       // First, we check if the intrinsic may have a non-default rounding mode
   14761       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
   14762       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   14763       if (IntrWithRoundingModeOpcode != 0) {
   14764         SDValue Rnd = Op.getOperand(5);
   14765         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
   14766         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
   14767           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   14768                                       dl, Op.getValueType(),
   14769                                       Src1, Src2, Rnd),
   14770                                       Mask, PassThru, Subtarget, DAG);
   14771         }
   14772       }
   14773       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   14774                                               Src1,Src2),
   14775                                   Mask, PassThru, Subtarget, DAG);
   14776     }
   14777     case FMA_OP_MASK: {
   14778       SDValue Src1 = Op.getOperand(1);
   14779       SDValue Src2 = Op.getOperand(2);
   14780       SDValue Src3 = Op.getOperand(3);
   14781       SDValue Mask = Op.getOperand(4);
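                // Note that Src1 doubles as the pass-through value for the masking node
                // below: in the AVX-512 FMA forms the destination register is also one of
                // the sources, so masked-off elements are taken from Src1.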
    14782       // We specify 2 possible opcodes for intrinsics with rounding modes.
    14783       // First, we check whether the intrinsic may have a non-default rounding
    14784       // mode (IntrData->Opc1 != 0), then we check the rounding-mode operand.
   14785       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   14786       if (IntrWithRoundingModeOpcode != 0) {
   14787         SDValue Rnd = Op.getOperand(5);
   14788         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
   14789             X86::STATIC_ROUNDING::CUR_DIRECTION)
   14790           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   14791                                                   dl, Op.getValueType(),
   14792                                                   Src1, Src2, Src3, Rnd),
   14793                                       Mask, Src1, Subtarget, DAG);
   14794       }
   14795       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
   14796                                               dl, Op.getValueType(),
   14797                                               Src1, Src2, Src3),
   14798                                   Mask, Src1, Subtarget, DAG);
   14799     }
   14800     case CMP_MASK:
   14801     case CMP_MASK_CC: {
   14802       // Comparison intrinsics with masks.
   14803       // Example of transformation:
   14804       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
   14805       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
   14806       // (i8 (bitcast
   14807       //   (v8i1 (insert_subvector undef,
   14808       //           (v2i1 (and (PCMPEQM %a, %b),
   14809       //                      (extract_subvector
   14810       //                         (v8i1 (bitcast %mask)), 0))), 0))))
   14811       EVT VT = Op.getOperand(1).getValueType();
   14812       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   14813                                     VT.getVectorNumElements());
   14814       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
   14815       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   14816                                        Mask.getValueType().getSizeInBits());
   14817       SDValue Cmp;
   14818       if (IntrData->Type == CMP_MASK_CC) {
   14819         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
   14820                     Op.getOperand(2), Op.getOperand(3));
   14821       } else {
   14822         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
   14823         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
   14824                     Op.getOperand(2));
   14825       }
   14826       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
   14827                                              DAG.getTargetConstant(0, MaskVT),
   14828                                              Subtarget, DAG);
   14829       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
   14830                                 DAG.getUNDEF(BitcastVT), CmpMask,
   14831                                 DAG.getIntPtrConstant(0));
   14832       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
   14833     }
   14834     case COMI: { // Comparison intrinsics
   14835       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
   14836       SDValue LHS = Op.getOperand(1);
   14837       SDValue RHS = Op.getOperand(2);
   14838       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
   14839       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
   14840       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
   14841       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   14842                                   DAG.getConstant(X86CC, MVT::i8), Cond);
   14843       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   14844     }
   14845     case VSHIFT:
   14846       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
   14847                                  Op.getOperand(1), Op.getOperand(2), DAG);
   14848     case VSHIFT_MASK:
   14849       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
   14850                                                       Op.getSimpleValueType(),
   14851                                                       Op.getOperand(1),
   14852                                                       Op.getOperand(2), DAG),
   14853                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
   14854                                   DAG);
   14855     case COMPRESS_EXPAND_IN_REG: {
   14856       SDValue Mask = Op.getOperand(3);
   14857       SDValue DataToCompress = Op.getOperand(1);
   14858       SDValue PassThru = Op.getOperand(2);
   14859       if (isAllOnes(Mask)) // return data as is
   14860         return Op.getOperand(1);
   14861       EVT VT = Op.getValueType();
   14862       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   14863                                     VT.getVectorNumElements());
   14864       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   14865                                        Mask.getValueType().getSizeInBits());
   14866       SDLoc dl(Op);
   14867       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   14868                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
   14869                                   DAG.getIntPtrConstant(0));
   14870 
   14871       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
   14872                          PassThru);
   14873     }
   14874     case BLEND: {
   14875       SDValue Mask = Op.getOperand(3);
   14876       EVT VT = Op.getValueType();
   14877       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   14878                                     VT.getVectorNumElements());
   14879       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   14880                                        Mask.getValueType().getSizeInBits());
   14881       SDLoc dl(Op);
   14882       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   14883                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
   14884                                   DAG.getIntPtrConstant(0));
   14885       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
   14886                          Op.getOperand(2));
   14887     }
   14888     default:
   14889       break;
   14890     }
   14891   }
   14892 
   14893   switch (IntNo) {
   14894   default: return SDValue();    // Don't custom lower most intrinsics.
   14895 
   14896   case Intrinsic::x86_avx2_permd:
   14897   case Intrinsic::x86_avx2_permps:
    14898     // Operands intentionally swapped. The mask is the last operand to the
    14899     // intrinsic but the second operand of the node/instruction.
   14900     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
   14901                        Op.getOperand(2), Op.getOperand(1));
   14902 
   14903   case Intrinsic::x86_avx512_mask_valign_q_512:
   14904   case Intrinsic::x86_avx512_mask_valign_d_512:
   14905     // Vector source operands are swapped.
   14906     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
   14907                                             Op.getValueType(), Op.getOperand(2),
   14908                                             Op.getOperand(1),
   14909                                             Op.getOperand(3)),
   14910                                 Op.getOperand(5), Op.getOperand(4),
   14911                                 Subtarget, DAG);
   14912 
    14913   // ptest and testp intrinsics. The intrinsics these come from are designed to
    14914   // return an integer value, not just an instruction, so lower them to the
    14915   // ptest or testp pattern plus a setcc for the result.
   14916   case Intrinsic::x86_sse41_ptestz:
   14917   case Intrinsic::x86_sse41_ptestc:
   14918   case Intrinsic::x86_sse41_ptestnzc:
   14919   case Intrinsic::x86_avx_ptestz_256:
   14920   case Intrinsic::x86_avx_ptestc_256:
   14921   case Intrinsic::x86_avx_ptestnzc_256:
   14922   case Intrinsic::x86_avx_vtestz_ps:
   14923   case Intrinsic::x86_avx_vtestc_ps:
   14924   case Intrinsic::x86_avx_vtestnzc_ps:
   14925   case Intrinsic::x86_avx_vtestz_pd:
   14926   case Intrinsic::x86_avx_vtestc_pd:
   14927   case Intrinsic::x86_avx_vtestnzc_pd:
   14928   case Intrinsic::x86_avx_vtestz_ps_256:
   14929   case Intrinsic::x86_avx_vtestc_ps_256:
   14930   case Intrinsic::x86_avx_vtestnzc_ps_256:
   14931   case Intrinsic::x86_avx_vtestz_pd_256:
   14932   case Intrinsic::x86_avx_vtestc_pd_256:
   14933   case Intrinsic::x86_avx_vtestnzc_pd_256: {
   14934     bool IsTestPacked = false;
   14935     unsigned X86CC;
   14936     switch (IntNo) {
   14937     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
   14938     case Intrinsic::x86_avx_vtestz_ps:
   14939     case Intrinsic::x86_avx_vtestz_pd:
   14940     case Intrinsic::x86_avx_vtestz_ps_256:
   14941     case Intrinsic::x86_avx_vtestz_pd_256:
   14942       IsTestPacked = true; // Fallthrough
   14943     case Intrinsic::x86_sse41_ptestz:
   14944     case Intrinsic::x86_avx_ptestz_256:
   14945       // ZF = 1
   14946       X86CC = X86::COND_E;
   14947       break;
   14948     case Intrinsic::x86_avx_vtestc_ps:
   14949     case Intrinsic::x86_avx_vtestc_pd:
   14950     case Intrinsic::x86_avx_vtestc_ps_256:
   14951     case Intrinsic::x86_avx_vtestc_pd_256:
   14952       IsTestPacked = true; // Fallthrough
   14953     case Intrinsic::x86_sse41_ptestc:
   14954     case Intrinsic::x86_avx_ptestc_256:
   14955       // CF = 1
   14956       X86CC = X86::COND_B;
   14957       break;
   14958     case Intrinsic::x86_avx_vtestnzc_ps:
   14959     case Intrinsic::x86_avx_vtestnzc_pd:
   14960     case Intrinsic::x86_avx_vtestnzc_ps_256:
   14961     case Intrinsic::x86_avx_vtestnzc_pd_256:
   14962       IsTestPacked = true; // Fallthrough
   14963     case Intrinsic::x86_sse41_ptestnzc:
   14964     case Intrinsic::x86_avx_ptestnzc_256:
   14965       // ZF and CF = 0
   14966       X86CC = X86::COND_A;
   14967       break;
   14968     }
   14969 
   14970     SDValue LHS = Op.getOperand(1);
   14971     SDValue RHS = Op.getOperand(2);
   14972     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
   14973     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
   14974     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
   14975     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
   14976     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   14977   }
   14978   case Intrinsic::x86_avx512_kortestz_w:
   14979   case Intrinsic::x86_avx512_kortestc_w: {
   14980     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
   14981     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
   14982     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
   14983     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
   14984     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
   14985     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
   14986     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   14987   }
   14988 
   14989   case Intrinsic::x86_sse42_pcmpistria128:
   14990   case Intrinsic::x86_sse42_pcmpestria128:
   14991   case Intrinsic::x86_sse42_pcmpistric128:
   14992   case Intrinsic::x86_sse42_pcmpestric128:
   14993   case Intrinsic::x86_sse42_pcmpistrio128:
   14994   case Intrinsic::x86_sse42_pcmpestrio128:
   14995   case Intrinsic::x86_sse42_pcmpistris128:
   14996   case Intrinsic::x86_sse42_pcmpestris128:
   14997   case Intrinsic::x86_sse42_pcmpistriz128:
   14998   case Intrinsic::x86_sse42_pcmpestriz128: {
   14999     unsigned Opcode;
   15000     unsigned X86CC;
   15001     switch (IntNo) {
   15002     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   15003     case Intrinsic::x86_sse42_pcmpistria128:
   15004       Opcode = X86ISD::PCMPISTRI;
   15005       X86CC = X86::COND_A;
   15006       break;
   15007     case Intrinsic::x86_sse42_pcmpestria128:
   15008       Opcode = X86ISD::PCMPESTRI;
   15009       X86CC = X86::COND_A;
   15010       break;
   15011     case Intrinsic::x86_sse42_pcmpistric128:
   15012       Opcode = X86ISD::PCMPISTRI;
   15013       X86CC = X86::COND_B;
   15014       break;
   15015     case Intrinsic::x86_sse42_pcmpestric128:
   15016       Opcode = X86ISD::PCMPESTRI;
   15017       X86CC = X86::COND_B;
   15018       break;
   15019     case Intrinsic::x86_sse42_pcmpistrio128:
   15020       Opcode = X86ISD::PCMPISTRI;
   15021       X86CC = X86::COND_O;
   15022       break;
   15023     case Intrinsic::x86_sse42_pcmpestrio128:
   15024       Opcode = X86ISD::PCMPESTRI;
   15025       X86CC = X86::COND_O;
   15026       break;
   15027     case Intrinsic::x86_sse42_pcmpistris128:
   15028       Opcode = X86ISD::PCMPISTRI;
   15029       X86CC = X86::COND_S;
   15030       break;
   15031     case Intrinsic::x86_sse42_pcmpestris128:
   15032       Opcode = X86ISD::PCMPESTRI;
   15033       X86CC = X86::COND_S;
   15034       break;
   15035     case Intrinsic::x86_sse42_pcmpistriz128:
   15036       Opcode = X86ISD::PCMPISTRI;
   15037       X86CC = X86::COND_E;
   15038       break;
   15039     case Intrinsic::x86_sse42_pcmpestriz128:
   15040       Opcode = X86ISD::PCMPESTRI;
   15041       X86CC = X86::COND_E;
   15042       break;
   15043     }
   15044     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   15045     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   15046     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
   15047     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   15048                                 DAG.getConstant(X86CC, MVT::i8),
   15049                                 SDValue(PCMP.getNode(), 1));
   15050     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   15051   }
   15052 
   15053   case Intrinsic::x86_sse42_pcmpistri128:
   15054   case Intrinsic::x86_sse42_pcmpestri128: {
   15055     unsigned Opcode;
   15056     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
   15057       Opcode = X86ISD::PCMPISTRI;
   15058     else
   15059       Opcode = X86ISD::PCMPESTRI;
   15060 
   15061     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   15062     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   15063     return DAG.getNode(Opcode, dl, VTs, NewOps);
   15064   }
   15065   }
   15066 }
   15067 
   15068 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   15069                               SDValue Src, SDValue Mask, SDValue Base,
   15070                               SDValue Index, SDValue ScaleOp, SDValue Chain,
   15071                               const X86Subtarget * Subtarget) {
   15072   SDLoc dl(Op);
   15073   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
   15074   assert(C && "Invalid scale type");
   15075   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
   15076   EVT MaskVT = MVT::getVectorVT(MVT::i1,
   15077                              Index.getSimpleValueType().getVectorNumElements());
   15078   SDValue MaskInReg;
   15079   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   15080   if (MaskC)
   15081     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
   15082   else
   15083     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
   15084   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   15085   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
   15086   SDValue Segment = DAG.getRegister(0, MVT::i32);
   15087   if (Src.getOpcode() == ISD::UNDEF)
   15088     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
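            // {Base, Scale, Index, Disp, Segment} forms the standard 5-operand X86
            // memory reference. The machine node produces (value, mask, chain); only
            // the value (result 0) and the chain (result 2) are returned to the caller.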
   15089   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
   15090   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   15091   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
   15092   return DAG.getMergeValues(RetOps, dl);
   15093 }
   15094 
   15095 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   15096                                SDValue Src, SDValue Mask, SDValue Base,
   15097                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
   15098   SDLoc dl(Op);
   15099   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
   15100   assert(C && "Invalid scale type");
   15101   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
   15102   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
   15103   SDValue Segment = DAG.getRegister(0, MVT::i32);
   15104   EVT MaskVT = MVT::getVectorVT(MVT::i1,
   15105                              Index.getSimpleValueType().getVectorNumElements());
   15106   SDValue MaskInReg;
   15107   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   15108   if (MaskC)
   15109     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
   15110   else
   15111     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
   15112   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   15113   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
   15114   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   15115   return SDValue(Res, 1);
   15116 }
   15117 
   15118 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   15119                                SDValue Mask, SDValue Base, SDValue Index,
   15120                                SDValue ScaleOp, SDValue Chain) {
   15121   SDLoc dl(Op);
   15122   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
   15123   assert(C && "Invalid scale type");
   15124   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
   15125   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
   15126   SDValue Segment = DAG.getRegister(0, MVT::i32);
   15127   EVT MaskVT =
   15128     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
   15129   SDValue MaskInReg;
   15130   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
   15131   if (MaskC)
   15132     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
   15133   else
   15134     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
   15136   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
   15137   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
   15138   return SDValue(Res, 0);
   15139 }
   15140 
   15141 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
   15142 // read performance monitor counters (x86_rdpmc).
   15143 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
   15144                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
   15145                               SmallVectorImpl<SDValue> &Results) {
   15146   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   15147   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   15148   SDValue LO, HI;
   15149 
   15150   // The ECX register is used to select the index of the performance counter
   15151   // to read.
   15152   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
   15153                                    N->getOperand(2));
   15154   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
   15155 
   15156   // Reads the content of a 64-bit performance counter and returns it in the
   15157   // registers EDX:EAX.
   15158   if (Subtarget->is64Bit()) {
   15159     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   15160     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   15161                             LO.getValue(2));
   15162   } else {
   15163     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   15164     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   15165                             LO.getValue(2));
   15166   }
   15167   Chain = HI.getValue(1);
   15168 
   15169   if (Subtarget->is64Bit()) {
   15170     // The EAX register is loaded with the low-order 32 bits. The EDX register
   15171     // is loaded with the supported high-order bits of the counter.
   15172     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   15173                               DAG.getConstant(32, MVT::i8));
   15174     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   15175     Results.push_back(Chain);
   15176     return;
   15177   }
   15178 
   15179   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   15180   SDValue Ops[] = { LO, HI };
   15181   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   15182   Results.push_back(Pair);
   15183   Results.push_back(Chain);
   15184 }
   15185 
   15186 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
   15187 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
   15188 // also used to custom lower READCYCLECOUNTER nodes.
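          // On both 32-bit and 64-bit targets the 64-bit result is reassembled as
          // ((uint64_t)EDX << 32) | EAX, using SHL/OR on 64-bit targets and BUILD_PAIR
          // on 32-bit targets.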
   15189 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
   15190                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
   15191                               SmallVectorImpl<SDValue> &Results) {
   15192   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   15193   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
   15194   SDValue LO, HI;
   15195 
   15196   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
   15197   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
   15198   // and the EAX register is loaded with the low-order 32 bits.
   15199   if (Subtarget->is64Bit()) {
   15200     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   15201     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   15202                             LO.getValue(2));
   15203   } else {
   15204     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   15205     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   15206                             LO.getValue(2));
   15207   }
   15208   SDValue Chain = HI.getValue(1);
   15209 
   15210   if (Opcode == X86ISD::RDTSCP_DAG) {
   15211     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   15212 
    15213     // The RDTSCP instruction loads the IA32_TSC_AUX MSR (address C000_0103H) into
   15214     // the ECX register. Add 'ecx' explicitly to the chain.
   15215     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
   15216                                      HI.getValue(2));
    15217     // Explicitly store the content of ECX at the location passed as input
   15218     // to the 'rdtscp' intrinsic.
   15219     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
   15220                          MachinePointerInfo(), false, false, 0);
   15221   }
   15222 
   15223   if (Subtarget->is64Bit()) {
   15224     // The EDX register is loaded with the high-order 32 bits of the MSR, and
   15225     // the EAX register is loaded with the low-order 32 bits.
   15226     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   15227                               DAG.getConstant(32, MVT::i8));
   15228     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   15229     Results.push_back(Chain);
   15230     return;
   15231   }
   15232 
   15233   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   15234   SDValue Ops[] = { LO, HI };
   15235   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   15236   Results.push_back(Pair);
   15237   Results.push_back(Chain);
   15238 }
   15239 
   15240 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
   15241                                      SelectionDAG &DAG) {
   15242   SmallVector<SDValue, 2> Results;
   15243   SDLoc DL(Op);
   15244   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
   15245                           Results);
   15246   return DAG.getMergeValues(Results, DL);
   15247 }
   15248 
   15249 
   15250 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
   15251                                       SelectionDAG &DAG) {
   15252   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   15253 
   15254   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
   15255   if (!IntrData)
   15256     return SDValue();
   15257 
   15258   SDLoc dl(Op);
   15259   switch(IntrData->Type) {
   15260   default:
   15261     llvm_unreachable("Unknown Intrinsic Type");
   15262     break;
   15263   case RDSEED:
   15264   case RDRAND: {
   15265     // Emit the node with the right value type.
   15266     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
   15267     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
   15268 
   15269     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    15270     // Otherwise return the value from Rand, which is always 0, cast to i32.
   15271     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
   15272                       DAG.getConstant(1, Op->getValueType(1)),
   15273                       DAG.getConstant(X86::COND_B, MVT::i32),
   15274                       SDValue(Result.getNode(), 1) };
   15275     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
   15276                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
   15277                                   Ops);
   15278 
   15279     // Return { result, isValid, chain }.
   15280     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
   15281                        SDValue(Result.getNode(), 2));
   15282   }
   15283   case GATHER: {
    15284   // gather(v1, mask, index, base, scale);
   15285     SDValue Chain = Op.getOperand(0);
   15286     SDValue Src   = Op.getOperand(2);
   15287     SDValue Base  = Op.getOperand(3);
   15288     SDValue Index = Op.getOperand(4);
   15289     SDValue Mask  = Op.getOperand(5);
   15290     SDValue Scale = Op.getOperand(6);
   15291     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
   15292                           Subtarget);
   15293   }
   15294   case SCATTER: {
    15295   // scatter(base, mask, index, v1, scale);
   15296     SDValue Chain = Op.getOperand(0);
   15297     SDValue Base  = Op.getOperand(2);
   15298     SDValue Mask  = Op.getOperand(3);
   15299     SDValue Index = Op.getOperand(4);
   15300     SDValue Src   = Op.getOperand(5);
   15301     SDValue Scale = Op.getOperand(6);
   15302     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
   15303   }
   15304   case PREFETCH: {
   15305     SDValue Hint = Op.getOperand(6);
   15306     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
   15307     assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
   15308     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
   15309     SDValue Chain = Op.getOperand(0);
   15310     SDValue Mask  = Op.getOperand(2);
   15311     SDValue Index = Op.getOperand(3);
   15312     SDValue Base  = Op.getOperand(4);
   15313     SDValue Scale = Op.getOperand(5);
   15314     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
   15315   }
   15316   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
   15317   case RDTSC: {
   15318     SmallVector<SDValue, 2> Results;
   15319     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
   15320     return DAG.getMergeValues(Results, dl);
   15321   }
   15322   // Read Performance Monitoring Counters.
   15323   case RDPMC: {
   15324     SmallVector<SDValue, 2> Results;
   15325     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
   15326     return DAG.getMergeValues(Results, dl);
   15327   }
   15328   // XTEST intrinsics.
   15329   case XTEST: {
   15330     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
   15331     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
   15332     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   15333                                 DAG.getConstant(X86::COND_NE, MVT::i8),
   15334                                 InTrans);
   15335     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
   15336     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
   15337                        Ret, SDValue(InTrans.getNode(), 1));
   15338   }
   15339   // ADC/ADCX/SBB
   15340   case ADX: {
   15341     SmallVector<SDValue, 2> Results;
   15342     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
   15343     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
   15344     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
   15345                                 DAG.getConstant(-1, MVT::i8));
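              // Adding -1 (0xFF) to the i8 carry-in value materializes CF for the node
              // below: the 8-bit addition overflows, setting CF, exactly when the
              // carry-in byte is non-zero.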
   15346     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
   15347                               Op.getOperand(4), GenCF.getValue(1));
   15348     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
   15349                                  Op.getOperand(5), MachinePointerInfo(),
   15350                                  false, false, 0);
   15351     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   15352                                 DAG.getConstant(X86::COND_B, MVT::i8),
   15353                                 Res.getValue(1));
   15354     Results.push_back(SetCC);
   15355     Results.push_back(Store);
   15356     return DAG.getMergeValues(Results, dl);
   15357   }
   15358   case COMPRESS_TO_MEM: {
   15359     SDLoc dl(Op);
   15360     SDValue Mask = Op.getOperand(4);
   15361     SDValue DataToCompress = Op.getOperand(3);
   15362     SDValue Addr = Op.getOperand(2);
   15363     SDValue Chain = Op.getOperand(0);
   15364 
   15365     if (isAllOnes(Mask)) // return just a store
   15366       return DAG.getStore(Chain, dl, DataToCompress, Addr,
   15367                           MachinePointerInfo(), false, false, 0);
   15368 
   15369     EVT VT = DataToCompress.getValueType();
   15370     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   15371                                   VT.getVectorNumElements());
   15372     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   15373                                      Mask.getValueType().getSizeInBits());
   15374     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   15375                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
   15376                                 DAG.getIntPtrConstant(0));
   15377 
   15378     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
   15379                                       DataToCompress, DAG.getUNDEF(VT));
   15380     return DAG.getStore(Chain, dl, Compressed, Addr,
   15381                         MachinePointerInfo(), false, false, 0);
   15382   }
   15383   case EXPAND_FROM_MEM: {
   15384     SDLoc dl(Op);
   15385     SDValue Mask = Op.getOperand(4);
    15386     SDValue PassThru = Op.getOperand(3);
   15387     SDValue Addr = Op.getOperand(2);
   15388     SDValue Chain = Op.getOperand(0);
   15389     EVT VT = Op.getValueType();
   15390 
   15391     if (isAllOnes(Mask)) // return just a load
   15392       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
   15393                          false, 0);
   15394     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   15395                                   VT.getVectorNumElements());
   15396     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   15397                                      Mask.getValueType().getSizeInBits());
   15398     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   15399                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
   15400                                 DAG.getIntPtrConstant(0));
   15401 
   15402     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
   15403                                    false, false, false, 0);
   15404 
   15405     SDValue Results[] = {
    15406         DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PassThru),
   15407         Chain};
   15408     return DAG.getMergeValues(Results, dl);
   15409   }
   15410   }
   15411 }
   15412 
   15413 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
   15414                                            SelectionDAG &DAG) const {
   15415   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   15416   MFI->setReturnAddressIsTaken(true);
   15417 
   15418   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
   15419     return SDValue();
   15420 
   15421   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   15422   SDLoc dl(Op);
   15423   EVT PtrVT = getPointerTy();
   15424 
   15425   if (Depth > 0) {
   15426     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
   15427     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   15428     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
   15429     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   15430                        DAG.getNode(ISD::ADD, dl, PtrVT,
   15431                                    FrameAddr, Offset),
   15432                        MachinePointerInfo(), false, false, false, 0);
   15433   }
   15434 
   15435   // Just load the return address.
   15436   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
   15437   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   15438                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
   15439 }
   15440 
   15441 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   15442   MachineFunction &MF = DAG.getMachineFunction();
   15443   MachineFrameInfo *MFI = MF.getFrameInfo();
   15444   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   15445   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   15446   EVT VT = Op.getValueType();
   15447 
   15448   MFI->setFrameAddressIsTaken(true);
   15449 
   15450   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
   15451     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
   15452     // is not possible to crawl up the stack without looking at the unwind codes
   15453     // simultaneously.
   15454     int FrameAddrIndex = FuncInfo->getFAIndex();
   15455     if (!FrameAddrIndex) {
   15456       // Set up a frame object for the return address.
   15457       unsigned SlotSize = RegInfo->getSlotSize();
   15458       FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
   15459           SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
   15460       FuncInfo->setFAIndex(FrameAddrIndex);
   15461     }
   15462     return DAG.getFrameIndex(FrameAddrIndex, VT);
   15463   }
   15464 
   15465   unsigned FrameReg =
   15466       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
   15467   SDLoc dl(Op);  // FIXME probably not meaningful
   15468   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   15469   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
   15470           (FrameReg == X86::EBP && VT == MVT::i32)) &&
   15471          "Invalid Frame Register!");
   15472   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
   15473   while (Depth--)
   15474     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
   15475                             MachinePointerInfo(),
   15476                             false, false, false, 0);
   15477   return FrameAddr;
   15478 }
   15479 
   15480 // FIXME? Maybe this could be a TableGen attribute on some registers and
   15481 // this table could be generated automatically from RegInfo.
   15482 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
   15483                                               EVT VT) const {
   15484   unsigned Reg = StringSwitch<unsigned>(RegName)
   15485                        .Case("esp", X86::ESP)
   15486                        .Case("rsp", X86::RSP)
   15487                        .Default(0);
   15488   if (Reg)
   15489     return Reg;
   15490   report_fatal_error("Invalid register name global variable");
   15491 }
   15492 
   15493 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
   15494                                                      SelectionDAG &DAG) const {
   15495   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   15496   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
   15497 }
   15498 
   15499 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   15500   SDValue Chain     = Op.getOperand(0);
   15501   SDValue Offset    = Op.getOperand(1);
   15502   SDValue Handler   = Op.getOperand(2);
   15503   SDLoc dl      (Op);
   15504 
   15505   EVT PtrVT = getPointerTy();
   15506   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   15507   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   15508   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
   15509           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
   15510          "Invalid Frame Register!");
   15511   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
   15512   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
   15513 
   15514   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
   15515                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
   15516   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
   15517   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
   15518                        false, false, 0);
   15519   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
   15520 
   15521   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
   15522                      DAG.getRegister(StoreAddrReg, PtrVT));
   15523 }
   15524 
   15525 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
   15526                                                SelectionDAG &DAG) const {
   15527   SDLoc DL(Op);
   15528   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
   15529                      DAG.getVTList(MVT::i32, MVT::Other),
   15530                      Op.getOperand(0), Op.getOperand(1));
   15531 }
   15532 
   15533 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
   15534                                                 SelectionDAG &DAG) const {
   15535   SDLoc DL(Op);
   15536   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
   15537                      Op.getOperand(0), Op.getOperand(1));
   15538 }
   15539 
   15540 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
   15541   return Op.getOperand(0);
   15542 }
   15543 
   15544 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   15545                                                 SelectionDAG &DAG) const {
   15546   SDValue Root = Op.getOperand(0);
   15547   SDValue Trmp = Op.getOperand(1); // trampoline
   15548   SDValue FPtr = Op.getOperand(2); // nested function
   15549   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
   15550   SDLoc dl (Op);
   15551 
   15552   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   15553   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   15554 
   15555   if (Subtarget->is64Bit()) {
   15556     SDValue OutChains[6];
   15557 
   15558     // Large code-model.
   15559     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
   15560     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
   15561 
   15562     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
   15563     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
   15564 
   15565     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
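              // Taken together, the stores below lay the trampoline out roughly as
              // (byte offsets from Trmp; the i16 stores are little-endian):
              //    0: 49 BB <FPtr:imm64>   movabsq $FPtr, %r11
              //   10: 49 BA <Nest:imm64>   movabsq $Nest, %r10
              //   20: 49 FF E3             jmpq   *%r11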
   15566 
   15567     // Load the pointer to the nested function into R11.
   15568     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
   15569     SDValue Addr = Trmp;
   15570     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
   15571                                 Addr, MachinePointerInfo(TrmpAddr),
   15572                                 false, false, 0);
   15573 
   15574     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   15575                        DAG.getConstant(2, MVT::i64));
   15576     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
   15577                                 MachinePointerInfo(TrmpAddr, 2),
   15578                                 false, false, 2);
   15579 
   15580     // Load the 'nest' parameter value into R10.
   15581     // R10 is specified in X86CallingConv.td
   15582     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
   15583     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   15584                        DAG.getConstant(10, MVT::i64));
   15585     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
   15586                                 Addr, MachinePointerInfo(TrmpAddr, 10),
   15587                                 false, false, 0);
   15588 
   15589     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   15590                        DAG.getConstant(12, MVT::i64));
   15591     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
   15592                                 MachinePointerInfo(TrmpAddr, 12),
   15593                                 false, false, 2);
   15594 
   15595     // Jump to the nested function.
   15596     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
   15597     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   15598                        DAG.getConstant(20, MVT::i64));
   15599     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
   15600                                 Addr, MachinePointerInfo(TrmpAddr, 20),
   15601                                 false, false, 0);
   15602 
   15603     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
   15604     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   15605                        DAG.getConstant(22, MVT::i64));
   15606     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
   15607                                 MachinePointerInfo(TrmpAddr, 22),
   15608                                 false, false, 0);
   15609 
   15610     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   15611   } else {
   15612     const Function *Func =
   15613       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
   15614     CallingConv::ID CC = Func->getCallingConv();
   15615     unsigned NestReg;
   15616 
   15617     switch (CC) {
   15618     default:
   15619       llvm_unreachable("Unsupported calling convention");
   15620     case CallingConv::C:
   15621     case CallingConv::X86_StdCall: {
   15622       // Pass 'nest' parameter in ECX.
   15623       // Must be kept in sync with X86CallingConv.td
   15624       NestReg = X86::ECX;
   15625 
   15626       // Check that ECX wasn't needed by an 'inreg' parameter.
   15627       FunctionType *FTy = Func->getFunctionType();
   15628       const AttributeSet &Attrs = Func->getAttributes();
   15629 
   15630       if (!Attrs.isEmpty() && !Func->isVarArg()) {
   15631         unsigned InRegCount = 0;
   15632         unsigned Idx = 1;
   15633 
   15634         for (FunctionType::param_iterator I = FTy->param_begin(),
   15635              E = FTy->param_end(); I != E; ++I, ++Idx)
   15636           if (Attrs.hasAttribute(Idx, Attribute::InReg))
   15637             // FIXME: should only count parameters that are lowered to integers.
   15638             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
   15639 
   15640         if (InRegCount > 2) {
   15641           report_fatal_error("Nest register in use - reduce number of inreg"
   15642                              " parameters!");
   15643         }
   15644       }
   15645       break;
   15646     }
   15647     case CallingConv::X86_FastCall:
   15648     case CallingConv::X86_ThisCall:
   15649     case CallingConv::Fast:
   15650       // Pass 'nest' parameter in EAX.
   15651       // Must be kept in sync with X86CallingConv.td
   15652       NestReg = X86::EAX;
   15653       break;
   15654     }
   15655 
   15656     SDValue OutChains[4];
   15657     SDValue Addr, Disp;
   15658 
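              // The four stores below lay the 32-bit trampoline out roughly as
              // (byte offsets from Trmp):
              //   0: B8+r <Nest:imm32>   movl $Nest, %eax/%ecx (0xB8 for EAX, 0xB9 for ECX)
              //   5: E9 <rel32>          jmp  FPtr, with rel32 == FPtr - (Trmp + 10)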
   15659     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   15660                        DAG.getConstant(10, MVT::i32));
   15661     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
   15662 
   15663     // This is storing the opcode for MOV32ri.
   15664     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
   15665     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
   15666     OutChains[0] = DAG.getStore(Root, dl,
   15667                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
   15668                                 Trmp, MachinePointerInfo(TrmpAddr),
   15669                                 false, false, 0);
   15670 
   15671     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   15672                        DAG.getConstant(1, MVT::i32));
   15673     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
   15674                                 MachinePointerInfo(TrmpAddr, 1),
   15675                                 false, false, 1);
   15676 
   15677     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
   15678     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   15679                        DAG.getConstant(5, MVT::i32));
   15680     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
   15681                                 MachinePointerInfo(TrmpAddr, 5),
   15682                                 false, false, 1);
   15683 
   15684     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   15685                        DAG.getConstant(6, MVT::i32));
   15686     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
   15687                                 MachinePointerInfo(TrmpAddr, 6),
   15688                                 false, false, 1);
   15689 
   15690     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   15691   }
   15692 }
   15693 
   15694 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   15695                                             SelectionDAG &DAG) const {
   15696   /*
   15697    The rounding mode is in bits 11:10 of FPSR, and has the following
   15698    settings:
   15699      00 Round to nearest
   15700      01 Round to -inf
   15701      10 Round to +inf
   15702      11 Round to 0
   15703 
   15704   FLT_ROUNDS, on the other hand, expects the following:
   15705     -1 Undefined
   15706      0 Round to 0
   15707      1 Round to nearest
   15708      2 Round to +inf
   15709      3 Round to -inf
   15710 
   15711   To perform the conversion, we do:
   15712     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
   15713   */
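            // As a quick check of the formula: FPSR bits 11:10 == 01 (round to -inf)
            // gives ((0x000 >> 11) | (0x400 >> 9)) + 1 == 3, the FLT_ROUNDS encoding of
            // round to -inf, and bits 11:10 == 11 (round to 0) gives ((1 | 2) + 1) & 3
            // == 0, the FLT_ROUNDS encoding of round to 0.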
   15714 
   15715   MachineFunction &MF = DAG.getMachineFunction();
   15716   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   15717   unsigned StackAlignment = TFI.getStackAlignment();
   15718   MVT VT = Op.getSimpleValueType();
   15719   SDLoc DL(Op);
   15720 
   15721   // Save FP Control Word to stack slot
   15722   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
   15723   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
   15724 
   15725   MachineMemOperand *MMO =
   15726    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
   15727                            MachineMemOperand::MOStore, 2, 2);
   15728 
   15729   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
   15730   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
   15731                                           DAG.getVTList(MVT::Other),
   15732                                           Ops, MVT::i16, MMO);
   15733 
   15734   // Load FP Control Word from stack slot
   15735   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
   15736                             MachinePointerInfo(), false, false, false, 0);
   15737 
   15738   // Transform as necessary
   15739   SDValue CWD1 =
   15740     DAG.getNode(ISD::SRL, DL, MVT::i16,
   15741                 DAG.getNode(ISD::AND, DL, MVT::i16,
   15742                             CWD, DAG.getConstant(0x800, MVT::i16)),
   15743                 DAG.getConstant(11, MVT::i8));
   15744   SDValue CWD2 =
   15745     DAG.getNode(ISD::SRL, DL, MVT::i16,
   15746                 DAG.getNode(ISD::AND, DL, MVT::i16,
   15747                             CWD, DAG.getConstant(0x400, MVT::i16)),
   15748                 DAG.getConstant(9, MVT::i8));
   15749 
   15750   SDValue RetVal =
   15751     DAG.getNode(ISD::AND, DL, MVT::i16,
   15752                 DAG.getNode(ISD::ADD, DL, MVT::i16,
   15753                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
   15754                             DAG.getConstant(1, MVT::i16)),
   15755                 DAG.getConstant(3, MVT::i16));
   15756 
   15757   return DAG.getNode((VT.getSizeInBits() < 16 ?
   15758                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
   15759 }
   15760 
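          // LowerCTLZ - Lower CTLZ via BSR. For example, for an i32 input of 0x10, BSR
          // produces 4 and 4 ^ 31 == 27 == ctlz(0x10); for a zero input the CMOV below
          // selects 2*NumBits-1 == 63 so that the final XOR yields NumBits == 32.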
   15761 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
   15762   MVT VT = Op.getSimpleValueType();
   15763   EVT OpVT = VT;
   15764   unsigned NumBits = VT.getSizeInBits();
   15765   SDLoc dl(Op);
   15766 
   15767   Op = Op.getOperand(0);
   15768   if (VT == MVT::i8) {
    15769     // Zero extend to i32 since there is no i8 bsr.
   15770     OpVT = MVT::i32;
   15771     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   15772   }
   15773 
   15774   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
   15775   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   15776   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   15777 
    15778   // If the source is zero (i.e. BSR sets ZF), select 2*NumBits-1 here so that
            // the XOR with NumBits-1 below produces NumBits.
   15779   SDValue Ops[] = {
   15780     Op,
   15781     DAG.getConstant(NumBits+NumBits-1, OpVT),
   15782     DAG.getConstant(X86::COND_E, MVT::i8),
   15783     Op.getValue(1)
   15784   };
   15785   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
   15786 
   15787   // Finally xor with NumBits-1.
   15788   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
   15789 
   15790   if (VT == MVT::i8)
   15791     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   15792   return Op;
   15793 }
   15794 
   15795 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
   15796   MVT VT = Op.getSimpleValueType();
   15797   EVT OpVT = VT;
   15798   unsigned NumBits = VT.getSizeInBits();
   15799   SDLoc dl(Op);
   15800 
   15801   Op = Op.getOperand(0);
   15802   if (VT == MVT::i8) {
    15803     // Zero extend to i32 since there is no i8 bsr.
   15804     OpVT = MVT::i32;
   15805     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   15806   }
   15807 
   15808   // Issue a bsr (scan bits in reverse).
   15809   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   15810   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   15811 
   15812   // And xor with NumBits-1.
   15813   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
   15814 
   15815   if (VT == MVT::i8)
   15816     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   15817   return Op;
   15818 }
   15819 
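          // LowerCTTZ - Lower CTTZ via BSF. For example, for an i32 input of 0x18, BSF
          // produces 3 == cttz(0x18); for a zero input, BSF sets ZF and the CMOV below
          // selects NumBits == 32 instead.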
   15820 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
   15821   MVT VT = Op.getSimpleValueType();
   15822   unsigned NumBits = VT.getSizeInBits();
   15823   SDLoc dl(Op);
   15824   Op = Op.getOperand(0);
   15825 
   15826   // Issue a bsf (scan bits forward) which also sets EFLAGS.
   15827   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   15828   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
   15829 
    15830   // If the source is zero (i.e. BSF sets ZF), return NumBits.
   15831   SDValue Ops[] = {
   15832     Op,
   15833     DAG.getConstant(NumBits, VT),
   15834     DAG.getConstant(X86::COND_E, MVT::i8),
   15835     Op.getValue(1)
   15836   };
   15837   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
   15838 }
   15839 
   15840 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
   15841 // ones, and then concatenate the result back.
   15842 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
   15843   MVT VT = Op.getSimpleValueType();
   15844 
   15845   assert(VT.is256BitVector() && VT.isInteger() &&
   15846          "Unsupported value type for operation");
   15847 
   15848   unsigned NumElems = VT.getVectorNumElements();
   15849   SDLoc dl(Op);
   15850 
   15851   // Extract the LHS vectors
   15852   SDValue LHS = Op.getOperand(0);
   15853   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
   15854   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
   15855 
   15856   // Extract the RHS vectors
   15857   SDValue RHS = Op.getOperand(1);
   15858   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
   15859   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
   15860 
   15861   MVT EltVT = VT.getVectorElementType();
   15862   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   15863 
   15864   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   15865                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
   15866                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
   15867 }
   15868 
   15869 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
   15870   assert(Op.getSimpleValueType().is256BitVector() &&
   15871          Op.getSimpleValueType().isInteger() &&
   15872          "Only handle AVX 256-bit vector integer operation");
   15873   return Lower256IntArith(Op, DAG);
   15874 }
   15875 
   15876 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
   15877   assert(Op.getSimpleValueType().is256BitVector() &&
   15878          Op.getSimpleValueType().isInteger() &&
   15879          "Only handle AVX 256-bit vector integer operation");
   15880   return Lower256IntArith(Op, DAG);
   15881 }
   15882 
   15883 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   15884                         SelectionDAG &DAG) {
   15885   SDLoc dl(Op);
   15886   MVT VT = Op.getSimpleValueType();
   15887 
   15888   // Decompose 256-bit ops into smaller 128-bit ops.
   15889   if (VT.is256BitVector() && !Subtarget->hasInt256())
   15890     return Lower256IntArith(Op, DAG);
   15891 
   15892   SDValue A = Op.getOperand(0);
   15893   SDValue B = Op.getOperand(1);
   15894 
   15895   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
   15896   if (VT == MVT::v4i32) {
   15897     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
   15898            "Should not custom lower when pmuldq is available!");
   15899 
   15900     // Extract the odd parts.
   15901     static const int UnpackMask[] = { 1, -1, 3, -1 };
   15902     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
   15903     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
   15904 
   15905     // Multiply the even parts.
   15906     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
   15907     // Now multiply odd parts.
   15908     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
   15909 
   15910     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
   15911     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
   15912 
   15913     // Merge the two vectors back together with a shuffle. This expands into 2
   15914     // shuffles.
   15915     static const int ShufMask[] = { 0, 4, 2, 6 };
   15916     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
   15917   }
   15918 
   15919   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
   15920          "Only know how to lower V2I64/V4I64/V8I64 multiply");
   15921 
   15922   //  Ahi = psrlqi(a, 32);
   15923   //  Bhi = psrlqi(b, 32);
   15924   //
   15925   //  AloBlo = pmuludq(a, b);
   15926   //  AloBhi = pmuludq(a, Bhi);
   15927   //  AhiBlo = pmuludq(Ahi, b);
   15928 
   15929   //  AloBhi = psllqi(AloBhi, 32);
   15930   //  AhiBlo = psllqi(AhiBlo, 32);
   15931   //  return AloBlo + AloBhi + AhiBlo;
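           //
           //  Correctness sketch: writing A = Alo + (Ahi << 32) and B = Blo + (Bhi << 32),
           //  A*B mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32); the Ahi*Bhi term is
           //  shifted entirely out of the low 64 bits and can be dropped.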
   15932 
   15933   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
   15934   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
   15935 
   15936   // Bit cast to 32-bit vectors for MULUDQ
   15937   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
   15938                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
   15939   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
   15940   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
   15941   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
   15942   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
   15943 
   15944   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
   15945   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
   15946   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
   15947 
   15948   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
   15949   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
   15950 
   15951   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
   15952   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
   15953 }
   15954 
   15955 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
   15956   assert(Subtarget->isTargetWin64() && "Unexpected target");
   15957   EVT VT = Op.getValueType();
   15958   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
   15959          "Unexpected return type for lowering");
   15960 
   15961   RTLIB::Libcall LC;
   15962   bool isSigned;
   15963   switch (Op->getOpcode()) {
   15964   default: llvm_unreachable("Unexpected request for libcall!");
   15965   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
   15966   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
   15967   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
   15968   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
   15969   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
   15970   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
   15971   }
   15972 
   15973   SDLoc dl(Op);
   15974   SDValue InChain = DAG.getEntryNode();
   15975 
   15976   TargetLowering::ArgListTy Args;
   15977   TargetLowering::ArgListEntry Entry;
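           // On Win64, i128 arguments are passed indirectly: spill each operand to a
           // 16-byte aligned stack slot and pass a pointer to that slot instead.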
   15978   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
   15979     EVT ArgVT = Op->getOperand(i).getValueType();
   15980     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
   15981            "Unexpected argument type for lowering");
   15982     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
   15983     Entry.Node = StackPtr;
   15984     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
   15985                            false, false, 16);
   15986     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   15987     Entry.Ty = PointerType::get(ArgTy,0);
   15988     Entry.isSExt = false;
   15989     Entry.isZExt = false;
   15990     Args.push_back(Entry);
   15991   }
   15992 
   15993   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
   15994                                          getPointerTy());
   15995 
   15996   TargetLowering::CallLoweringInfo CLI(DAG);
   15997   CLI.setDebugLoc(dl).setChain(InChain)
   15998     .setCallee(getLibcallCallingConv(LC),
   15999                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
   16000                Callee, std::move(Args), 0)
   16001     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
   16002 
   16003   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
   16004   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
   16005 }
   16006 
   16007 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
   16008                              SelectionDAG &DAG) {
   16009   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
   16010   EVT VT = Op0.getValueType();
   16011   SDLoc dl(Op);
   16012 
   16013   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
   16014          (VT == MVT::v8i32 && Subtarget->hasInt256()));
   16015 
   16016   // PMULxD operations multiply each even value (starting at 0) of LHS with
   16017   // the related value of RHS and produce a widened result.
   16018   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
   16019   // => <2 x i64> <ae|cg>
   16020   //
   16021   // In other words, to obtain all the results, we need to perform two PMULxD:
   16022   // 1. one with the even values.
   16023   // 2. one with the odd values.
   16024   // To achieve #2, we need to place the odd values at even positions.
   16025   //
   16026   // Place the odd value at an even position (basically, shift all values 1
   16027   // step to the left):
   16028   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
   16029   // <a|b|c|d> => <b|undef|d|undef>
   16030   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
   16031   // <e|f|g|h> => <f|undef|h|undef>
   16032   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
   16033 
   16034   // Emit two multiplies, one for the even lanes and one for the odd lanes
   16035   // (which were shuffled into even positions above).
   16036   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
   16037   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
   16038   unsigned Opcode =
   16039       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
   16040   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
   16041   // => <2 x i64> <ae|cg>
   16042   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
   16043                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
   16044   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
   16045   // => <2 x i64> <bf|dh>
   16046   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
   16047                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
   16048 
   16049   // Shuffle it back into the right order.
   16050   SDValue Highs, Lows;
   16051   if (VT == MVT::v8i32) {
   16052     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
   16053     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
   16054     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
   16055     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   16056   } else {
   16057     const int HighMask[] = {1, 5, 3, 7};
   16058     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
   16059     const int LowMask[] = {0, 4, 2, 6};
   16060     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   16061   }
   16062 
   16063   // If we have a signed multiply but no PMULDQ, fix up the high parts of an
   16064   // unsigned multiply.
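           // The fixup uses the identity
           //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0),
           // where (a >> 31) & b selects b in exactly the lanes where a is negative.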
   16065   if (IsSigned && !Subtarget->hasSSE41()) {
   16066     SDValue ShAmt =
   16067         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
   16068     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
   16069                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
   16070     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
   16071                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
   16072 
   16073     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
   16074     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
   16075   }
   16076 
   16077   // The first result of MUL_LOHI is actually the low value, followed by the
   16078   // high value.
   16079   SDValue Ops[] = {Lows, Highs};
   16080   return DAG.getMergeValues(Ops, dl);
   16081 }
   16082 
   16083 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   16084                                          const X86Subtarget *Subtarget) {
   16085   MVT VT = Op.getSimpleValueType();
   16086   SDLoc dl(Op);
   16087   SDValue R = Op.getOperand(0);
   16088   SDValue Amt = Op.getOperand(1);
   16089 
   16090   // Optimize shl/srl/sra with constant shift amount.
   16091   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
   16092     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
   16093       uint64_t ShiftAmt = ShiftConst->getZExtValue();
   16094 
   16095       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
   16096           (Subtarget->hasInt256() &&
   16097            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
   16098           (Subtarget->hasAVX512() &&
   16099            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
   16100         if (Op.getOpcode() == ISD::SHL)
   16101           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
   16102                                             DAG);
   16103         if (Op.getOpcode() == ISD::SRL)
   16104           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
   16105                                             DAG);
   16106         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
   16107           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
   16108                                             DAG);
   16109       }
   16110 
   16111       if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
   16112         unsigned NumElts = VT.getVectorNumElements();
   16113         MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
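                 // There is no byte-granularity vector shift on x86, so emulate the
                 // v16i8/v32i8 shift by shifting the whole register as i16 lanes and
                 // then masking off the bits that crossed a byte boundary.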
   16114 
   16115         if (Op.getOpcode() == ISD::SHL) {
   16116           // Make a large shift.
   16117           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
   16118                                                    R, ShiftAmt, DAG);
   16119           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
   16120           // Zero out the rightmost bits.
   16121           SmallVector<SDValue, 32> V(
   16122               NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8));
   16123           return DAG.getNode(ISD::AND, dl, VT, SHL,
   16124                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
   16125         }
   16126         if (Op.getOpcode() == ISD::SRL) {
   16127           // Make a large shift.
   16128           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
   16129                                                    R, ShiftAmt, DAG);
   16130           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
   16131           // Zero out the leftmost bits.
   16132           SmallVector<SDValue, 32> V(
   16133               NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8));
   16134           return DAG.getNode(ISD::AND, dl, VT, SRL,
   16135                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
   16136         }
   16137         if (Op.getOpcode() == ISD::SRA) {
   16138           if (ShiftAmt == 7) {
   16139             // R s>> 7  ===  R s< 0
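                     // PCMPGT(0, R) yields all-ones exactly in the bytes whose sign bit
                     // is set, which is the same as shifting each byte right
                     // arithmetically by 7.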
   16140             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   16141             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
   16142           }
   16143 
   16144           // R s>> a === ((R u>> a) ^ m) - m
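                   // m = 0x80 >> a is the position of the sign bit after the logical
                   // shift; XORing with m and then subtracting m propagates that bit
                   // upward, sign-extending the shifted value.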
   16145           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   16146           SmallVector<SDValue, 32> V(NumElts,
   16147                                      DAG.getConstant(128 >> ShiftAmt, MVT::i8));
   16148           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
   16149           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
   16150           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
   16151           return Res;
   16152         }
   16153         llvm_unreachable("Unknown shift opcode.");
   16154       }
   16155     }
   16156   }
   16157 
   16158   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   16159   if (!Subtarget->is64Bit() &&
   16160       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
   16161       Amt.getOpcode() == ISD::BITCAST &&
   16162       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   16163     Amt = Amt.getOperand(0);
   16164     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
   16165                      VT.getVectorNumElements();
   16166     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
   16167     uint64_t ShiftAmt = 0;
   16168     for (unsigned i = 0; i != Ratio; ++i) {
   16169       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
   16170       if (!C)
   16171         return SDValue();
   16172       // 6 == Log2(64)
   16173       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
   16174     }
   16175     // Check remaining shift amounts.
   16176     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
   16177       uint64_t ShAmt = 0;
   16178       for (unsigned j = 0; j != Ratio; ++j) {
   16179         ConstantSDNode *C =
   16180           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
   16181         if (!C)
   16182           return SDValue();
   16183         // 6 == Log2(64)
   16184         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
   16185       }
   16186       if (ShAmt != ShiftAmt)
   16187         return SDValue();
   16188     }
   16189     switch (Op.getOpcode()) {
   16190     default:
   16191       llvm_unreachable("Unknown shift opcode!");
   16192     case ISD::SHL:
   16193       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
   16194                                         DAG);
   16195     case ISD::SRL:
   16196       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
   16197                                         DAG);
   16198     case ISD::SRA:
   16199       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
   16200                                         DAG);
   16201     }
   16202   }
   16203 
   16204   return SDValue();
   16205 }
   16206 
   16207 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   16208                                         const X86Subtarget* Subtarget) {
   16209   MVT VT = Op.getSimpleValueType();
   16210   SDLoc dl(Op);
   16211   SDValue R = Op.getOperand(0);
   16212   SDValue Amt = Op.getOperand(1);
   16213 
   16214   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
   16215       VT == MVT::v4i32 || VT == MVT::v8i16 ||
   16216       (Subtarget->hasInt256() &&
   16217        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
   16218         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
   16219        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
   16220     SDValue BaseShAmt;
   16221     EVT EltVT = VT.getVectorElementType();
   16222 
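             // For these types x86 has vector shifts that apply a single scalar count
             // to every lane, so if the shift amount vector is a splat we can use them.
             // Try to recover that splat value as BaseShAmt.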
   16223     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
   16224       // Check if this build_vector node is doing a splat.
   16225       // If so, then set BaseShAmt equal to the splat value.
   16226       BaseShAmt = BV->getSplatValue();
   16227       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
   16228         BaseShAmt = SDValue();
   16229     } else {
   16230       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
   16231         Amt = Amt.getOperand(0);
   16232 
   16233       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
   16234       if (SVN && SVN->isSplat()) {
   16235         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
   16236         SDValue InVec = Amt.getOperand(0);
   16237         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   16238           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
   16239                  "Unexpected shuffle index found!");
   16240           BaseShAmt = InVec.getOperand(SplatIdx);
   16241         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
   16242            if (ConstantSDNode *C =
   16243                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
   16244              if (C->getZExtValue() == SplatIdx)
   16245                BaseShAmt = InVec.getOperand(1);
   16246            }
   16247         }
   16248 
   16249         if (!BaseShAmt)
   16250           // Avoid introducing an extract element from a shuffle.
   16251           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
   16252                                     DAG.getIntPtrConstant(SplatIdx));
   16253       }
   16254     }
   16255 
   16256     if (BaseShAmt.getNode()) {
   16257       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
   16258       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
   16259         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
   16260       else if (EltVT.bitsLT(MVT::i32))
   16261         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
   16262 
   16263       switch (Op.getOpcode()) {
   16264       default:
   16265         llvm_unreachable("Unknown shift opcode!");
   16266       case ISD::SHL:
   16267         switch (VT.SimpleTy) {
   16268         default: return SDValue();
   16269         case MVT::v2i64:
   16270         case MVT::v4i32:
   16271         case MVT::v8i16:
   16272         case MVT::v4i64:
   16273         case MVT::v8i32:
   16274         case MVT::v16i16:
   16275         case MVT::v16i32:
   16276         case MVT::v8i64:
   16277           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
   16278         }
   16279       case ISD::SRA:
   16280         switch (VT.SimpleTy) {
   16281         default: return SDValue();
   16282         case MVT::v4i32:
   16283         case MVT::v8i16:
   16284         case MVT::v8i32:
   16285         case MVT::v16i16:
   16286         case MVT::v16i32:
   16287         case MVT::v8i64:
   16288           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
   16289         }
   16290       case ISD::SRL:
   16291         switch (VT.SimpleTy) {
   16292         default: return SDValue();
   16293         case MVT::v2i64:
   16294         case MVT::v4i32:
   16295         case MVT::v8i16:
   16296         case MVT::v4i64:
   16297         case MVT::v8i32:
   16298         case MVT::v16i16:
   16299         case MVT::v16i32:
   16300         case MVT::v8i64:
   16301           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
   16302         }
   16303       }
   16304     }
   16305   }
   16306 
   16307   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   16308   if (!Subtarget->is64Bit() &&
   16309       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
   16310       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
   16311       Amt.getOpcode() == ISD::BITCAST &&
   16312       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   16313     Amt = Amt.getOperand(0);
   16314     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
   16315                      VT.getVectorNumElements();
   16316     std::vector<SDValue> Vals(Ratio);
   16317     for (unsigned i = 0; i != Ratio; ++i)
   16318       Vals[i] = Amt.getOperand(i);
   16319     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
   16320       for (unsigned j = 0; j != Ratio; ++j)
   16321         if (Vals[j] != Amt.getOperand(i + j))
   16322           return SDValue();
   16323     }
   16324     switch (Op.getOpcode()) {
   16325     default:
   16326       llvm_unreachable("Unknown shift opcode!");
   16327     case ISD::SHL:
   16328       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
   16329     case ISD::SRL:
   16330       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
   16331     case ISD::SRA:
   16332       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
   16333     }
   16334   }
   16335 
   16336   return SDValue();
   16337 }
   16338 
   16339 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
   16340                           SelectionDAG &DAG) {
   16341   MVT VT = Op.getSimpleValueType();
   16342   SDLoc dl(Op);
   16343   SDValue R = Op.getOperand(0);
   16344   SDValue Amt = Op.getOperand(1);
   16345 
   16346   assert(VT.isVector() && "Custom lowering only for vector shifts!");
   16347   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
   16348 
   16349   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
   16350     return V;
   16351 
   16352   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
   16353     return V;
   16354 
   16355   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
   16356     return Op;
   16357 
   16358   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
   16359   if (Subtarget->hasInt256()) {
   16360     if (Op.getOpcode() == ISD::SRL &&
   16361         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
   16362          VT == MVT::v4i64 || VT == MVT::v8i32))
   16363       return Op;
   16364     if (Op.getOpcode() == ISD::SHL &&
   16365         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
   16366          VT == MVT::v4i64 || VT == MVT::v8i32))
   16367       return Op;
   16368     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
   16369       return Op;
   16370   }
   16371 
   16372   // v2i64 vector logical shifts can efficiently avoid scalarization - do the
   16373   // shifts per-lane and then shuffle the partial results back together.
   16374   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
   16375     // Splat the shift amounts so the scalar shifts above will catch it.
   16376     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
   16377     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
   16378     SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
   16379     SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
   16380     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
   16381   }
   16382 
   16383   // If possible, lower this packed shift into a vector multiply instead of
   16384   // expanding it into a sequence of scalar shifts.
   16385   // Do this only if the vector shift count is a constant build_vector.
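           // Since x << c == x * (1 << c), a constant build_vector of shift amounts can
           // be turned into a build_vector of powers of two and a single vector multiply.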
   16386   if (Op.getOpcode() == ISD::SHL &&
   16387       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
   16388        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
   16389       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
   16390     SmallVector<SDValue, 8> Elts;
   16391     EVT SVT = VT.getScalarType();
   16392     unsigned SVTBits = SVT.getSizeInBits();
   16393     const APInt &One = APInt(SVTBits, 1);
   16394     unsigned NumElems = VT.getVectorNumElements();
   16395 
   16396     for (unsigned i=0; i !=NumElems; ++i) {
   16397       SDValue Op = Amt->getOperand(i);
   16398       if (Op->getOpcode() == ISD::UNDEF) {
   16399         Elts.push_back(Op);
   16400         continue;
   16401       }
   16402 
   16403       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
   16404       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
   16405       uint64_t ShAmt = C.getZExtValue();
   16406       if (ShAmt >= SVTBits) {
   16407         Elts.push_back(DAG.getUNDEF(SVT));
   16408         continue;
   16409       }
   16410       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
   16411     }
   16412     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
   16413     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
   16414   }
   16415 
   16416   // Lower SHL with variable shift amount.
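           // Construct 2^Amt by building its IEEE-754 representation: (Amt << 23) added
           // to 0x3f800000 (the bit pattern of 1.0f) places Amt in the exponent field,
           // giving the float value 2^Amt.  Converting back to integer yields 2^Amt,
           // and x << Amt == x * 2^Amt.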
   16417   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
   16418     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
   16419 
   16420     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
   16421     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
   16422     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
   16423     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   16424   }
   16425 
   16426   // If possible, lower this shift as a sequence of two shifts by
   16427   // constant plus a MOVSS/MOVSD instead of scalarizing it.
   16428   // Example:
   16429   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
   16430   //
   16431   // Could be rewritten as:
   16432   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
   16433   //
   16434   // The advantage is that the two shifts from the example would be
   16435   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
   16436   // the vector shift into four scalar shifts plus four pairs of vector
   16437   // insert/extract.
   16438   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
   16439       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
   16440     unsigned TargetOpcode = X86ISD::MOVSS;
   16441     bool CanBeSimplified;
   16442     // The splat value for the first packed shift (the 'X' from the example).
   16443     SDValue Amt1 = Amt->getOperand(0);
   16444     // The splat value for the second packed shift (the 'Y' from the example).
   16445     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
   16446                                         Amt->getOperand(2);
   16447 
   16448     // See if it is possible to replace this node with a sequence of
   16449     // two shifts followed by a MOVSS/MOVSD
   16450     if (VT == MVT::v4i32) {
   16451       // Check if it is legal to use a MOVSS.
   16452       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
   16453                         Amt2 == Amt->getOperand(3);
   16454       if (!CanBeSimplified) {
   16455         // Otherwise, check if we can still simplify this node using a MOVSD.
   16456         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
   16457                           Amt->getOperand(2) == Amt->getOperand(3);
   16458         TargetOpcode = X86ISD::MOVSD;
   16459         Amt2 = Amt->getOperand(2);
   16460       }
   16461     } else {
   16462       // Do similar checks for the case where the machine value type
   16463       // is MVT::v8i16.
   16464       CanBeSimplified = Amt1 == Amt->getOperand(1);
   16465       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
   16466         CanBeSimplified = Amt2 == Amt->getOperand(i);
   16467 
   16468       if (!CanBeSimplified) {
   16469         TargetOpcode = X86ISD::MOVSD;
   16470         CanBeSimplified = true;
   16471         Amt2 = Amt->getOperand(4);
   16472         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
   16473           CanBeSimplified = Amt1 == Amt->getOperand(i);
   16474         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
   16475           CanBeSimplified = Amt2 == Amt->getOperand(j);
   16476       }
   16477     }
   16478 
   16479     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
   16480         isa<ConstantSDNode>(Amt2)) {
   16481       // Replace this node with two shifts followed by a MOVSS/MOVSD.
   16482       EVT CastVT = MVT::v4i32;
   16483       SDValue Splat1 =
   16484         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
   16485       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
   16486       SDValue Splat2 =
   16487         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
   16488       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
   16489       if (TargetOpcode == X86ISD::MOVSD)
   16490         CastVT = MVT::v2i64;
   16491       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
   16492       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
   16493       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
   16494                                             BitCast1, DAG);
   16495       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
   16496     }
   16497   }
   16498 
   16499   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
   16500     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
   16501 
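             // Emulate the per-byte variable shift one bit at a time: after 'a = a << 5'
             // bit 2 of each shift amount sits in its byte's MSB.  Each round tests that
             // MSB (AND with 0x80, then PCMPEQ) and selects a version of R shifted by 4,
             // 2, and finally 1 (computed as R + R), doubling 'a' between rounds so the
             // next amount bit moves into the MSB.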
   16502     // a = a << 5;
   16503     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
   16504     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
   16505 
   16506     // Turn 'a' into a mask suitable for VSELECT
   16507     SDValue VSelM = DAG.getConstant(0x80, VT);
   16508     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
   16509     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
   16510 
   16511     SDValue CM1 = DAG.getConstant(0x0f, VT);
   16512     SDValue CM2 = DAG.getConstant(0x3f, VT);
   16513 
   16514     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
   16515     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
   16516     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
   16517     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
   16518     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
   16519 
   16520     // a += a
   16521     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
   16522     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
   16523     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
   16524 
   16525     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
   16526     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
   16527     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
   16528     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
   16529     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
   16530 
   16531     // a += a
   16532     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
   16533     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
   16534     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
   16535 
   16536     // return VSELECT(r, r+r, a);
   16537     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
   16538                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
   16539     return R;
   16540   }
   16541 
   16542   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
   16543   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
   16544   // solution better.
   16545   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
   16546     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
   16547     unsigned ExtOpc =
   16548         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
   16549     R = DAG.getNode(ExtOpc, dl, NewVT, R);
   16550     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
   16551     return DAG.getNode(ISD::TRUNCATE, dl, VT,
   16552                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
   16553   }
   16554 
   16555   // Decompose 256-bit shifts into smaller 128-bit shifts.
   16556   if (VT.is256BitVector()) {
   16557     unsigned NumElems = VT.getVectorNumElements();
   16558     MVT EltVT = VT.getVectorElementType();
   16559     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   16560 
   16561     // Extract the two vectors
   16562     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
   16563     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
   16564 
   16565     // Recreate the shift amount vectors
   16566     SDValue Amt1, Amt2;
   16567     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
   16568       // Constant shift amount
   16569       SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
   16570       ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
   16571       ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
   16572 
   16573       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
   16574       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
   16575     } else {
   16576       // Variable shift amount
   16577       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
   16578       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
   16579     }
   16580 
   16581     // Issue new vector shifts for the smaller types
   16582     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
   16583     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
   16584 
   16585     // Concatenate the result back
   16586     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
   16587   }
   16588 
   16589   return SDValue();
   16590 }
   16591 
   16592 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
   16593   // Lower the "add/sub/mul with overflow" instruction into a regular instruction
   16594   // plus a "setcc" instruction that checks the overflow flag. The "brcond" lowering
   16595   // looks for this combo and may remove the "setcc" instruction if the "setcc"
   16596   // has only one use.
   16597   SDNode *N = Op.getNode();
   16598   SDValue LHS = N->getOperand(0);
   16599   SDValue RHS = N->getOperand(1);
   16600   unsigned BaseOp = 0;
   16601   unsigned Cond = 0;
   16602   SDLoc DL(Op);
   16603   switch (Op.getOpcode()) {
   16604   default: llvm_unreachable("Unknown ovf instruction!");
   16605   case ISD::SADDO:
   16606     // An add of one will be selected as an INC. Note that INC doesn't
   16607     // set CF, so we can't do this for UADDO.
   16608     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   16609       if (C->isOne()) {
   16610         BaseOp = X86ISD::INC;
   16611         Cond = X86::COND_O;
   16612         break;
   16613       }
   16614     BaseOp = X86ISD::ADD;
   16615     Cond = X86::COND_O;
   16616     break;
   16617   case ISD::UADDO:
   16618     BaseOp = X86ISD::ADD;
   16619     Cond = X86::COND_B;
   16620     break;
   16621   case ISD::SSUBO:
   16622     // A subtract of one will be selected as a DEC. Note that DEC doesn't
   16623     // set CF, so we can't do this for USUBO.
   16624     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
   16625       if (C->isOne()) {
   16626         BaseOp = X86ISD::DEC;
   16627         Cond = X86::COND_O;
   16628         break;
   16629       }
   16630     BaseOp = X86ISD::SUB;
   16631     Cond = X86::COND_O;
   16632     break;
   16633   case ISD::USUBO:
   16634     BaseOp = X86ISD::SUB;
   16635     Cond = X86::COND_B;
   16636     break;
   16637   case ISD::SMULO:
   16638     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
   16639     Cond = X86::COND_O;
   16640     break;
   16641   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
   16642     if (N->getValueType(0) == MVT::i8) {
   16643       BaseOp = X86ISD::UMUL8;
   16644       Cond = X86::COND_O;
   16645       break;
   16646     }
   16647     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
   16648                                  MVT::i32);
   16649     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
   16650 
   16651     SDValue SetCC =
   16652       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   16653                   DAG.getConstant(X86::COND_O, MVT::i32),
   16654                   SDValue(Sum.getNode(), 2));
   16655 
   16656     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   16657   }
   16658   }
   16659 
   16660   // Also sets EFLAGS.
   16661   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
   16662   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
   16663 
   16664   SDValue SetCC =
   16665     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
   16666                 DAG.getConstant(Cond, MVT::i32),
   16667                 SDValue(Sum.getNode(), 1));
   16668 
   16669   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   16670 }
   16671 
   16672 /// Returns true if the operand type is exactly twice the native width, and
   16673 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
   16674 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
   16675 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
   16676 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
   16677   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
   16678 
   16679   if (OpWidth == 64)
   16680     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
   16681   else if (OpWidth == 128)
   16682     return Subtarget->hasCmpxchg16b();
   16683   else
   16684     return false;
   16685 }
   16686 
   16687 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   16688   return needsCmpXchgNb(SI->getValueOperand()->getType());
   16689 }
   16690 
   16691 // Note: this turns large loads into lock cmpxchg8b/16b.
   16692 // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
   16693 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   16694   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
   16695   return needsCmpXchgNb(PTy->getElementType());
   16696 }
   16697 
   16698 TargetLoweringBase::AtomicRMWExpansionKind
   16699 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   16700   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
   16701   const Type *MemType = AI->getType();
   16702 
   16703   // If the operand is too big, we must see if cmpxchg8/16b is available
   16704   // and default to library calls otherwise.
   16705   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
   16706     return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg
   16707                                    : AtomicRMWExpansionKind::None;
   16708   }
   16709 
   16710   AtomicRMWInst::BinOp Op = AI->getOperation();
   16711   switch (Op) {
   16712   default:
   16713     llvm_unreachable("Unknown atomic operation");
   16714   case AtomicRMWInst::Xchg:
   16715   case AtomicRMWInst::Add:
   16716   case AtomicRMWInst::Sub:
   16717     // It's better to use xadd, xsub or xchg for these in all cases.
   16718     return AtomicRMWExpansionKind::None;
   16719   case AtomicRMWInst::Or:
   16720   case AtomicRMWInst::And:
   16721   case AtomicRMWInst::Xor:
   16722     // If the atomicrmw's result isn't actually used, we can just add a "lock"
   16723     // prefix to a normal instruction for these operations.
   16724     return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg
   16725                             : AtomicRMWExpansionKind::None;
   16726   case AtomicRMWInst::Nand:
   16727   case AtomicRMWInst::Max:
   16728   case AtomicRMWInst::Min:
   16729   case AtomicRMWInst::UMax:
   16730   case AtomicRMWInst::UMin:
   16731     // These always require a non-trivial set of data operations on x86. We must
   16732     // use a cmpxchg loop.
   16733     return AtomicRMWExpansionKind::CmpXChg;
   16734   }
   16735 }
   16736 
   16737 static bool hasMFENCE(const X86Subtarget& Subtarget) {
   16738   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
   16739   // no-sse2). There isn't any reason to disable it if the target processor
   16740   // supports it.
   16741   return Subtarget.hasSSE2() || Subtarget.is64Bit();
   16742 }
   16743 
   16744 LoadInst *
   16745 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   16746   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
   16747   const Type *MemType = AI->getType();
   16748   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
   16749   // there is no benefit in turning such RMWs into loads, and it is actually
   16750   // harmful as it introduces an mfence.
   16751   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
   16752     return nullptr;
   16753 
   16754   auto Builder = IRBuilder<>(AI);
   16755   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   16756   auto SynchScope = AI->getSynchScope();
   16757   // We must restrict the ordering to avoid generating loads with Release or
   16758   // ReleaseAcquire orderings.
   16759   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
   16760   auto Ptr = AI->getPointerOperand();
   16761 
   16762   // Before the load we need a fence. Here is an example lifted from
   16763   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
   16764   // is required:
   16765   // Thread 0:
   16766   //   x.store(1, relaxed);
   16767   //   r1 = y.fetch_add(0, release);
   16768   // Thread 1:
   16769   //   y.fetch_add(42, acquire);
   16770   //   r2 = x.load(relaxed);
   16771   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
   16772   // lowered to just a load without a fence. An mfence flushes the store buffer,
   16773   // making the optimization clearly correct.
   16774   // FIXME: it is required if isAtLeastRelease(Order), but it is not clear
   16775   // otherwise; we might be able to be more aggressive on relaxed idempotent
   16776   // rmw. In practice, they do not look useful, so we don't try to be
   16777   // especially clever.
   16778   if (SynchScope == SingleThread) {
   16779     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
   16780     // the IR level, so we must wrap it in an intrinsic.
   16781     return nullptr;
   16782   } else if (hasMFENCE(*Subtarget)) {
   16783     Function *MFence = llvm::Intrinsic::getDeclaration(M,
   16784             Intrinsic::x86_sse2_mfence);
   16785     Builder.CreateCall(MFence);
   16786   } else {
   16787     // FIXME: it might make sense to use a locked operation here but on a
   16788     // different cache-line to prevent cache-line bouncing. In practice it
   16789     // is probably a small win, and x86 processors without mfence are rare
   16790     // enough that we do not bother.
   16791     return nullptr;
   16792   }
   16793 
   16794   // Finally we can emit the atomic load.
   16795   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
   16796           AI->getType()->getPrimitiveSizeInBits());
   16797   Loaded->setAtomic(Order, SynchScope);
   16798   AI->replaceAllUsesWith(Loaded);
   16799   AI->eraseFromParent();
   16800   return Loaded;
   16801 }
   16802 
   16803 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
   16804                                  SelectionDAG &DAG) {
   16805   SDLoc dl(Op);
   16806   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
   16807     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
   16808   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
   16809     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
   16810 
   16811   // The only fence that needs an instruction is a sequentially-consistent
   16812   // cross-thread fence.
   16813   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
   16814     if (hasMFENCE(*Subtarget))
   16815       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
   16816 
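             // Without MFENCE (32-bit targets before SSE2), fall back to a locked OR of
             // zero into the slot at the top of the stack, effectively
             // 'lock orl $0, (%esp)', which acts as a full memory barrier.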
   16817     SDValue Chain = Op.getOperand(0);
   16818     SDValue Zero = DAG.getConstant(0, MVT::i32);
   16819     SDValue Ops[] = {
   16820       DAG.getRegister(X86::ESP, MVT::i32), // Base
   16821       DAG.getTargetConstant(1, MVT::i8),   // Scale
   16822       DAG.getRegister(0, MVT::i32),        // Index
   16823       DAG.getTargetConstant(0, MVT::i32),  // Disp
   16824       DAG.getRegister(0, MVT::i32),        // Segment.
   16825       Zero,
   16826       Chain
   16827     };
   16828     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
   16829     return SDValue(Res, 0);
   16830   }
   16831 
   16832   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
   16833   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
   16834 }
   16835 
   16836 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
   16837                              SelectionDAG &DAG) {
   16838   MVT T = Op.getSimpleValueType();
   16839   SDLoc DL(Op);
   16840   unsigned Reg = 0;
   16841   unsigned size = 0;
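           // CMPXCHG compares against, and returns the old value through, the
           // accumulator register, so pick the A register of the matching width.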
   16842   switch(T.SimpleTy) {
   16843   default: llvm_unreachable("Invalid value type!");
   16844   case MVT::i8:  Reg = X86::AL;  size = 1; break;
   16845   case MVT::i16: Reg = X86::AX;  size = 2; break;
   16846   case MVT::i32: Reg = X86::EAX; size = 4; break;
   16847   case MVT::i64:
   16848     assert(Subtarget->is64Bit() && "Node not type legal!");
   16849     Reg = X86::RAX; size = 8;
   16850     break;
   16851   }
   16852   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
   16853                                   Op.getOperand(2), SDValue());
   16854   SDValue Ops[] = { cpIn.getValue(0),
   16855                     Op.getOperand(1),
   16856                     Op.getOperand(3),
   16857                     DAG.getTargetConstant(size, MVT::i8),
   16858                     cpIn.getValue(1) };
   16859   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   16860   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
   16861   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
   16862                                            Ops, T, MMO);
   16863 
   16864   SDValue cpOut =
   16865     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
   16866   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
   16867                                       MVT::i32, cpOut.getValue(2));
   16868   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
   16869                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
   16870 
   16871   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
   16872   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
   16873   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
   16874   return SDValue();
   16875 }
   16876 
   16877 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
   16878                             SelectionDAG &DAG) {
   16879   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
   16880   MVT DstVT = Op.getSimpleValueType();
   16881 
   16882   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
   16883     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   16884     if (DstVT != MVT::f64)
   16885       // This conversion needs to be expanded.
   16886       return SDValue();
   16887 
   16888     SDValue InVec = Op->getOperand(0);
   16889     SDLoc dl(Op);
   16890     unsigned NumElts = SrcVT.getVectorNumElements();
   16891     EVT SVT = SrcVT.getVectorElementType();
   16892 
   16893     // Widen the input vector in the case of MVT::v2i32.
   16894     // Example: from MVT::v2i32 to MVT::v4i32.
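             // After widening, bitcast the 128-bit result to v2f64; element 0 of that
             // vector carries the same 64 bits as the original small vector.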
   16895     SmallVector<SDValue, 16> Elts;
   16896     for (unsigned i = 0, e = NumElts; i != e; ++i)
   16897       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
   16898                                  DAG.getIntPtrConstant(i)));
   16899 
   16900     // Explicitly mark the extra elements as Undef.
   16901     Elts.append(NumElts, DAG.getUNDEF(SVT));
   16902 
   16903     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   16904     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
   16905     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
   16906     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
   16907                        DAG.getIntPtrConstant(0));
   16908   }
   16909 
   16910   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
   16911          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   16912   assert((DstVT == MVT::i64 ||
   16913           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
   16914          "Unexpected custom BITCAST");
   16915   // i64 <=> MMX conversions are Legal.
   16916   if (SrcVT==MVT::i64 && DstVT.isVector())
   16917     return Op;
   16918   if (DstVT==MVT::i64 && SrcVT.isVector())
   16919     return Op;
   16920   // MMX <=> MMX conversions are Legal.
   16921   if (SrcVT.isVector() && DstVT.isVector())
   16922     return Op;
   16923   // All other conversions need to be expanded.
   16924   return SDValue();
   16925 }
   16926 
   16927 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
   16928                           SelectionDAG &DAG) {
   16929   SDNode *Node = Op.getNode();
   16930   SDLoc dl(Node);
   16931 
   16932   Op = Op.getOperand(0);
   16933   EVT VT = Op.getValueType();
   16934   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   16935          "CTPOP lowering only implemented for 128/256-bit wide vector types");
   16936 
   16937   unsigned NumElts = VT.getVectorNumElements();
   16938   EVT EltVT = VT.getVectorElementType();
   16939   unsigned Len = EltVT.getSizeInBits();
   16940 
   16941   // This is the vectorized version of the "best" algorithm from
   16942   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   16943   // with a minor tweak to use a series of adds + shifts instead of vector
   16944   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
   16945   //
   16946   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
   16947   //  v8i32 => Always profitable
   16948   //
   16949   // FIXME: There are a couple of possible improvements:
   16950   //
   16951   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
   16952   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
   16953   //
   16954   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
   16955          "CTPOP not implemented for this vector element type.");
   16956 
   16957   // X86 canonicalizes ANDs to vXi64, so generate the appropriate bitcasts to
   16958   // avoid extra legalization.
   16959   bool NeedsBitcast = EltVT == MVT::i32;
   16960   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
   16961 
   16962   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
   16963   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
   16964   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
   16965 
   16966   // v = v - ((v >> 1) & 0x55555555...)
   16967   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
   16968   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
   16969   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
   16970   if (NeedsBitcast)
   16971     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
   16972 
   16973   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
   16974   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
   16975   if (NeedsBitcast)
   16976     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
   16977 
   16978   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
   16979   if (VT != And.getValueType())
   16980     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
   16981   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
   16982 
   16983   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
   16984   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
   16985   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
   16986   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
   16987   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
   16988 
   16989   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
   16990   if (NeedsBitcast) {
   16991     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
   16992     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
   16993     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
   16994   }
   16995 
   16996   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
   16997   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
   16998   if (VT != AndRHS.getValueType()) {
   16999     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
   17000     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
   17001   }
   17002   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
   17003 
   17004   // v = (v + (v >> 4)) & 0x0F0F0F0F...
   17005   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
   17006   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
   17007   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
   17008   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
   17009 
   17010   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
   17011   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
   17012   if (NeedsBitcast) {
   17013     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
   17014     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
   17015   }
   17016   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
   17017   if (VT != And.getValueType())
   17018     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
   17019 
   17020   // The algorithm mentioned above uses:
   17021   //    v = (v * 0x01010101...) >> (Len - 8)
   17022   //
   17023   // Change it to use vector adds + vector shifts which yield faster results on
   17024   // Haswell than using vector integer multiplication.
   17025   //
   17026   // For i32 elements:
   17027   //    v = v + (v >> 8)
   17028   //    v = v + (v >> 16)
   17029   //
   17030   // For i64 elements:
   17031   //    v = v + (v >> 8)
   17032   //    v = v + (v >> 16)
   17033   //    v = v + (v >> 32)
   17034   //
   17035   Add = And;
   17036   SmallVector<SDValue, 8> Csts;
   17037   for (unsigned i = 8; i <= Len/2; i *= 2) {
   17038     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
   17039     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
   17040     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
   17041     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
   17042     Csts.clear();
   17043   }
   17044 
   17045   // The result is in the least significant 6 bits for i32 and 7 bits for i64.
   17046   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
   17047   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
   17048   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
   17049   if (NeedsBitcast) {
   17050     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
   17051     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
   17052   }
   17053   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
   17054   if (VT != And.getValueType())
   17055     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
   17056 
   17057   return And;
   17058 }
   17059 
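         // Lower ATOMIC_LOAD_SUB as an ATOMIC_LOAD_ADD of the negated operand; x86 has
         // LOCK XADD for atomic add but no subtract counterpart.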
   17060 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
   17061   SDNode *Node = Op.getNode();
   17062   SDLoc dl(Node);
   17063   EVT T = Node->getValueType(0);
   17064   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
   17065                               DAG.getConstant(0, T), Node->getOperand(2));
   17066   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
   17067                        cast<AtomicSDNode>(Node)->getMemoryVT(),
   17068                        Node->getOperand(0),
   17069                        Node->getOperand(1), negOp,
   17070                        cast<AtomicSDNode>(Node)->getMemOperand(),
   17071                        cast<AtomicSDNode>(Node)->getOrdering(),
   17072                        cast<AtomicSDNode>(Node)->getSynchScope());
   17073 }
   17074 
   17075 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
   17076   SDNode *Node = Op.getNode();
   17077   SDLoc dl(Node);
   17078   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
   17079 
   17080   // Convert seq_cst store -> xchg
   17081   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
   17082   // FIXME: On 32-bit, store -> fist or movq would be more efficient
   17083   //        (The only way to get a 16-byte store is cmpxchg16b)
   17084   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
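            // For example, a seq_cst "store atomic i32 %v, i32* %p" is emitted as if
            // it were an "atomicrmw xchg" on %p, and only the chain result of the
            // swap is used.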
   17085   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
   17086       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
   17087     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
   17088                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
   17089                                  Node->getOperand(0),
   17090                                  Node->getOperand(1), Node->getOperand(2),
   17091                                  cast<AtomicSDNode>(Node)->getMemOperand(),
   17092                                  cast<AtomicSDNode>(Node)->getOrdering(),
   17093                                  cast<AtomicSDNode>(Node)->getSynchScope());
   17094     return Swap.getValue(1);
   17095   }
   17096   // Other atomic stores have a simple pattern.
   17097   return Op;
   17098 }
   17099 
   17100 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   17101   EVT VT = Op.getNode()->getSimpleValueType(0);
   17102 
   17103   // Let legalize expand this if it isn't a legal type yet.
   17104   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   17105     return SDValue();
   17106 
   17107   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
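            // The second result (i32) models EFLAGS, which carries the carry/borrow
            // from ADD/SUB into the matching ADC/SBB.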
   17108 
   17109   unsigned Opc;
   17110   bool ExtraOp = false;
   17111   switch (Op.getOpcode()) {
   17112   default: llvm_unreachable("Invalid code");
   17113   case ISD::ADDC: Opc = X86ISD::ADD; break;
   17114   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
   17115   case ISD::SUBC: Opc = X86ISD::SUB; break;
   17116   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
   17117   }
   17118 
   17119   if (!ExtraOp)
   17120     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
   17121                        Op.getOperand(1));
   17122   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
   17123                      Op.getOperand(1), Op.getOperand(2));
   17124 }
   17125 
   17126 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
   17127                             SelectionDAG &DAG) {
   17128   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
   17129 
   17130   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
   17131   // which returns the values as { float, float } (in XMM0) or
   17132   // { double, double } (which is returned in XMM0, XMM1).
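            // The f32 variant is modeled below with a <4 x float> return type so that
            // sin and cos can be extracted from lanes 0 and 1 of xmm0.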
   17133   SDLoc dl(Op);
   17134   SDValue Arg = Op.getOperand(0);
   17135   EVT ArgVT = Arg.getValueType();
   17136   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   17137 
   17138   TargetLowering::ArgListTy Args;
   17139   TargetLowering::ArgListEntry Entry;
   17140 
   17141   Entry.Node = Arg;
   17142   Entry.Ty = ArgTy;
   17143   Entry.isSExt = false;
   17144   Entry.isZExt = false;
   17145   Args.push_back(Entry);
   17146 
   17147   bool isF64 = ArgVT == MVT::f64;
   17148   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
   17149   // the small struct {f32, f32} is returned in (eax, edx). For f64,
   17150   // the results are returned via SRet in memory.
   17151   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
   17152   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   17153   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
   17154 
   17155   Type *RetTy = isF64
   17156     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
   17157     : (Type*)VectorType::get(ArgTy, 4);
   17158 
   17159   TargetLowering::CallLoweringInfo CLI(DAG);
   17160   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
   17161     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
   17162 
   17163   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
   17164 
   17165   if (isF64)
   17166     // Returned in xmm0 and xmm1.
   17167     return CallResult.first;
   17168 
    17169   // Returned in bits 0:31 and 32:63 of xmm0.
   17170   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   17171                                CallResult.first, DAG.getIntPtrConstant(0));
   17172   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   17173                                CallResult.first, DAG.getIntPtrConstant(1));
   17174   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
   17175   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
   17176 }
   17177 
   17178 /// LowerOperation - Provide custom lowering hooks for some operations.
   17179 ///
   17180 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   17181   switch (Op.getOpcode()) {
   17182   default: llvm_unreachable("Should not custom lower this!");
   17183   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
   17184   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
   17185     return LowerCMP_SWAP(Op, Subtarget, DAG);
   17186   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
   17187   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
   17188   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
   17189   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   17190   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
   17191   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
   17192   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   17193   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   17194   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   17195   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
   17196   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
   17197   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
   17198   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
   17199   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
   17200   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   17201   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
   17202   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   17203   case ISD::SHL_PARTS:
   17204   case ISD::SRA_PARTS:
   17205   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   17206   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   17207   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   17208   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
   17209   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
   17210   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
   17211   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
   17212   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   17213   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
   17214   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
   17215   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
   17216   case ISD::FABS:
   17217   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
   17218   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   17219   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   17220   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   17221   case ISD::SELECT:             return LowerSELECT(Op, DAG);
   17222   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
   17223   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
   17224   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   17225   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   17226   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   17227   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
   17228   case ISD::INTRINSIC_VOID:
   17229   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   17230   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   17231   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   17232   case ISD::FRAME_TO_ARGS_OFFSET:
   17233                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
   17234   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   17235   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
   17236   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
   17237   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
   17238   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   17239   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   17240   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
   17241   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   17242   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
   17243   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
   17244   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   17245   case ISD::UMUL_LOHI:
   17246   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
   17247   case ISD::SRA:
   17248   case ISD::SRL:
   17249   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
   17250   case ISD::SADDO:
   17251   case ISD::UADDO:
   17252   case ISD::SSUBO:
   17253   case ISD::USUBO:
   17254   case ISD::SMULO:
   17255   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   17256   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
   17257   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
   17258   case ISD::ADDC:
   17259   case ISD::ADDE:
   17260   case ISD::SUBC:
   17261   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   17262   case ISD::ADD:                return LowerADD(Op, DAG);
   17263   case ISD::SUB:                return LowerSUB(Op, DAG);
   17264   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   17265   }
   17266 }
   17267 
   17268 /// ReplaceNodeResults - Replace a node with an illegal result type
   17269 /// with a new node built out of custom code.
   17270 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   17271                                            SmallVectorImpl<SDValue>&Results,
   17272                                            SelectionDAG &DAG) const {
   17273   SDLoc dl(N);
   17274   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   17275   switch (N->getOpcode()) {
   17276   default:
   17277     llvm_unreachable("Do not know how to custom type legalize this operation!");
   17278   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   17279   case X86ISD::FMINC:
   17280   case X86ISD::FMIN:
   17281   case X86ISD::FMAXC:
   17282   case X86ISD::FMAX: {
   17283     EVT VT = N->getValueType(0);
   17284     if (VT != MVT::v2f32)
   17285       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
   17286     SDValue UNDEF = DAG.getUNDEF(VT);
   17287     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   17288                               N->getOperand(0), UNDEF);
   17289     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   17290                               N->getOperand(1), UNDEF);
   17291     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
   17292     return;
   17293   }
   17294   case ISD::SIGN_EXTEND_INREG:
   17295   case ISD::ADDC:
   17296   case ISD::ADDE:
   17297   case ISD::SUBC:
   17298   case ISD::SUBE:
   17299     // We don't want to expand or promote these.
   17300     return;
   17301   case ISD::SDIV:
   17302   case ISD::UDIV:
   17303   case ISD::SREM:
   17304   case ISD::UREM:
   17305   case ISD::SDIVREM:
   17306   case ISD::UDIVREM: {
   17307     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
   17308     Results.push_back(V);
   17309     return;
   17310   }
   17311   case ISD::FP_TO_SINT:
   17312     // FP_TO_INT*_IN_MEM is not legal for f16 inputs.  Do not convert
   17313     // (FP_TO_SINT (load f16)) to FP_TO_INT*.
   17314     if (N->getOperand(0).getValueType() == MVT::f16)
   17315       break;
   17316     // fallthrough
   17317   case ISD::FP_TO_UINT: {
   17318     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
   17319 
   17320     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
   17321       return;
   17322 
   17323     std::pair<SDValue,SDValue> Vals =
   17324         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
   17325     SDValue FIST = Vals.first, StackSlot = Vals.second;
   17326     if (FIST.getNode()) {
   17327       EVT VT = N->getValueType(0);
   17328       // Return a load from the stack slot.
   17329       if (StackSlot.getNode())
   17330         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
   17331                                       MachinePointerInfo(),
   17332                                       false, false, false, 0));
   17333       else
   17334         Results.push_back(FIST);
   17335     }
   17336     return;
   17337   }
   17338   case ISD::UINT_TO_FP: {
   17339     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   17340     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
   17341         N->getValueType(0) != MVT::v2f32)
   17342       return;
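              // Classic bias trick: OR the zero-extended i32 lanes into the mantissa
              // of 2^52 (0x4330000000000000) so each lane becomes exactly 2^52 + x as
              // a double, subtract the bias to recover x, then round v2f64 to v2f32.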
   17343     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
   17344                                  N->getOperand(0));
   17345     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
   17346                                      MVT::f64);
   17347     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
   17348     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
   17349                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
   17350     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
   17351     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
   17352     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
   17353     return;
   17354   }
   17355   case ISD::FP_ROUND: {
   17356     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
   17357         return;
   17358     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
   17359     Results.push_back(V);
   17360     return;
   17361   }
   17362   case ISD::FP_EXTEND: {
   17363     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
   17364     // No other ValueType for FP_EXTEND should reach this point.
   17365     assert(N->getValueType(0) == MVT::v2f32 &&
   17366            "Do not know how to legalize this Node");
   17367     return;
   17368   }
   17369   case ISD::INTRINSIC_W_CHAIN: {
   17370     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   17371     switch (IntNo) {
   17372     default : llvm_unreachable("Do not know how to custom type "
   17373                                "legalize this intrinsic operation!");
   17374     case Intrinsic::x86_rdtsc:
   17375       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   17376                                      Results);
   17377     case Intrinsic::x86_rdtscp:
   17378       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
   17379                                      Results);
   17380     case Intrinsic::x86_rdpmc:
   17381       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
   17382     }
   17383   }
   17384   case ISD::READCYCLECOUNTER: {
   17385     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   17386                                    Results);
   17387   }
   17388   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
   17389     EVT T = N->getValueType(0);
   17390     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
   17391     bool Regs64bit = T == MVT::i128;
   17392     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
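              // CMPXCHG8B/CMPXCHG16B compare EDX:EAX (RDX:RAX) against memory, store
              // ECX:EBX (RCX:RBX) on success, and leave the original memory value in
              // EDX:EAX with ZF indicating success; the register copies below wire up
              // that protocol.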
   17393     SDValue cpInL, cpInH;
   17394     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   17395                         DAG.getConstant(0, HalfT));
   17396     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   17397                         DAG.getConstant(1, HalfT));
   17398     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
   17399                              Regs64bit ? X86::RAX : X86::EAX,
   17400                              cpInL, SDValue());
   17401     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
   17402                              Regs64bit ? X86::RDX : X86::EDX,
   17403                              cpInH, cpInL.getValue(1));
   17404     SDValue swapInL, swapInH;
   17405     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   17406                           DAG.getConstant(0, HalfT));
   17407     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   17408                           DAG.getConstant(1, HalfT));
   17409     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
   17410                                Regs64bit ? X86::RBX : X86::EBX,
   17411                                swapInL, cpInH.getValue(1));
   17412     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
   17413                                Regs64bit ? X86::RCX : X86::ECX,
   17414                                swapInH, swapInL.getValue(1));
   17415     SDValue Ops[] = { swapInH.getValue(0),
   17416                       N->getOperand(1),
   17417                       swapInH.getValue(1) };
   17418     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   17419     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
   17420     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
   17421                                   X86ISD::LCMPXCHG8_DAG;
   17422     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
   17423     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
   17424                                         Regs64bit ? X86::RAX : X86::EAX,
   17425                                         HalfT, Result.getValue(1));
   17426     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
   17427                                         Regs64bit ? X86::RDX : X86::EDX,
   17428                                         HalfT, cpOutL.getValue(2));
   17429     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
   17430 
   17431     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
   17432                                         MVT::i32, cpOutH.getValue(2));
   17433     SDValue Success =
   17434         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17435                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
   17436     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
   17437 
   17438     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
   17439     Results.push_back(Success);
   17440     Results.push_back(EFLAGS.getValue(1));
   17441     return;
   17442   }
   17443   case ISD::ATOMIC_SWAP:
   17444   case ISD::ATOMIC_LOAD_ADD:
   17445   case ISD::ATOMIC_LOAD_SUB:
   17446   case ISD::ATOMIC_LOAD_AND:
   17447   case ISD::ATOMIC_LOAD_OR:
   17448   case ISD::ATOMIC_LOAD_XOR:
   17449   case ISD::ATOMIC_LOAD_NAND:
   17450   case ISD::ATOMIC_LOAD_MIN:
   17451   case ISD::ATOMIC_LOAD_MAX:
   17452   case ISD::ATOMIC_LOAD_UMIN:
   17453   case ISD::ATOMIC_LOAD_UMAX:
   17454   case ISD::ATOMIC_LOAD: {
   17455     // Delegate to generic TypeLegalization. Situations we can really handle
   17456     // should have already been dealt with by AtomicExpandPass.cpp.
   17457     break;
   17458   }
   17459   case ISD::BITCAST: {
   17460     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
   17461     EVT DstVT = N->getValueType(0);
   17462     EVT SrcVT = N->getOperand(0)->getValueType(0);
   17463 
   17464     if (SrcVT != MVT::f64 ||
   17465         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
   17466       return;
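              // Place the f64 in the low lane of a v2f64, bitcast that to the wider
              // integer vector, and then either return it directly (widening
              // legalization) or extract NumElts elements to rebuild the narrow type.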
   17467 
   17468     unsigned NumElts = DstVT.getVectorNumElements();
   17469     EVT SVT = DstVT.getVectorElementType();
   17470     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   17471     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   17472                                    MVT::v2f64, N->getOperand(0));
   17473     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
   17474 
   17475     if (ExperimentalVectorWideningLegalization) {
   17476       // If we are legalizing vectors by widening, we already have the desired
   17477       // legal vector type, just return it.
   17478       Results.push_back(ToVecInt);
   17479       return;
   17480     }
   17481 
   17482     SmallVector<SDValue, 8> Elts;
   17483     for (unsigned i = 0, e = NumElts; i != e; ++i)
   17484       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
   17485                                    ToVecInt, DAG.getIntPtrConstant(i)));
   17486 
   17487     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
   17488   }
   17489   }
   17490 }
   17491 
   17492 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   17493   switch (Opcode) {
   17494   default: return nullptr;
   17495   case X86ISD::BSF:                return "X86ISD::BSF";
   17496   case X86ISD::BSR:                return "X86ISD::BSR";
   17497   case X86ISD::SHLD:               return "X86ISD::SHLD";
   17498   case X86ISD::SHRD:               return "X86ISD::SHRD";
   17499   case X86ISD::FAND:               return "X86ISD::FAND";
   17500   case X86ISD::FANDN:              return "X86ISD::FANDN";
   17501   case X86ISD::FOR:                return "X86ISD::FOR";
   17502   case X86ISD::FXOR:               return "X86ISD::FXOR";
   17503   case X86ISD::FSRL:               return "X86ISD::FSRL";
   17504   case X86ISD::FILD:               return "X86ISD::FILD";
   17505   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
   17506   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
   17507   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
   17508   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
   17509   case X86ISD::FLD:                return "X86ISD::FLD";
   17510   case X86ISD::FST:                return "X86ISD::FST";
   17511   case X86ISD::CALL:               return "X86ISD::CALL";
   17512   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
   17513   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
   17514   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
   17515   case X86ISD::BT:                 return "X86ISD::BT";
   17516   case X86ISD::CMP:                return "X86ISD::CMP";
   17517   case X86ISD::COMI:               return "X86ISD::COMI";
   17518   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   17519   case X86ISD::CMPM:               return "X86ISD::CMPM";
   17520   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
   17521   case X86ISD::SETCC:              return "X86ISD::SETCC";
   17522   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   17523   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
   17524   case X86ISD::CMOV:               return "X86ISD::CMOV";
   17525   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   17526   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
   17527   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
   17528   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
   17529   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
   17530   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
   17531   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
   17532   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
   17533   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
   17534   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
   17535   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   17536   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
   17537   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   17538   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   17539   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
   17540   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
   17541   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
   17542   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   17543   case X86ISD::HADD:               return "X86ISD::HADD";
   17544   case X86ISD::HSUB:               return "X86ISD::HSUB";
   17545   case X86ISD::FHADD:              return "X86ISD::FHADD";
   17546   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
   17547   case X86ISD::UMAX:               return "X86ISD::UMAX";
   17548   case X86ISD::UMIN:               return "X86ISD::UMIN";
   17549   case X86ISD::SMAX:               return "X86ISD::SMAX";
   17550   case X86ISD::SMIN:               return "X86ISD::SMIN";
   17551   case X86ISD::FMAX:               return "X86ISD::FMAX";
   17552   case X86ISD::FMIN:               return "X86ISD::FMIN";
   17553   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
   17554   case X86ISD::FMINC:              return "X86ISD::FMINC";
   17555   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
   17556   case X86ISD::FRCP:               return "X86ISD::FRCP";
   17557   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   17558   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
   17559   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
   17560   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
   17561   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
   17562   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
   17563   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   17564   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
   17565   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
   17566   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
   17567   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
   17568   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
   17569   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   17570   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   17571   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
   17572   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
   17573   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
   17574   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
   17575   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   17576   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   17577   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   17578   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
   17579   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
   17580   case X86ISD::VSHL:               return "X86ISD::VSHL";
   17581   case X86ISD::VSRL:               return "X86ISD::VSRL";
   17582   case X86ISD::VSRA:               return "X86ISD::VSRA";
   17583   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
   17584   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
   17585   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
   17586   case X86ISD::CMPP:               return "X86ISD::CMPP";
   17587   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   17588   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
   17589   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
   17590   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
   17591   case X86ISD::ADD:                return "X86ISD::ADD";
   17592   case X86ISD::SUB:                return "X86ISD::SUB";
   17593   case X86ISD::ADC:                return "X86ISD::ADC";
   17594   case X86ISD::SBB:                return "X86ISD::SBB";
   17595   case X86ISD::SMUL:               return "X86ISD::SMUL";
   17596   case X86ISD::UMUL:               return "X86ISD::UMUL";
   17597   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
   17598   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
   17599   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
   17600   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
   17601   case X86ISD::INC:                return "X86ISD::INC";
   17602   case X86ISD::DEC:                return "X86ISD::DEC";
   17603   case X86ISD::OR:                 return "X86ISD::OR";
   17604   case X86ISD::XOR:                return "X86ISD::XOR";
   17605   case X86ISD::AND:                return "X86ISD::AND";
   17606   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   17607   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   17608   case X86ISD::PTEST:              return "X86ISD::PTEST";
   17609   case X86ISD::TESTP:              return "X86ISD::TESTP";
   17610   case X86ISD::TESTM:              return "X86ISD::TESTM";
   17611   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
   17612   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
   17613   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
   17614   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
   17615   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
   17616   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
   17617   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   17618   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
   17619   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
   17620   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
   17621   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
   17622   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
   17623   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
   17624   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
   17625   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
   17626   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
   17627   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
   17628   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
   17629   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
   17630   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
   17631   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
   17632   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   17633   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
   17634   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
   17635   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
   17636   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
   17637   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   17638   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
   17639   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
   17640   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
   17641   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   17642   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   17643   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
   17644   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   17645   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   17646   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
   17647   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
   17648   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
   17649   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
   17650   case X86ISD::SAHF:               return "X86ISD::SAHF";
   17651   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
   17652   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
   17653   case X86ISD::FMADD:              return "X86ISD::FMADD";
   17654   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
   17655   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
   17656   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
   17657   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
   17658   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
   17659   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
   17660   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
   17661   case X86ISD::XTEST:              return "X86ISD::XTEST";
   17662   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
   17663   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
   17664   case X86ISD::SELECT:             return "X86ISD::SELECT";
   17665   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
   17666   case X86ISD::RCP28:              return "X86ISD::RCP28";
   17667   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
   17668   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
   17669   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
   17670   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
   17671   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
   17672   }
   17673 }
   17674 
   17675 // isLegalAddressingMode - Return true if the addressing mode represented
   17676 // by AM is legal for this target, for a load/store of the specified type.
   17677 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
   17678                                               Type *Ty) const {
   17679   // X86 supports extremely general addressing modes.
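            // The most general form is base + index*scale + disp32, e.g.
            //   movl 16(%rdi,%rcx,4), %eax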
   17680   CodeModel::Model M = getTargetMachine().getCodeModel();
   17681   Reloc::Model R = getTargetMachine().getRelocationModel();
   17682 
   17683   // X86 allows a sign-extended 32-bit immediate field as a displacement.
   17684   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
   17685     return false;
   17686 
   17687   if (AM.BaseGV) {
   17688     unsigned GVFlags =
   17689       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
   17690 
   17691     // If a reference to this global requires an extra load, we can't fold it.
   17692     if (isGlobalStubReference(GVFlags))
   17693       return false;
   17694 
   17695     // If BaseGV requires a register for the PIC base, we cannot also have a
   17696     // BaseReg specified.
   17697     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
   17698       return false;
   17699 
   17700     // If lower 4G is not available, then we must use rip-relative addressing.
   17701     if ((M != CodeModel::Small || R != Reloc::Static) &&
   17702         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
   17703       return false;
   17704   }
   17705 
   17706   switch (AM.Scale) {
   17707   case 0:
   17708   case 1:
   17709   case 2:
   17710   case 4:
   17711   case 8:
   17712     // These scales always work.
   17713     break;
   17714   case 3:
   17715   case 5:
   17716   case 9:
   17717     // These scales are formed with basereg+scalereg.  Only accept if there is
   17718     // no basereg yet.
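              // e.g. a scale of 3 is matched as index*2 + index, which occupies the
              // base-register slot: leal (%rax,%rax,2), %ecx.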
   17719     if (AM.HasBaseReg)
   17720       return false;
   17721     break;
   17722   default:  // Other stuff never works.
   17723     return false;
   17724   }
   17725 
   17726   return true;
   17727 }
   17728 
   17729 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
   17730   unsigned Bits = Ty->getScalarSizeInBits();
   17731 
   17732   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
   17733   // particularly cheaper than those without.
   17734   if (Bits == 8)
   17735     return false;
   17736 
   17737   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
   17738   // variable shifts just as cheap as scalar ones.
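            // e.g. VPSLLVD/VPSLLVQ shift each lane by its own amount, so splatting a
            // scalar shift amount no longer buys anything.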
   17739   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
   17740     return false;
   17741 
   17742   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
   17743   // fully general vector.
   17744   return true;
   17745 }
   17746 
   17747 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   17748   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   17749     return false;
   17750   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
   17751   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
   17752   return NumBits1 > NumBits2;
   17753 }
   17754 
   17755 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
   17756   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   17757     return false;
   17758 
   17759   if (!isTypeLegal(EVT::getEVT(Ty1)))
   17760     return false;
   17761 
   17762   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
   17763 
   17764   // Assuming the caller doesn't have a zeroext or signext return parameter,
   17765   // truncation all the way down to i1 is valid.
   17766   return true;
   17767 }
   17768 
   17769 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   17770   return isInt<32>(Imm);
   17771 }
   17772 
   17773 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
   17774   // Can also use sub to handle negated immediates.
   17775   return isInt<32>(Imm);
   17776 }
   17777 
   17778 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   17779   if (!VT1.isInteger() || !VT2.isInteger())
   17780     return false;
   17781   unsigned NumBits1 = VT1.getSizeInBits();
   17782   unsigned NumBits2 = VT2.getSizeInBits();
   17783   return NumBits1 > NumBits2;
   17784 }
   17785 
   17786 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   17787   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
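            // e.g. "movl %esi, %eax" already clears bits 63:32 of %rax, so no extra
            // zero-extension instruction is needed.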
   17788   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
   17789 }
   17790 
   17791 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
   17792   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   17793   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
   17794 }
   17795 
   17796 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   17797   EVT VT1 = Val.getValueType();
   17798   if (isZExtFree(VT1, VT2))
   17799     return true;
   17800 
   17801   if (Val.getOpcode() != ISD::LOAD)
   17802     return false;
   17803 
   17804   if (!VT1.isSimple() || !VT1.isInteger() ||
   17805       !VT2.isSimple() || !VT2.isInteger())
   17806     return false;
   17807 
   17808   switch (VT1.getSimpleVT().SimpleTy) {
   17809   default: break;
   17810   case MVT::i8:
   17811   case MVT::i16:
   17812   case MVT::i32:
   17813     // X86 has 8, 16, and 32-bit zero-extending loads.
   17814     return true;
   17815   }
   17816 
   17817   return false;
   17818 }
   17819 
   17820 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
   17821 
   17822 bool
   17823 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   17824   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
   17825     return false;
   17826 
   17827   VT = VT.getScalarType();
   17828 
   17829   if (!VT.isSimple())
   17830     return false;
   17831 
   17832   switch (VT.getSimpleVT().SimpleTy) {
   17833   case MVT::f32:
   17834   case MVT::f64:
   17835     return true;
   17836   default:
   17837     break;
   17838   }
   17839 
   17840   return false;
   17841 }
   17842 
   17843 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
   17844   // i16 instructions are longer (0x66 prefix) and potentially slower.
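            // e.g. "addw %ax, %cx" needs a 0x66 operand-size prefix that
            // "addl %eax, %ecx" does not.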
   17845   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
   17846 }
   17847 
   17848 /// isShuffleMaskLegal - Targets can use this to indicate that they only
   17849 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
   17850 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
   17851 /// are assumed to be legal.
   17852 bool
   17853 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   17854                                       EVT VT) const {
   17855   if (!VT.isSimple())
   17856     return false;
   17857 
   17858   // Very little shuffling can be done for 64-bit vectors right now.
   17859   if (VT.getSizeInBits() == 64)
   17860     return false;
   17861 
   17862   // We only care that the types being shuffled are legal. The lowering can
   17863   // handle any possible shuffle mask that results.
   17864   return isTypeLegal(VT.getSimpleVT());
   17865 }
   17866 
   17867 bool
   17868 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
   17869                                           EVT VT) const {
   17870   // Just delegate to the generic legality, clear masks aren't special.
   17871   return isShuffleMaskLegal(Mask, VT);
   17872 }
   17873 
   17874 //===----------------------------------------------------------------------===//
   17875 //                           X86 Scheduler Hooks
   17876 //===----------------------------------------------------------------------===//
   17877 
   17878 /// Utility function to emit xbegin specifying the start of an RTM region.
   17879 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
   17880                                      const TargetInstrInfo *TII) {
   17881   DebugLoc DL = MI->getDebugLoc();
   17882 
   17883   const BasicBlock *BB = MBB->getBasicBlock();
   17884   MachineFunction::iterator I = MBB;
   17885   ++I;
   17886 
   17887   // For the v = xbegin(), we generate
   17888   //
   17889   // thisMBB:
   17890   //  xbegin sinkMBB
   17891   //
   17892   // mainMBB:
   17893   //  eax = -1
   17894   //
   17895   // sinkMBB:
   17896   //  v = eax
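            //
            // On a successful start, execution falls through to mainMBB, which
            // materializes XBEGIN_STARTED (-1) in EAX; on an abort, the hardware
            // transfers control to sinkMBB with the abort status already in EAX.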
   17897 
   17898   MachineBasicBlock *thisMBB = MBB;
   17899   MachineFunction *MF = MBB->getParent();
   17900   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   17901   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   17902   MF->insert(I, mainMBB);
   17903   MF->insert(I, sinkMBB);
   17904 
   17905   // Transfer the remainder of BB and its successor edges to sinkMBB.
   17906   sinkMBB->splice(sinkMBB->begin(), MBB,
   17907                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   17908   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   17909 
   17910   // thisMBB:
   17911   //  xbegin sinkMBB
   17912   //  # fallthrough to mainMBB
    17913   //  # abort jumps to sinkMBB
   17914   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
   17915   thisMBB->addSuccessor(mainMBB);
   17916   thisMBB->addSuccessor(sinkMBB);
   17917 
   17918   // mainMBB:
   17919   //  EAX = -1
   17920   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
   17921   mainMBB->addSuccessor(sinkMBB);
   17922 
   17923   // sinkMBB:
   17924   // EAX is live into the sinkMBB
   17925   sinkMBB->addLiveIn(X86::EAX);
   17926   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   17927           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
   17928     .addReg(X86::EAX);
   17929 
   17930   MI->eraseFromParent();
   17931   return sinkMBB;
   17932 }
   17933 
   17934 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
   17935 // or XMM0_V32I8 in AVX all of this code can be replaced with that
   17936 // in the .td file.
   17937 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
   17938                                        const TargetInstrInfo *TII) {
   17939   unsigned Opc;
   17940   switch (MI->getOpcode()) {
   17941   default: llvm_unreachable("illegal opcode!");
   17942   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
   17943   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
   17944   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
   17945   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
   17946   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
   17947   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
   17948   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
   17949   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
   17950   }
   17951 
   17952   DebugLoc dl = MI->getDebugLoc();
   17953   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
   17954 
   17955   unsigned NumArgs = MI->getNumOperands();
   17956   for (unsigned i = 1; i < NumArgs; ++i) {
   17957     MachineOperand &Op = MI->getOperand(i);
   17958     if (!(Op.isReg() && Op.isImplicit()))
   17959       MIB.addOperand(Op);
   17960   }
   17961   if (MI->hasOneMemOperand())
   17962     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
   17963 
   17964   BuildMI(*BB, MI, dl,
   17965     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
   17966     .addReg(X86::XMM0);
   17967 
   17968   MI->eraseFromParent();
   17969   return BB;
   17970 }
   17971 
   17972 // FIXME: Custom handling because TableGen doesn't support multiple implicit
   17973 // defs in an instruction pattern
   17974 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
   17975                                        const TargetInstrInfo *TII) {
   17976   unsigned Opc;
   17977   switch (MI->getOpcode()) {
   17978   default: llvm_unreachable("illegal opcode!");
   17979   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
   17980   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
   17981   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
   17982   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
   17983   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
   17984   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
   17985   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
   17986   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
   17987   }
   17988 
   17989   DebugLoc dl = MI->getDebugLoc();
   17990   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
   17991 
    17992   unsigned NumArgs = MI->getNumOperands(); // operand 0 is the result register
   17993   for (unsigned i = 1; i < NumArgs; ++i) {
   17994     MachineOperand &Op = MI->getOperand(i);
   17995     if (!(Op.isReg() && Op.isImplicit()))
   17996       MIB.addOperand(Op);
   17997   }
   17998   if (MI->hasOneMemOperand())
   17999     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
   18000 
   18001   BuildMI(*BB, MI, dl,
   18002     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
   18003     .addReg(X86::ECX);
   18004 
   18005   MI->eraseFromParent();
   18006   return BB;
   18007 }
   18008 
   18009 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
   18010                                       const X86Subtarget *Subtarget) {
   18011   DebugLoc dl = MI->getDebugLoc();
   18012   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   18013   // Address into RAX/EAX, other two args into ECX, EDX.
   18014   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
   18015   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
   18016   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
   18017   for (int i = 0; i < X86::AddrNumOperands; ++i)
   18018     MIB.addOperand(MI->getOperand(i));
   18019 
   18020   unsigned ValOps = X86::AddrNumOperands;
   18021   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
   18022     .addReg(MI->getOperand(ValOps).getReg());
   18023   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
   18024     .addReg(MI->getOperand(ValOps+1).getReg());
   18025 
    18026   // MONITOR itself takes no explicit operands; it reads EAX/ECX/EDX implicitly.
   18027   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
   18028 
   18029   MI->eraseFromParent(); // The pseudo is gone now.
   18030   return BB;
   18031 }
   18032 
   18033 MachineBasicBlock *
   18034 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
   18035                                                  MachineBasicBlock *MBB) const {
   18036   // Emit va_arg instruction on X86-64.
   18037 
   18038   // Operands to this pseudo-instruction:
   18039   // 0  ) Output        : destination address (reg)
   18040   // 1-5) Input         : va_list address (addr, i64mem)
   18041   // 6  ) ArgSize       : Size (in bytes) of vararg type
   18042   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
   18043   // 8  ) Align         : Alignment of type
   18044   // 9  ) EFLAGS (implicit-def)
   18045 
   18046   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
   18047   static_assert(X86::AddrNumOperands == 5,
   18048                 "VAARG_64 assumes 5 address operands");
   18049 
   18050   unsigned DestReg = MI->getOperand(0).getReg();
   18051   MachineOperand &Base = MI->getOperand(1);
   18052   MachineOperand &Scale = MI->getOperand(2);
   18053   MachineOperand &Index = MI->getOperand(3);
   18054   MachineOperand &Disp = MI->getOperand(4);
   18055   MachineOperand &Segment = MI->getOperand(5);
   18056   unsigned ArgSize = MI->getOperand(6).getImm();
   18057   unsigned ArgMode = MI->getOperand(7).getImm();
   18058   unsigned Align = MI->getOperand(8).getImm();
   18059 
   18060   // Memory Reference
   18061   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
   18062   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   18063   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   18064 
   18065   // Machine Information
   18066   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   18067   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   18068   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
   18069   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
   18070   DebugLoc DL = MI->getDebugLoc();
   18071 
   18072   // struct va_list {
   18073   //   i32   gp_offset
   18074   //   i32   fp_offset
   18075   //   i64   overflow_area (address)
   18076   //   i64   reg_save_area (address)
   18077   // }
   18078   // sizeof(va_list) = 24
   18079   // alignment(va_list) = 8
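            //
            // Field offsets used below: gp_offset at +0, fp_offset at +4,
            // overflow_area at +8, reg_save_area at +16.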
   18080 
   18081   unsigned TotalNumIntRegs = 6;
   18082   unsigned TotalNumXMMRegs = 8;
   18083   bool UseGPOffset = (ArgMode == 1);
   18084   bool UseFPOffset = (ArgMode == 2);
   18085   unsigned MaxOffset = TotalNumIntRegs * 8 +
   18086                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
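            // The reg_save_area holds the 6 integer argument registers (8 bytes each)
            // followed by the 8 XMM argument registers (16 bytes each), so gp_offset
            // ranges over [0, 48) and fp_offset over [48, 176).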
   18087 
    18088   // Align ArgSize to a multiple of 8.
   18089   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
   18090   bool NeedsAlign = (Align > 8);
   18091 
   18092   MachineBasicBlock *thisMBB = MBB;
   18093   MachineBasicBlock *overflowMBB;
   18094   MachineBasicBlock *offsetMBB;
   18095   MachineBasicBlock *endMBB;
   18096 
   18097   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
   18098   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
   18099   unsigned OffsetReg = 0;
   18100 
   18101   if (!UseGPOffset && !UseFPOffset) {
   18102     // If we only pull from the overflow region, we don't create a branch.
   18103     // We don't need to alter control flow.
   18104     OffsetDestReg = 0; // unused
   18105     OverflowDestReg = DestReg;
   18106 
   18107     offsetMBB = nullptr;
   18108     overflowMBB = thisMBB;
   18109     endMBB = thisMBB;
   18110   } else {
   18111     // First emit code to check if gp_offset (or fp_offset) is below the bound.
   18112     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
   18113     // If not, pull from overflow_area. (branch to overflowMBB)
   18114     //
   18115     //       thisMBB
   18116     //         |     .
   18117     //         |        .
   18118     //     offsetMBB   overflowMBB
   18119     //         |        .
   18120     //         |     .
   18121     //        endMBB
   18122 
   18123     // Registers for the PHI in endMBB
   18124     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
   18125     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
   18126 
   18127     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   18128     MachineFunction *MF = MBB->getParent();
   18129     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   18130     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   18131     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   18132 
   18133     MachineFunction::iterator MBBIter = MBB;
   18134     ++MBBIter;
   18135 
   18136     // Insert the new basic blocks
   18137     MF->insert(MBBIter, offsetMBB);
   18138     MF->insert(MBBIter, overflowMBB);
   18139     MF->insert(MBBIter, endMBB);
   18140 
   18141     // Transfer the remainder of MBB and its successor edges to endMBB.
   18142     endMBB->splice(endMBB->begin(), thisMBB,
   18143                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
   18144     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
   18145 
   18146     // Make offsetMBB and overflowMBB successors of thisMBB
   18147     thisMBB->addSuccessor(offsetMBB);
   18148     thisMBB->addSuccessor(overflowMBB);
   18149 
   18150     // endMBB is a successor of both offsetMBB and overflowMBB
   18151     offsetMBB->addSuccessor(endMBB);
   18152     overflowMBB->addSuccessor(endMBB);
   18153 
   18154     // Load the offset value into a register
   18155     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   18156     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
   18157       .addOperand(Base)
   18158       .addOperand(Scale)
   18159       .addOperand(Index)
   18160       .addDisp(Disp, UseFPOffset ? 4 : 0)
   18161       .addOperand(Segment)
   18162       .setMemRefs(MMOBegin, MMOEnd);
   18163 
   18164     // Check if there is enough room left to pull this argument.
   18165     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
   18166       .addReg(OffsetReg)
   18167       .addImm(MaxOffset + 8 - ArgSizeA8);
   18168 
   18169     // Branch to "overflowMBB" if offset >= max
   18170     // Fall through to "offsetMBB" otherwise
   18171     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
   18172       .addMBB(overflowMBB);
   18173   }
   18174 
   18175   // In offsetMBB, emit code to use the reg_save_area.
   18176   if (offsetMBB) {
   18177     assert(OffsetReg != 0);
   18178 
   18179     // Read the reg_save_area address.
   18180     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
   18181     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
   18182       .addOperand(Base)
   18183       .addOperand(Scale)
   18184       .addOperand(Index)
   18185       .addDisp(Disp, 16)
   18186       .addOperand(Segment)
   18187       .setMemRefs(MMOBegin, MMOEnd);
   18188 
   18189     // Zero-extend the offset
   18190     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    18191     BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
    18192       .addImm(0)
    18193       .addReg(OffsetReg)
    18194       .addImm(X86::sub_32bit);
   18195 
   18196     // Add the offset to the reg_save_area to get the final address.
   18197     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
   18198       .addReg(OffsetReg64)
   18199       .addReg(RegSaveReg);
   18200 
   18201     // Compute the offset for the next argument
   18202     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   18203     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
   18204       .addReg(OffsetReg)
   18205       .addImm(UseFPOffset ? 16 : 8);
   18206 
   18207     // Store it back into the va_list.
   18208     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
   18209       .addOperand(Base)
   18210       .addOperand(Scale)
   18211       .addOperand(Index)
   18212       .addDisp(Disp, UseFPOffset ? 4 : 0)
   18213       .addOperand(Segment)
   18214       .addReg(NextOffsetReg)
   18215       .setMemRefs(MMOBegin, MMOEnd);
   18216 
   18217     // Jump to endMBB
   18218     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
   18219       .addMBB(endMBB);
   18220   }
   18221 
   18222   //
   18223   // Emit code to use overflow area
   18224   //
   18225 
   18226   // Load the overflow_area address into a register.
   18227   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
   18228   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
   18229     .addOperand(Base)
   18230     .addOperand(Scale)
   18231     .addOperand(Index)
   18232     .addDisp(Disp, 8)
   18233     .addOperand(Segment)
   18234     .setMemRefs(MMOBegin, MMOEnd);
   18235 
   18236   // If we need to align it, do so. Otherwise, just copy the address
   18237   // to OverflowDestReg.
   18238   if (NeedsAlign) {
   18239     // Align the overflow address
   18240     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
   18241     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
   18242 
   18243     // aligned_addr = (addr + (align-1)) & ~(align-1)
   18244     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
   18245       .addReg(OverflowAddrReg)
   18246       .addImm(Align-1);
   18247 
   18248     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
   18249       .addReg(TmpReg)
   18250       .addImm(~(uint64_t)(Align-1));
   18251   } else {
   18252     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
   18253       .addReg(OverflowAddrReg);
   18254   }
   18255 
   18256   // Compute the next overflow address after this argument.
   18257   // (the overflow address should be kept 8-byte aligned)
   18258   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
   18259   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
   18260     .addReg(OverflowDestReg)
   18261     .addImm(ArgSizeA8);
   18262 
   18263   // Store the new overflow address.
   18264   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
   18265     .addOperand(Base)
   18266     .addOperand(Scale)
   18267     .addOperand(Index)
   18268     .addDisp(Disp, 8)
   18269     .addOperand(Segment)
   18270     .addReg(NextAddrReg)
   18271     .setMemRefs(MMOBegin, MMOEnd);
   18272 
   18273   // If we branched, emit the PHI to the front of endMBB.
   18274   if (offsetMBB) {
   18275     BuildMI(*endMBB, endMBB->begin(), DL,
   18276             TII->get(X86::PHI), DestReg)
   18277       .addReg(OffsetDestReg).addMBB(offsetMBB)
   18278       .addReg(OverflowDestReg).addMBB(overflowMBB);
   18279   }
   18280 
   18281   // Erase the pseudo instruction
   18282   MI->eraseFromParent();
   18283 
   18284   return endMBB;
   18285 }
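          // For reference, the fixed displacements used in the VAARG expansion above
          // (0, 4, 8 and 16) index into the SysV AMD64 va_list record.  A sketch of
          // that layout as a C struct, shown only as an ABI reminder and not defined
          // in this file:
          //
          //   typedef struct {
          //     unsigned int gp_offset;    // offset 0:  next unused GP register slot
          //     unsigned int fp_offset;    // offset 4:  next unused FP/XMM slot
          //     void *overflow_arg_area;   // offset 8:  next stack-passed argument
          //     void *reg_save_area;       // offset 16: base of the register save area
          //   } __va_list_tag;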
   18286 
   18287 MachineBasicBlock *
   18288 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
   18289                                                  MachineInstr *MI,
   18290                                                  MachineBasicBlock *MBB) const {
   18291   // Emit code to save XMM registers to the stack. The ABI says that the
   18292   // number of registers to save is given in %al, so it's theoretically
   18293   // possible to do an indirect jump trick to avoid saving all of them,
   18294   // however this code takes a simpler approach and just executes all
   18295   // of the stores if %al is non-zero. It's less code, and it's probably
   18296   // easier on the hardware branch predictor, and stores aren't all that
   18297   // expensive anyway.
   18298 
   18299   // Create the new basic blocks. One block contains all the XMM stores,
   18300   // and one block is the final destination regardless of whether any
   18301   // stores were performed.
   18302   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   18303   MachineFunction *F = MBB->getParent();
   18304   MachineFunction::iterator MBBIter = MBB;
   18305   ++MBBIter;
   18306   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
   18307   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
   18308   F->insert(MBBIter, XMMSaveMBB);
   18309   F->insert(MBBIter, EndMBB);
   18310 
   18311   // Transfer the remainder of MBB and its successor edges to EndMBB.
   18312   EndMBB->splice(EndMBB->begin(), MBB,
   18313                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   18314   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
   18315 
   18316   // The original block will now fall through to the XMM save block.
   18317   MBB->addSuccessor(XMMSaveMBB);
   18318   // The XMMSaveMBB will fall through to the end block.
   18319   XMMSaveMBB->addSuccessor(EndMBB);
   18320 
   18321   // Now add the instructions.
   18322   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   18323   DebugLoc DL = MI->getDebugLoc();
   18324 
   18325   unsigned CountReg = MI->getOperand(0).getReg();
   18326   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
   18327   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
   18328 
   18329   if (!Subtarget->isTargetWin64()) {
   18330     // If %al is 0, branch around the XMM save block.
   18331     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
   18332     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
   18333     MBB->addSuccessor(EndMBB);
   18334   }
   18335 
   18336   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
   18337   // that was just emitted, but clearly shouldn't be "saved".
   18338   assert((MI->getNumOperands() <= 3 ||
   18339           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
   18340           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
   18341          && "Expected last argument to be EFLAGS");
   18342   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
   18343   // In the XMM save block, save all the XMM argument registers.
   18344   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
   18345     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
   18346     MachineMemOperand *MMO =
   18347       F->getMachineMemOperand(
   18348           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
   18349         MachineMemOperand::MOStore,
   18350         /*Size=*/16, /*Align=*/16);
   18351     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
   18352       .addFrameIndex(RegSaveFrameIndex)
   18353       .addImm(/*Scale=*/1)
   18354       .addReg(/*IndexReg=*/0)
   18355       .addImm(/*Disp=*/Offset)
   18356       .addReg(/*Segment=*/0)
   18357       .addReg(MI->getOperand(i).getReg())
   18358       .addMemOperand(MMO);
   18359   }
   18360 
   18361   MI->eraseFromParent();   // The pseudo instruction is gone now.
   18362 
   18363   return EndMBB;
   18364 }
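          // Rough illustration of the %al convention the block above relies on
          // (hand-written example based on the SysV AMD64 varargs rules, not code
          // from this file):
          //
          //   double avg(int n, ...);   // callee: va_start needs the XMM saves
          //   avg(2, 1.0, 2.0);         // caller: two FP args  -> sets %al = 2
          //   avg(0);                   // caller: no FP args   -> sets %al = 0
          //
          // When %al is zero, the TEST8rr/JE_1 pair emitted above (on non-Win64
          // targets) skips XMMSaveMBB entirely and no XMM argument registers are
          // spilled to the register save area.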
   18365 
   18366 // The EFLAGS operand of SelectItr might be missing a kill marker
   18367 // because there were multiple uses of EFLAGS, and ISel didn't know
   18368 // which to mark. Figure out whether SelectItr should have had a
   18369 // kill marker, and set it if it should. Returns the correct kill
   18370 // marker value.
   18371 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
   18372                                      MachineBasicBlock* BB,
   18373                                      const TargetRegisterInfo* TRI) {
   18374   // Scan forward through BB for a use/def of EFLAGS.
   18375   MachineBasicBlock::iterator miI(std::next(SelectItr));
   18376   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
   18377     const MachineInstr& mi = *miI;
   18378     if (mi.readsRegister(X86::EFLAGS))
   18379       return false;
   18380     if (mi.definesRegister(X86::EFLAGS))
   18381       break; // Should have kill-flag - update below.
   18382   }
   18383 
   18384   // If we hit the end of the block, check whether EFLAGS is live into a
   18385   // successor.
   18386   if (miI == BB->end()) {
   18387     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
   18388                                           sEnd = BB->succ_end();
   18389          sItr != sEnd; ++sItr) {
   18390       MachineBasicBlock* succ = *sItr;
   18391       if (succ->isLiveIn(X86::EFLAGS))
   18392         return false;
   18393     }
   18394   }
   18395 
   18396   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
   18397   // out. SelectMI should have a kill flag on EFLAGS.
   18398   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
   18399   return true;
   18400 }
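          // Schematic example of the case handled above (hand-written, MIR-like
          // notation with operand details elided):
          //
          //   CMP32rr %a, %b, implicit-def %eflags
          //   %x = CMOV_GR32 ..., implicit %eflags   ; ISel left no kill flag here
          //   %y = CMOV_GR32 ..., implicit %eflags   ; ...nor here, the actual last use
          //   ADD32rr ..., implicit-def %eflags      ; clobbers EFLAGS further down
          //
          // Scanning forward from the second CMOV hits the EFLAGS def (the ADD) before
          // any read, so the kill flag is added to that CMOV and the caller can avoid
          // marking EFLAGS live-in to the new blocks.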
   18401 
   18402 MachineBasicBlock *
   18403 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
   18404                                      MachineBasicBlock *BB) const {
   18405   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   18406   DebugLoc DL = MI->getDebugLoc();
   18407 
   18408   // To "insert" a SELECT_CC instruction, we actually have to insert the
   18409   // diamond control-flow pattern.  The incoming instruction knows the
   18410   // destination vreg to set, the condition code register to branch on, the
   18411   // true/false values to select between, and a branch opcode to use.
   18412   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   18413   MachineFunction::iterator It = BB;
   18414   ++It;
   18415 
   18416   //  thisMBB:
   18417   //  ...
   18418   //   TrueVal = ...
   18419   //   cmpTY ccX, r1, r2
   18420   //   bCC copy1MBB
   18421   //   fallthrough --> copy0MBB
   18422   MachineBasicBlock *thisMBB = BB;
   18423   MachineFunction *F = BB->getParent();
   18424 
   18425   // We also lower double CMOVs:
   18426   //   (CMOV (CMOV F, T, cc1), T, cc2)
    18427   // to two successive branches.  For that, we look for another CMOV as the
   18428   // following instruction.
   18429   //
   18430   // Without this, we would add a PHI between the two jumps, which ends up
   18431   // creating a few copies all around. For instance, for
   18432   //
   18433   //    (sitofp (zext (fcmp une)))
   18434   //
   18435   // we would generate:
   18436   //
   18437   //         ucomiss %xmm1, %xmm0
   18438   //         movss  <1.0f>, %xmm0
   18439   //         movaps  %xmm0, %xmm1
   18440   //         jne     .LBB5_2
   18441   //         xorps   %xmm1, %xmm1
   18442   // .LBB5_2:
   18443   //         jp      .LBB5_4
   18444   //         movaps  %xmm1, %xmm0
   18445   // .LBB5_4:
   18446   //         retq
   18447   //
   18448   // because this custom-inserter would have generated:
   18449   //
   18450   //   A
   18451   //   | \
   18452   //   |  B
   18453   //   | /
   18454   //   C
   18455   //   | \
   18456   //   |  D
   18457   //   | /
   18458   //   E
   18459   //
   18460   // A: X = ...; Y = ...
   18461   // B: empty
   18462   // C: Z = PHI [X, A], [Y, B]
   18463   // D: empty
   18464   // E: PHI [X, C], [Z, D]
   18465   //
   18466   // If we lower both CMOVs in a single step, we can instead generate:
   18467   //
   18468   //   A
   18469   //   | \
   18470   //   |  C
   18471   //   | /|
   18472   //   |/ |
   18473   //   |  |
   18474   //   |  D
   18475   //   | /
   18476   //   E
   18477   //
   18478   // A: X = ...; Y = ...
   18479   // D: empty
   18480   // E: PHI [X, A], [X, C], [Y, D]
   18481   //
   18482   // Which, in our sitofp/fcmp example, gives us something like:
   18483   //
   18484   //         ucomiss %xmm1, %xmm0
   18485   //         movss  <1.0f>, %xmm0
   18486   //         jne     .LBB5_4
   18487   //         jp      .LBB5_4
   18488   //         xorps   %xmm0, %xmm0
   18489   // .LBB5_4:
   18490   //         retq
   18491   //
   18492   MachineInstr *NextCMOV = nullptr;
   18493   MachineBasicBlock::iterator NextMIIt =
   18494       std::next(MachineBasicBlock::iterator(MI));
   18495   if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
   18496       NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
   18497       NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
   18498     NextCMOV = &*NextMIIt;
   18499 
   18500   MachineBasicBlock *jcc1MBB = nullptr;
   18501 
   18502   // If we have a double CMOV, we lower it to two successive branches to
   18503   // the same block.  EFLAGS is used by both, so mark it as live in the second.
   18504   if (NextCMOV) {
   18505     jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
   18506     F->insert(It, jcc1MBB);
   18507     jcc1MBB->addLiveIn(X86::EFLAGS);
   18508   }
   18509 
   18510   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
   18511   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
   18512   F->insert(It, copy0MBB);
   18513   F->insert(It, sinkMBB);
   18514 
   18515   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   18516   // live into the sink and copy blocks.
   18517   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   18518 
   18519   MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
   18520   if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
   18521       !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
   18522     copy0MBB->addLiveIn(X86::EFLAGS);
   18523     sinkMBB->addLiveIn(X86::EFLAGS);
   18524   }
   18525 
   18526   // Transfer the remainder of BB and its successor edges to sinkMBB.
   18527   sinkMBB->splice(sinkMBB->begin(), BB,
   18528                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
   18529   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
   18530 
   18531   // Add the true and fallthrough blocks as its successors.
   18532   if (NextCMOV) {
   18533     // The fallthrough block may be jcc1MBB, if we have a double CMOV.
   18534     BB->addSuccessor(jcc1MBB);
   18535 
    18536     // In that case, jcc1MBB will itself fall through to copy0MBB or
    18537     // jump to sinkMBB.
   18538     jcc1MBB->addSuccessor(copy0MBB);
   18539     jcc1MBB->addSuccessor(sinkMBB);
   18540   } else {
   18541     BB->addSuccessor(copy0MBB);
   18542   }
   18543 
   18544   // The true block target of the first (or only) branch is always sinkMBB.
   18545   BB->addSuccessor(sinkMBB);
   18546 
   18547   // Create the conditional branch instruction.
   18548   unsigned Opc =
   18549     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
   18550   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
   18551 
   18552   if (NextCMOV) {
   18553     unsigned Opc2 = X86::GetCondBranchFromCond(
   18554         (X86::CondCode)NextCMOV->getOperand(3).getImm());
   18555     BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
   18556   }
   18557 
   18558   //  copy0MBB:
   18559   //   %FalseValue = ...
   18560   //   # fallthrough to sinkMBB
   18561   copy0MBB->addSuccessor(sinkMBB);
   18562 
   18563   //  sinkMBB:
   18564   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
   18565   //  ...
   18566   MachineInstrBuilder MIB =
   18567       BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
   18568               MI->getOperand(0).getReg())
   18569           .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
   18570           .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
   18571 
   18572   // If we have a double CMOV, the second Jcc provides the same incoming
   18573   // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
   18574   if (NextCMOV) {
   18575     MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
   18576     // Copy the PHI result to the register defined by the second CMOV.
   18577     BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
   18578             DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
   18579         .addReg(MI->getOperand(0).getReg());
   18580     NextCMOV->eraseFromParent();
   18581   }
   18582 
   18583   MI->eraseFromParent();   // The pseudo instruction is gone now.
   18584   return sinkMBB;
   18585 }
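          // Illustrative origin of these CMOV pseudos (hand-written example, not
          // taken from this file): an IR select on a scalar float such as
          //
          //   %r = select i1 %c, float %a, float %b
          //
          // has no single-instruction SSE lowering, so isel emits a CMOV_FR32 pseudo
          // and this inserter expands it into the branch-and-PHI diamond described in
          // the comments at the top of the function.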
   18586 
   18587 MachineBasicBlock *
   18588 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
   18589                                         MachineBasicBlock *BB) const {
   18590   MachineFunction *MF = BB->getParent();
   18591   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   18592   DebugLoc DL = MI->getDebugLoc();
   18593   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   18594 
   18595   assert(MF->shouldSplitStack());
   18596 
   18597   const bool Is64Bit = Subtarget->is64Bit();
   18598   const bool IsLP64 = Subtarget->isTarget64BitLP64();
   18599 
   18600   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
   18601   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
   18602 
   18603   // BB:
   18604   //  ... [Till the alloca]
   18605   // If stacklet is not large enough, jump to mallocMBB
   18606   //
   18607   // bumpMBB:
   18608   //  Allocate by subtracting from RSP
   18609   //  Jump to continueMBB
   18610   //
   18611   // mallocMBB:
   18612   //  Allocate by call to runtime
   18613   //
   18614   // continueMBB:
   18615   //  ...
   18616   //  [rest of original BB]
   18617   //
   18618 
   18619   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   18620   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   18621   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   18622 
   18623   MachineRegisterInfo &MRI = MF->getRegInfo();
   18624   const TargetRegisterClass *AddrRegClass =
   18625     getRegClassFor(getPointerTy());
   18626 
   18627   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   18628     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   18629     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
   18630     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
   18631     sizeVReg = MI->getOperand(1).getReg(),
   18632     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
   18633 
   18634   MachineFunction::iterator MBBIter = BB;
   18635   ++MBBIter;
   18636 
   18637   MF->insert(MBBIter, bumpMBB);
   18638   MF->insert(MBBIter, mallocMBB);
   18639   MF->insert(MBBIter, continueMBB);
   18640 
   18641   continueMBB->splice(continueMBB->begin(), BB,
   18642                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
   18643   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
   18644 
   18645   // Add code to the main basic block to check if the stack limit has been hit,
   18646   // and if so, jump to mallocMBB otherwise to bumpMBB.
   18647   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
   18648   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
   18649     .addReg(tmpSPVReg).addReg(sizeVReg);
   18650   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
   18651     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
   18652     .addReg(SPLimitVReg);
   18653   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
   18654 
   18655   // bumpMBB simply decreases the stack pointer, since we know the current
   18656   // stacklet has enough space.
   18657   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
   18658     .addReg(SPLimitVReg);
   18659   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
   18660     .addReg(SPLimitVReg);
   18661   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
   18662 
   18663   // Calls into a routine in libgcc to allocate more space from the heap.
   18664   const uint32_t *RegMask =
   18665       Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
   18666   if (IsLP64) {
   18667     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
   18668       .addReg(sizeVReg);
   18669     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   18670       .addExternalSymbol("__morestack_allocate_stack_space")
   18671       .addRegMask(RegMask)
   18672       .addReg(X86::RDI, RegState::Implicit)
   18673       .addReg(X86::RAX, RegState::ImplicitDefine);
   18674   } else if (Is64Bit) {
   18675     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
   18676       .addReg(sizeVReg);
   18677     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   18678       .addExternalSymbol("__morestack_allocate_stack_space")
   18679       .addRegMask(RegMask)
   18680       .addReg(X86::EDI, RegState::Implicit)
   18681       .addReg(X86::EAX, RegState::ImplicitDefine);
   18682   } else {
   18683     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
   18684       .addImm(12);
   18685     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
   18686     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
   18687       .addExternalSymbol("__morestack_allocate_stack_space")
   18688       .addRegMask(RegMask)
   18689       .addReg(X86::EAX, RegState::ImplicitDefine);
   18690   }
   18691 
   18692   if (!Is64Bit)
   18693     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
   18694       .addImm(16);
   18695 
   18696   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
   18697     .addReg(IsLP64 ? X86::RAX : X86::EAX);
   18698   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
   18699 
   18700   // Set up the CFG correctly.
   18701   BB->addSuccessor(bumpMBB);
   18702   BB->addSuccessor(mallocMBB);
   18703   mallocMBB->addSuccessor(continueMBB);
   18704   bumpMBB->addSuccessor(continueMBB);
   18705 
   18706   // Take care of the PHI nodes.
   18707   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
   18708           MI->getOperand(0).getReg())
   18709     .addReg(mallocPtrVReg).addMBB(mallocMBB)
   18710     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
   18711 
   18712   // Delete the original pseudo instruction.
   18713   MI->eraseFromParent();
   18714 
   18715   // And we're done.
   18716   return continueMBB;
   18717 }
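          // Schematic of what the LP64 path above produces (hand-written
          // pseudo-assembly, for illustration only):
          //
          //     movq  %rsp, %tmp
          //     subq  %size, %tmp            ; candidate new stack pointer
          //     cmpq  %tmp, %fs:0x70         ; compare the stacklet limit against it
          //     jg    mallocMBB              ; limit above candidate -> grow the stack
          //   bumpMBB:
          //     movq  %tmp, %rsp             ; enough room: just bump RSP
          //     jmp   continueMBB
          //   mallocMBB:
          //     movq  %size, %rdi
          //     callq __morestack_allocate_stack_space
          //     jmp   continueMBB            ; new space address returned in %rax
          //
          // continueMBB then PHIs the bumped pointer and the runtime-allocated one.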
   18718 
   18719 MachineBasicBlock *
   18720 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
   18721                                         MachineBasicBlock *BB) const {
   18722   DebugLoc DL = MI->getDebugLoc();
   18723 
   18724   assert(!Subtarget->isTargetMachO());
   18725 
   18726   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
   18727 
   18728   MI->eraseFromParent();   // The pseudo instruction is gone now.
   18729   return BB;
   18730 }
   18731 
   18732 MachineBasicBlock *
   18733 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
   18734                                       MachineBasicBlock *BB) const {
   18735   // This is pretty easy.  We're taking the value that we received from
   18736   // our load from the relocation, sticking it in either RDI (x86-64)
   18737   // or EAX and doing an indirect call.  The return value will then
   18738   // be in the normal return register.
   18739   MachineFunction *F = BB->getParent();
   18740   const X86InstrInfo *TII = Subtarget->getInstrInfo();
   18741   DebugLoc DL = MI->getDebugLoc();
   18742 
   18743   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
   18744   assert(MI->getOperand(3).isGlobal() && "This should be a global");
   18745 
   18746   // Get a register mask for the lowered call.
   18747   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   18748   // proper register mask.
   18749   const uint32_t *RegMask =
   18750       Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
   18751   if (Subtarget->is64Bit()) {
   18752     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   18753                                       TII->get(X86::MOV64rm), X86::RDI)
   18754     .addReg(X86::RIP)
   18755     .addImm(0).addReg(0)
   18756     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   18757                       MI->getOperand(3).getTargetFlags())
   18758     .addReg(0);
   18759     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
   18760     addDirectMem(MIB, X86::RDI);
   18761     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
   18762   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
   18763     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   18764                                       TII->get(X86::MOV32rm), X86::EAX)
   18765     .addReg(0)
   18766     .addImm(0).addReg(0)
   18767     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   18768                       MI->getOperand(3).getTargetFlags())
   18769     .addReg(0);
   18770     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   18771     addDirectMem(MIB, X86::EAX);
   18772     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   18773   } else {
   18774     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
   18775                                       TII->get(X86::MOV32rm), X86::EAX)
   18776     .addReg(TII->getGlobalBaseReg(F))
   18777     .addImm(0).addReg(0)
   18778     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
   18779                       MI->getOperand(3).getTargetFlags())
   18780     .addReg(0);
   18781     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   18782     addDirectMem(MIB, X86::EAX);
   18783     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   18784   }
   18785 
   18786   MI->eraseFromParent(); // The pseudo instruction is gone now.
   18787   return BB;
   18788 }
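          // Rough shape of the 64-bit sequence built above (illustrative AT&T
          // assembly, assuming Darwin's thread-local-variable descriptor scheme):
          //
          //     movq  _var@TLVP(%rip), %rdi   ; load the TLV descriptor's address
          //     callq *(%rdi)                 ; call the descriptor's accessor
          //     ; the variable's address comes back in %rax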
   18789 
   18790 MachineBasicBlock *
   18791 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   18792                                     MachineBasicBlock *MBB) const {
   18793   DebugLoc DL = MI->getDebugLoc();
   18794   MachineFunction *MF = MBB->getParent();
   18795   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   18796   MachineRegisterInfo &MRI = MF->getRegInfo();
   18797 
   18798   const BasicBlock *BB = MBB->getBasicBlock();
   18799   MachineFunction::iterator I = MBB;
   18800   ++I;
   18801 
   18802   // Memory Reference
   18803   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   18804   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   18805 
   18806   unsigned DstReg;
   18807   unsigned MemOpndSlot = 0;
   18808 
   18809   unsigned CurOp = 0;
   18810 
   18811   DstReg = MI->getOperand(CurOp++).getReg();
   18812   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
   18813   assert(RC->hasType(MVT::i32) && "Invalid destination!");
   18814   unsigned mainDstReg = MRI.createVirtualRegister(RC);
   18815   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
   18816 
   18817   MemOpndSlot = CurOp;
   18818 
   18819   MVT PVT = getPointerTy();
   18820   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   18821          "Invalid Pointer Size!");
   18822 
   18823   // For v = setjmp(buf), we generate
   18824   //
   18825   // thisMBB:
   18826   //  buf[LabelOffset] = restoreMBB
   18827   //  SjLjSetup restoreMBB
   18828   //
   18829   // mainMBB:
   18830   //  v_main = 0
   18831   //
   18832   // sinkMBB:
   18833   //  v = phi(main, restore)
   18834   //
   18835   // restoreMBB:
   18836   //  if base pointer being used, load it from frame
   18837   //  v_restore = 1
   18838 
   18839   MachineBasicBlock *thisMBB = MBB;
   18840   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   18841   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   18842   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
   18843   MF->insert(I, mainMBB);
   18844   MF->insert(I, sinkMBB);
   18845   MF->push_back(restoreMBB);
   18846 
   18847   MachineInstrBuilder MIB;
   18848 
   18849   // Transfer the remainder of BB and its successor edges to sinkMBB.
   18850   sinkMBB->splice(sinkMBB->begin(), MBB,
   18851                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   18852   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   18853 
   18854   // thisMBB:
   18855   unsigned PtrStoreOpc = 0;
   18856   unsigned LabelReg = 0;
   18857   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   18858   Reloc::Model RM = MF->getTarget().getRelocationModel();
   18859   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
   18860                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
   18861 
   18862   // Prepare IP either in reg or imm.
   18863   if (!UseImmLabel) {
   18864     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
   18865     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
   18866     LabelReg = MRI.createVirtualRegister(PtrRC);
   18867     if (Subtarget->is64Bit()) {
   18868       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
   18869               .addReg(X86::RIP)
   18870               .addImm(0)
   18871               .addReg(0)
   18872               .addMBB(restoreMBB)
   18873               .addReg(0);
   18874     } else {
   18875       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
   18876       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
   18877               .addReg(XII->getGlobalBaseReg(MF))
   18878               .addImm(0)
   18879               .addReg(0)
   18880               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
   18881               .addReg(0);
   18882     }
   18883   } else
   18884     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
   18885   // Store IP
   18886   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
   18887   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   18888     if (i == X86::AddrDisp)
   18889       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
   18890     else
   18891       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
   18892   }
   18893   if (!UseImmLabel)
   18894     MIB.addReg(LabelReg);
   18895   else
   18896     MIB.addMBB(restoreMBB);
   18897   MIB.setMemRefs(MMOBegin, MMOEnd);
   18898   // Setup
   18899   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
   18900           .addMBB(restoreMBB);
   18901 
   18902   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   18903   MIB.addRegMask(RegInfo->getNoPreservedMask());
   18904   thisMBB->addSuccessor(mainMBB);
   18905   thisMBB->addSuccessor(restoreMBB);
   18906 
   18907   // mainMBB:
   18908   //  EAX = 0
   18909   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
   18910   mainMBB->addSuccessor(sinkMBB);
   18911 
   18912   // sinkMBB:
   18913   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   18914           TII->get(X86::PHI), DstReg)
   18915     .addReg(mainDstReg).addMBB(mainMBB)
   18916     .addReg(restoreDstReg).addMBB(restoreMBB);
   18917 
   18918   // restoreMBB:
   18919   if (RegInfo->hasBasePointer(*MF)) {
   18920     const bool Uses64BitFramePtr =
   18921         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
   18922     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
   18923     X86FI->setRestoreBasePointer(MF);
   18924     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
   18925     unsigned BasePtr = RegInfo->getBaseRegister();
   18926     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
   18927     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
   18928                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
   18929       .setMIFlag(MachineInstr::FrameSetup);
   18930   }
   18931   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
   18932   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
   18933   restoreMBB->addSuccessor(sinkMBB);
   18934 
   18935   MI->eraseFromParent();
   18936   return sinkMBB;
   18937 }
   18938 
   18939 MachineBasicBlock *
   18940 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
   18941                                      MachineBasicBlock *MBB) const {
   18942   DebugLoc DL = MI->getDebugLoc();
   18943   MachineFunction *MF = MBB->getParent();
   18944   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   18945   MachineRegisterInfo &MRI = MF->getRegInfo();
   18946 
   18947   // Memory Reference
   18948   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
   18949   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
   18950 
   18951   MVT PVT = getPointerTy();
   18952   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   18953          "Invalid Pointer Size!");
   18954 
   18955   const TargetRegisterClass *RC =
   18956     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   18957   unsigned Tmp = MRI.createVirtualRegister(RC);
    18958   // Since FP is only updated here but NOT referenced, it's treated as a GPR.
   18959   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   18960   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
   18961   unsigned SP = RegInfo->getStackRegister();
   18962 
   18963   MachineInstrBuilder MIB;
   18964 
   18965   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   18966   const int64_t SPOffset = 2 * PVT.getStoreSize();
   18967 
   18968   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
   18969   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
   18970 
   18971   // Reload FP
   18972   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
   18973   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
   18974     MIB.addOperand(MI->getOperand(i));
   18975   MIB.setMemRefs(MMOBegin, MMOEnd);
   18976   // Reload IP
   18977   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
   18978   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   18979     if (i == X86::AddrDisp)
   18980       MIB.addDisp(MI->getOperand(i), LabelOffset);
   18981     else
   18982       MIB.addOperand(MI->getOperand(i));
   18983   }
   18984   MIB.setMemRefs(MMOBegin, MMOEnd);
   18985   // Reload SP
   18986   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
   18987   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   18988     if (i == X86::AddrDisp)
   18989       MIB.addDisp(MI->getOperand(i), SPOffset);
   18990     else
   18991       MIB.addOperand(MI->getOperand(i));
   18992   }
   18993   MIB.setMemRefs(MMOBegin, MMOEnd);
   18994   // Jump
   18995   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
   18996 
   18997   MI->eraseFromParent();
   18998   return MBB;
   18999 }
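          // Buffer slots assumed by the setjmp/longjmp pair above, in pointer-sized
          // words (inferred from the LabelOffset/SPOffset computations; the FP and SP
          // slots are expected to be filled in by the IR-level lowering of the
          // builtin rather than by emitEHSjLjSetJmp itself):
          //
          //   buf[0]  frame pointer   -> reloaded first by emitEHSjLjLongJmp
          //   buf[1]  resume address  -> stored by emitEHSjLjSetJmp, jumped to here
          //   buf[2]  stack pointer   -> reloaded just before the indirect jump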
   19000 
   19001 // Replace 213-type (isel default) FMA3 instructions with 231-type for
   19002 // accumulator loops. Writing back to the accumulator allows the coalescer
   19003 // to remove extra copies in the loop.
   19004 MachineBasicBlock *
   19005 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
   19006                                  MachineBasicBlock *MBB) const {
   19007   MachineOperand &AddendOp = MI->getOperand(3);
   19008 
   19009   // Bail out early if the addend isn't a register - we can't switch these.
   19010   if (!AddendOp.isReg())
   19011     return MBB;
   19012 
   19013   MachineFunction &MF = *MBB->getParent();
   19014   MachineRegisterInfo &MRI = MF.getRegInfo();
   19015 
   19016   // Check whether the addend is defined by a PHI:
   19017   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
   19018   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
   19019   if (!AddendDef.isPHI())
   19020     return MBB;
   19021 
   19022   // Look for the following pattern:
   19023   // loop:
    19024   //   %addend = phi [0, %entry], [%result, %loop]
   19025   //   ...
   19026   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
   19027 
   19028   // Replace with:
   19029   //   loop:
    19030   //   %addend = phi [0, %entry], [%result, %loop]
   19031   //   ...
   19032   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
   19033 
   19034   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
   19035     assert(AddendDef.getOperand(i).isReg());
   19036     MachineOperand PHISrcOp = AddendDef.getOperand(i);
   19037     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
   19038     if (&PHISrcInst == MI) {
   19039       // Found a matching instruction.
   19040       unsigned NewFMAOpc = 0;
   19041       switch (MI->getOpcode()) {
   19042         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
   19043         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
   19044         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
   19045         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
   19046         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
   19047         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
   19048         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
   19049         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
   19050         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
   19051         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
   19052         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
   19053         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
   19054         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
   19055         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
   19056         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
   19057         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
   19058         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
   19059         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
   19060         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
   19061         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
   19062 
   19063         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
   19064         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
   19065         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
   19066         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
   19067         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
   19068         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
   19069         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
   19070         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
   19071         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
   19072         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
   19073         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
   19074         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
   19075         default: llvm_unreachable("Unrecognized FMA variant.");
   19076       }
   19077 
   19078       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   19079       MachineInstrBuilder MIB =
   19080         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
   19081         .addOperand(MI->getOperand(0))
   19082         .addOperand(MI->getOperand(3))
   19083         .addOperand(MI->getOperand(2))
   19084         .addOperand(MI->getOperand(1));
   19085       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
   19086       MI->eraseFromParent();
   19087     }
   19088   }
   19089 
   19090   return MBB;
   19091 }
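          // Operand-order reminder for the 213 -> 231 rewrite above (Intel syntax,
          // VFMADD*PS shown as the example):
          //
          //   vfmadd213ps xmm1, xmm2, xmm3   ; xmm1 = (xmm2 * xmm1) + xmm3
          //   vfmadd231ps xmm1, xmm2, xmm3   ; xmm1 = (xmm2 * xmm3) + xmm1
          //
          // In the 231 form the addend is the tied destination, so when the addend is
          // the loop-carried PHI the result lands straight back in the accumulator and
          // the coalescer can drop the extra copies.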
   19092 
   19093 MachineBasicBlock *
   19094 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   19095                                                MachineBasicBlock *BB) const {
   19096   switch (MI->getOpcode()) {
   19097   default: llvm_unreachable("Unexpected instr type to insert");
   19098   case X86::TAILJMPd64:
   19099   case X86::TAILJMPr64:
   19100   case X86::TAILJMPm64:
   19101   case X86::TAILJMPd64_REX:
   19102   case X86::TAILJMPr64_REX:
   19103   case X86::TAILJMPm64_REX:
   19104     llvm_unreachable("TAILJMP64 would not be touched here.");
   19105   case X86::TCRETURNdi64:
   19106   case X86::TCRETURNri64:
   19107   case X86::TCRETURNmi64:
   19108     return BB;
   19109   case X86::WIN_ALLOCA:
   19110     return EmitLoweredWinAlloca(MI, BB);
   19111   case X86::SEG_ALLOCA_32:
   19112   case X86::SEG_ALLOCA_64:
   19113     return EmitLoweredSegAlloca(MI, BB);
   19114   case X86::TLSCall_32:
   19115   case X86::TLSCall_64:
   19116     return EmitLoweredTLSCall(MI, BB);
   19117   case X86::CMOV_GR8:
   19118   case X86::CMOV_FR32:
   19119   case X86::CMOV_FR64:
   19120   case X86::CMOV_V4F32:
   19121   case X86::CMOV_V2F64:
   19122   case X86::CMOV_V2I64:
   19123   case X86::CMOV_V8F32:
   19124   case X86::CMOV_V4F64:
   19125   case X86::CMOV_V4I64:
   19126   case X86::CMOV_V16F32:
   19127   case X86::CMOV_V8F64:
   19128   case X86::CMOV_V8I64:
   19129   case X86::CMOV_GR16:
   19130   case X86::CMOV_GR32:
   19131   case X86::CMOV_RFP32:
   19132   case X86::CMOV_RFP64:
   19133   case X86::CMOV_RFP80:
   19134     return EmitLoweredSelect(MI, BB);
   19135 
   19136   case X86::FP32_TO_INT16_IN_MEM:
   19137   case X86::FP32_TO_INT32_IN_MEM:
   19138   case X86::FP32_TO_INT64_IN_MEM:
   19139   case X86::FP64_TO_INT16_IN_MEM:
   19140   case X86::FP64_TO_INT32_IN_MEM:
   19141   case X86::FP64_TO_INT64_IN_MEM:
   19142   case X86::FP80_TO_INT16_IN_MEM:
   19143   case X86::FP80_TO_INT32_IN_MEM:
   19144   case X86::FP80_TO_INT64_IN_MEM: {
   19145     MachineFunction *F = BB->getParent();
   19146     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   19147     DebugLoc DL = MI->getDebugLoc();
   19148 
   19149     // Change the floating point control register to use "round towards zero"
   19150     // mode when truncating to an integer value.
   19151     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
   19152     addFrameReference(BuildMI(*BB, MI, DL,
   19153                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
   19154 
   19155     // Load the old value of the high byte of the control word...
   19156     unsigned OldCW =
   19157       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
   19158     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
   19159                       CWFrameIdx);
   19160 
   19161     // Set the high part to be round to zero...
   19162     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
   19163       .addImm(0xC7F);
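              // Reading of the 0xC7F value stored above (x87 control-word layout,
              // noted for reference): the low byte 0x7F masks all FP exceptions, and
              // the 0x0C in the high byte sets the rounding-control bits (11:10) to
              // 11b, i.e. round toward zero, which the truncating fp-to-int stores
              // below rely on.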
   19164 
   19165     // Reload the modified control word now...
   19166     addFrameReference(BuildMI(*BB, MI, DL,
   19167                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   19168 
   19169     // Restore the memory image of control word to original value
   19170     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
   19171       .addReg(OldCW);
   19172 
   19173     // Get the X86 opcode to use.
   19174     unsigned Opc;
   19175     switch (MI->getOpcode()) {
   19176     default: llvm_unreachable("illegal opcode!");
   19177     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
   19178     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
   19179     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
   19180     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
   19181     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
   19182     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
   19183     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
   19184     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
   19185     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
   19186     }
   19187 
   19188     X86AddressMode AM;
   19189     MachineOperand &Op = MI->getOperand(0);
   19190     if (Op.isReg()) {
   19191       AM.BaseType = X86AddressMode::RegBase;
   19192       AM.Base.Reg = Op.getReg();
   19193     } else {
   19194       AM.BaseType = X86AddressMode::FrameIndexBase;
   19195       AM.Base.FrameIndex = Op.getIndex();
   19196     }
   19197     Op = MI->getOperand(1);
   19198     if (Op.isImm())
   19199       AM.Scale = Op.getImm();
   19200     Op = MI->getOperand(2);
   19201     if (Op.isImm())
   19202       AM.IndexReg = Op.getImm();
   19203     Op = MI->getOperand(3);
   19204     if (Op.isGlobal()) {
   19205       AM.GV = Op.getGlobal();
   19206     } else {
   19207       AM.Disp = Op.getImm();
   19208     }
   19209     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
   19210                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
   19211 
   19212     // Reload the original control word now.
   19213     addFrameReference(BuildMI(*BB, MI, DL,
   19214                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   19215 
   19216     MI->eraseFromParent();   // The pseudo instruction is gone now.
   19217     return BB;
   19218   }
   19219     // String/text processing lowering.
   19220   case X86::PCMPISTRM128REG:
   19221   case X86::VPCMPISTRM128REG:
   19222   case X86::PCMPISTRM128MEM:
   19223   case X86::VPCMPISTRM128MEM:
   19224   case X86::PCMPESTRM128REG:
   19225   case X86::VPCMPESTRM128REG:
   19226   case X86::PCMPESTRM128MEM:
   19227   case X86::VPCMPESTRM128MEM:
   19228     assert(Subtarget->hasSSE42() &&
   19229            "Target must have SSE4.2 or AVX features enabled");
   19230     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
   19231 
   19232   // String/text processing lowering.
   19233   case X86::PCMPISTRIREG:
   19234   case X86::VPCMPISTRIREG:
   19235   case X86::PCMPISTRIMEM:
   19236   case X86::VPCMPISTRIMEM:
   19237   case X86::PCMPESTRIREG:
   19238   case X86::VPCMPESTRIREG:
   19239   case X86::PCMPESTRIMEM:
   19240   case X86::VPCMPESTRIMEM:
   19241     assert(Subtarget->hasSSE42() &&
   19242            "Target must have SSE4.2 or AVX features enabled");
   19243     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
   19244 
   19245   // Thread synchronization.
   19246   case X86::MONITOR:
   19247     return EmitMonitor(MI, BB, Subtarget);
   19248 
   19249   // xbegin
   19250   case X86::XBEGIN:
   19251     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
   19252 
   19253   case X86::VASTART_SAVE_XMM_REGS:
   19254     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
   19255 
   19256   case X86::VAARG_64:
   19257     return EmitVAARG64WithCustomInserter(MI, BB);
   19258 
   19259   case X86::EH_SjLj_SetJmp32:
   19260   case X86::EH_SjLj_SetJmp64:
   19261     return emitEHSjLjSetJmp(MI, BB);
   19262 
   19263   case X86::EH_SjLj_LongJmp32:
   19264   case X86::EH_SjLj_LongJmp64:
   19265     return emitEHSjLjLongJmp(MI, BB);
   19266 
   19267   case TargetOpcode::STATEPOINT:
   19268     // As an implementation detail, STATEPOINT shares the STACKMAP format at
   19269     // this point in the process.  We diverge later.
   19270     return emitPatchPoint(MI, BB);
   19271 
   19272   case TargetOpcode::STACKMAP:
   19273   case TargetOpcode::PATCHPOINT:
   19274     return emitPatchPoint(MI, BB);
   19275 
   19276   case X86::VFMADDPDr213r:
   19277   case X86::VFMADDPSr213r:
   19278   case X86::VFMADDSDr213r:
   19279   case X86::VFMADDSSr213r:
   19280   case X86::VFMSUBPDr213r:
   19281   case X86::VFMSUBPSr213r:
   19282   case X86::VFMSUBSDr213r:
   19283   case X86::VFMSUBSSr213r:
   19284   case X86::VFNMADDPDr213r:
   19285   case X86::VFNMADDPSr213r:
   19286   case X86::VFNMADDSDr213r:
   19287   case X86::VFNMADDSSr213r:
   19288   case X86::VFNMSUBPDr213r:
   19289   case X86::VFNMSUBPSr213r:
   19290   case X86::VFNMSUBSDr213r:
   19291   case X86::VFNMSUBSSr213r:
   19292   case X86::VFMADDSUBPDr213r:
   19293   case X86::VFMADDSUBPSr213r:
   19294   case X86::VFMSUBADDPDr213r:
   19295   case X86::VFMSUBADDPSr213r:
   19296   case X86::VFMADDPDr213rY:
   19297   case X86::VFMADDPSr213rY:
   19298   case X86::VFMSUBPDr213rY:
   19299   case X86::VFMSUBPSr213rY:
   19300   case X86::VFNMADDPDr213rY:
   19301   case X86::VFNMADDPSr213rY:
   19302   case X86::VFNMSUBPDr213rY:
   19303   case X86::VFNMSUBPSr213rY:
   19304   case X86::VFMADDSUBPDr213rY:
   19305   case X86::VFMADDSUBPSr213rY:
   19306   case X86::VFMSUBADDPDr213rY:
   19307   case X86::VFMSUBADDPSr213rY:
   19308     return emitFMA3Instr(MI, BB);
   19309   }
   19310 }
   19311 
   19312 //===----------------------------------------------------------------------===//
   19313 //                           X86 Optimization Hooks
   19314 //===----------------------------------------------------------------------===//
   19315 
   19316 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   19317                                                       APInt &KnownZero,
   19318                                                       APInt &KnownOne,
   19319                                                       const SelectionDAG &DAG,
   19320                                                       unsigned Depth) const {
   19321   unsigned BitWidth = KnownZero.getBitWidth();
   19322   unsigned Opc = Op.getOpcode();
   19323   assert((Opc >= ISD::BUILTIN_OP_END ||
   19324           Opc == ISD::INTRINSIC_WO_CHAIN ||
   19325           Opc == ISD::INTRINSIC_W_CHAIN ||
   19326           Opc == ISD::INTRINSIC_VOID) &&
   19327          "Should use MaskedValueIsZero if you don't know whether Op"
   19328          " is a target node!");
   19329 
   19330   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
   19331   switch (Opc) {
   19332   default: break;
   19333   case X86ISD::ADD:
   19334   case X86ISD::SUB:
   19335   case X86ISD::ADC:
   19336   case X86ISD::SBB:
   19337   case X86ISD::SMUL:
   19338   case X86ISD::UMUL:
   19339   case X86ISD::INC:
   19340   case X86ISD::DEC:
   19341   case X86ISD::OR:
   19342   case X86ISD::XOR:
   19343   case X86ISD::AND:
   19344     // These nodes' second result is a boolean.
   19345     if (Op.getResNo() == 0)
   19346       break;
   19347     // Fallthrough
   19348   case X86ISD::SETCC:
   19349     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
   19350     break;
   19351   case ISD::INTRINSIC_WO_CHAIN: {
   19352     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   19353     unsigned NumLoBits = 0;
   19354     switch (IntId) {
   19355     default: break;
   19356     case Intrinsic::x86_sse_movmsk_ps:
   19357     case Intrinsic::x86_avx_movmsk_ps_256:
   19358     case Intrinsic::x86_sse2_movmsk_pd:
   19359     case Intrinsic::x86_avx_movmsk_pd_256:
   19360     case Intrinsic::x86_mmx_pmovmskb:
   19361     case Intrinsic::x86_sse2_pmovmskb_128:
   19362     case Intrinsic::x86_avx2_pmovmskb: {
   19363       // High bits of movmskp{s|d}, pmovmskb are known zero.
   19364       switch (IntId) {
   19365         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   19366         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
   19367         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
   19368         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
   19369         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
   19370         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
   19371         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
   19372         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
   19373       }
   19374       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
   19375       break;
   19376     }
   19377     }
   19378     break;
   19379   }
   19380   }
   19381 }
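          // Worked example for the intrinsic case above: movmskps on a 128-bit vector
          // packs one sign bit per element, so only the low 4 bits of the i32 result
          // can be non-zero and KnownZero receives the top 28 bits, i.e.
          // APInt::getHighBitsSet(32, 32 - 4).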
   19382 
   19383 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
   19384   SDValue Op,
   19385   const SelectionDAG &,
   19386   unsigned Depth) const {
   19387   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
   19388   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
   19389     return Op.getValueType().getScalarType().getSizeInBits();
   19390 
   19391   // Fallback case.
   19392   return 1;
   19393 }
   19394 
   19395 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
   19396 /// node is a GlobalAddress + offset.
   19397 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   19398                                        const GlobalValue* &GA,
   19399                                        int64_t &Offset) const {
   19400   if (N->getOpcode() == X86ISD::Wrapper) {
   19401     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
   19402       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
   19403       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
   19404       return true;
   19405     }
   19406   }
   19407   return TargetLowering::isGAPlusOffset(N, GA, Offset);
   19408 }
   19409 
   19410 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
    19411 /// same as extracting the high 128-bit part of a 256-bit vector and then
    19412 /// inserting the result into the low part of a new 256-bit vector.
   19413 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
   19414   EVT VT = SVOp->getValueType(0);
   19415   unsigned NumElems = VT.getVectorNumElements();
   19416 
   19417   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   19418   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
   19419     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
   19420         SVOp->getMaskElt(j) >= 0)
   19421       return false;
   19422 
   19423   return true;
   19424 }
   19425 
   19426 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
    19427 /// same as extracting the low 128-bit part of a 256-bit vector and then
    19428 /// inserting the result into the high part of a new 256-bit vector.
   19429 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
   19430   EVT VT = SVOp->getValueType(0);
   19431   unsigned NumElems = VT.getVectorNumElements();
   19432 
   19433   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   19434   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
   19435     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
   19436         SVOp->getMaskElt(j) >= 0)
   19437       return false;
   19438 
   19439   return true;
   19440 }
   19441 
   19442 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
   19443 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
   19444                                         TargetLowering::DAGCombinerInfo &DCI,
   19445                                         const X86Subtarget* Subtarget) {
   19446   SDLoc dl(N);
   19447   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   19448   SDValue V1 = SVOp->getOperand(0);
   19449   SDValue V2 = SVOp->getOperand(1);
   19450   EVT VT = SVOp->getValueType(0);
   19451   unsigned NumElems = VT.getVectorNumElements();
   19452 
   19453   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
   19454       V2.getOpcode() == ISD::CONCAT_VECTORS) {
   19455     //
   19456     //                   0,0,0,...
   19457     //                      |
   19458     //    V      UNDEF    BUILD_VECTOR    UNDEF
   19459     //     \      /           \           /
   19460     //  CONCAT_VECTOR         CONCAT_VECTOR
   19461     //         \                  /
   19462     //          \                /
   19463     //          RESULT: V + zero extended
   19464     //
   19465     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
   19466         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
   19467         V1.getOperand(1).getOpcode() != ISD::UNDEF)
   19468       return SDValue();
   19469 
   19470     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
   19471       return SDValue();
   19472 
    19473     // To match the shuffle mask, the first half of the mask must exactly
    19474     // match the first vector, and every element in the second half must be
    19475     // the first element of the second vector.
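              // For example (illustrative values), with v8i32 this requires the
              // shuffle mask <0, 1, 2, 3, 8, 8, 8, 8>.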
   19476     for (unsigned i = 0; i != NumElems/2; ++i)
   19477       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
   19478           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
   19479         return SDValue();
   19480 
   19481     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
   19482     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
   19483       if (Ld->hasNUsesOfValue(1, 0)) {
   19484         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
   19485         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
   19486         SDValue ResNode =
   19487           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
   19488                                   Ld->getMemoryVT(),
   19489                                   Ld->getPointerInfo(),
   19490                                   Ld->getAlignment(),
   19491                                   false/*isVolatile*/, true/*ReadMem*/,
   19492                                   false/*WriteMem*/);
   19493 
   19494         // Make sure the newly-created LOAD is in the same position as Ld in
   19495         // terms of dependency. We create a TokenFactor for Ld and ResNode,
   19496         // and update uses of Ld's output chain to use the TokenFactor.
   19497         if (Ld->hasAnyUseOfValue(1)) {
   19498           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   19499                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
   19500           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
   19501           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
   19502                                  SDValue(ResNode.getNode(), 1));
   19503         }
   19504 
   19505         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
   19506       }
   19507     }
   19508 
   19509     // Emit a zeroed vector and insert the desired subvector on its
   19510     // first half.
   19511     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   19512     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
   19513     return DCI.CombineTo(N, InsV);
   19514   }
   19515 
   19516   //===--------------------------------------------------------------------===//
   19517   // Combine some shuffles into subvector extracts and inserts:
   19518   //
   19519 
   19520   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   19521   if (isShuffleHigh128VectorInsertLow(SVOp)) {
   19522     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
   19523     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
   19524     return DCI.CombineTo(N, InsV);
   19525   }
   19526 
   19527   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   19528   if (isShuffleLow128VectorInsertHigh(SVOp)) {
   19529     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
   19530     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
   19531     return DCI.CombineTo(N, InsV);
   19532   }
   19533 
   19534   return SDValue();
   19535 }
   19536 
   19537 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
   19538 /// possible.
   19539 ///
    19540 /// This is the leaf of the recursive combine below. When we have found some
   19541 /// chain of single-use x86 shuffle instructions and accumulated the combined
   19542 /// shuffle mask represented by them, this will try to pattern match that mask
   19543 /// into either a single instruction if there is a special purpose instruction
   19544 /// for this operation, or into a PSHUFB instruction which is a fully general
   19545 /// instruction but should only be used to replace chains over a certain depth.
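          ///
          /// For example (an illustrative case of the logic below): a collapsed
          /// two-element float-domain mask of {0, 0} is matched directly to MOVDDUP
          /// when SSE3 is available, or to MOVLHPS otherwise, rather than being left
          /// as a longer chain of x86 shuffles.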
   19546 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
   19547                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
   19548                                    TargetLowering::DAGCombinerInfo &DCI,
   19549                                    const X86Subtarget *Subtarget) {
   19550   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
   19551 
   19552   // Find the operand that enters the chain. Note that multiple uses are OK
    19553   // here; we're not going to remove the operand we find.
   19554   SDValue Input = Op.getOperand(0);
   19555   while (Input.getOpcode() == ISD::BITCAST)
   19556     Input = Input.getOperand(0);
   19557 
   19558   MVT VT = Input.getSimpleValueType();
   19559   MVT RootVT = Root.getSimpleValueType();
   19560   SDLoc DL(Root);
   19561 
   19562   // Just remove no-op shuffle masks.
   19563   if (Mask.size() == 1) {
   19564     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
   19565                   /*AddTo*/ true);
   19566     return true;
   19567   }
   19568 
   19569   // Use the float domain if the operand type is a floating point type.
   19570   bool FloatDomain = VT.isFloatingPoint();
   19571 
   19572   // For floating point shuffles, we don't have free copies in the shuffle
   19573   // instructions or the ability to load as part of the instruction, so
   19574   // canonicalize their shuffles to UNPCK or MOV variants.
   19575   //
   19576   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
   19577   // vectors because it can have a load folded into it that UNPCK cannot. This
   19578   // doesn't preclude something switching to the shorter encoding post-RA.
   19579   //
   19580   // FIXME: Should teach these routines about AVX vector widths.
   19581   if (FloatDomain && VT.getSizeInBits() == 128) {
   19582     if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
   19583       bool Lo = Mask.equals({0, 0});
   19584       unsigned Shuffle;
   19585       MVT ShuffleVT;
   19586       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
   19587       // is no slower than UNPCKLPD but has the option to fold the input operand
   19588       // into even an unaligned memory load.
   19589       if (Lo && Subtarget->hasSSE3()) {
   19590         Shuffle = X86ISD::MOVDDUP;
   19591         ShuffleVT = MVT::v2f64;
   19592       } else {
    19593         // We have MOVLHPS and MOVHLPS throughout SSE, and their encodings are
    19594         // smaller than those of the UNPCK variants.
   19595         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
   19596         ShuffleVT = MVT::v4f32;
   19597       }
   19598       if (Depth == 1 && Root->getOpcode() == Shuffle)
   19599         return false; // Nothing to do!
   19600       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
   19601       DCI.AddToWorklist(Op.getNode());
   19602       if (Shuffle == X86ISD::MOVDDUP)
   19603         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
   19604       else
   19605         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
   19606       DCI.AddToWorklist(Op.getNode());
   19607       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
   19608                     /*AddTo*/ true);
   19609       return true;
   19610     }
   19611     if (Subtarget->hasSSE3() &&
   19612         (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
   19613       bool Lo = Mask.equals({0, 0, 2, 2});
   19614       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
   19615       MVT ShuffleVT = MVT::v4f32;
   19616       if (Depth == 1 && Root->getOpcode() == Shuffle)
   19617         return false; // Nothing to do!
   19618       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
   19619       DCI.AddToWorklist(Op.getNode());
   19620       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
   19621       DCI.AddToWorklist(Op.getNode());
   19622       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
   19623                     /*AddTo*/ true);
   19624       return true;
   19625     }
   19626     if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
   19627       bool Lo = Mask.equals({0, 0, 1, 1});
   19628       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
   19629       MVT ShuffleVT = MVT::v4f32;
   19630       if (Depth == 1 && Root->getOpcode() == Shuffle)
   19631         return false; // Nothing to do!
   19632       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
   19633       DCI.AddToWorklist(Op.getNode());
   19634       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
   19635       DCI.AddToWorklist(Op.getNode());
   19636       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
   19637                     /*AddTo*/ true);
   19638       return true;
   19639     }
   19640   }
   19641 
   19642   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
   19643   // variants as none of these have single-instruction variants that are
   19644   // superior to the UNPCK formulation.
   19645   if (!FloatDomain && VT.getSizeInBits() == 128 &&
   19646       (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
   19647        Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
   19648        Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
   19649        Mask.equals(
   19650            {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
   19651     bool Lo = Mask[0] == 0;
   19652     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
   19653     if (Depth == 1 && Root->getOpcode() == Shuffle)
   19654       return false; // Nothing to do!
   19655     MVT ShuffleVT;
   19656     switch (Mask.size()) {
   19657     case 8:
   19658       ShuffleVT = MVT::v8i16;
   19659       break;
   19660     case 16:
   19661       ShuffleVT = MVT::v16i8;
   19662       break;
   19663     default:
   19664       llvm_unreachable("Impossible mask size!");
    19665     }
   19666     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
   19667     DCI.AddToWorklist(Op.getNode());
   19668     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
   19669     DCI.AddToWorklist(Op.getNode());
   19670     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
   19671                   /*AddTo*/ true);
   19672     return true;
   19673   }
   19674 
   19675   // Don't try to re-form single instruction chains under any circumstances now
   19676   // that we've done encoding canonicalization for them.
   19677   if (Depth < 2)
   19678     return false;
   19679 
   19680   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
   19681   // can replace them with a single PSHUFB instruction profitably. Intel's
    19682   // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
   19683   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
   19684   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
   19685     SmallVector<SDValue, 16> PSHUFBMask;
   19686     int NumBytes = VT.getSizeInBits() / 8;
   19687     int Ratio = NumBytes / Mask.size();
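              // Worked example (values chosen for illustration): for a 128-bit vector
              // with Mask = {1, 0, 3, 3}, NumBytes = 16 and Ratio = 4, so the byte
              // mask built below is {4,5,6,7, 0,1,2,3, 12,13,14,15, 12,13,14,15}.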
   19688     for (int i = 0; i < NumBytes; ++i) {
   19689       if (Mask[i / Ratio] == SM_SentinelUndef) {
   19690         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
   19691         continue;
   19692       }
   19693       int M = Mask[i / Ratio] != SM_SentinelZero
   19694                   ? Ratio * Mask[i / Ratio] + i % Ratio
   19695                   : 255;
   19696       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
   19697     }
   19698     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
   19699     Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input);
   19700     DCI.AddToWorklist(Op.getNode());
   19701     SDValue PSHUFBMaskOp =
   19702         DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
   19703     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
   19704     Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
   19705     DCI.AddToWorklist(Op.getNode());
   19706     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
   19707                   /*AddTo*/ true);
   19708     return true;
   19709   }
   19710 
   19711   // Failed to find any combines.
   19712   return false;
   19713 }
   19714 
   19715 /// \brief Fully generic combining of x86 shuffle instructions.
   19716 ///
   19717 /// This should be the last combine run over the x86 shuffle instructions. Once
   19718 /// they have been fully optimized, this will recursively consider all chains
   19719 /// of single-use shuffle instructions, build a generic model of the cumulative
   19720 /// shuffle operation, and check for simpler instructions which implement this
   19721 /// operation. We use this primarily for two purposes:
   19722 ///
   19723 /// 1) Collapse generic shuffles to specialized single instructions when
   19724 ///    equivalent. In most cases, this is just an encoding size win, but
   19725 ///    sometimes we will collapse multiple generic shuffles into a single
   19726 ///    special-purpose shuffle.
   19727 /// 2) Look for sequences of shuffle instructions with 3 or more total
   19728 ///    instructions, and replace them with the slightly more expensive SSSE3
   19729 ///    PSHUFB instruction if available. We do this as the last combining step
   19730 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
    19731 ///    a suitable short sequence of other instructions. The PSHUFB will either
   19732 ///    use a register or have to read from memory and so is slightly (but only
   19733 ///    slightly) more expensive than the other shuffle instructions.
   19734 ///
   19735 /// Because this is inherently a quadratic operation (for each shuffle in
   19736 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
   19737 /// This should never be an issue in practice as the shuffle lowering doesn't
   19738 /// produce sequences of more than 8 instructions.
   19739 ///
   19740 /// FIXME: We will currently miss some cases where the redundant shuffling
   19741 /// would simplify under the threshold for PSHUFB formation because of
   19742 /// combine-ordering. To fix this, we should do the redundant instruction
   19743 /// combining in this recursive walk.
   19744 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
   19745                                           ArrayRef<int> RootMask,
   19746                                           int Depth, bool HasPSHUFB,
   19747                                           SelectionDAG &DAG,
   19748                                           TargetLowering::DAGCombinerInfo &DCI,
   19749                                           const X86Subtarget *Subtarget) {
   19750   // Bound the depth of our recursive combine because this is ultimately
   19751   // quadratic in nature.
   19752   if (Depth > 8)
   19753     return false;
   19754 
   19755   // Directly rip through bitcasts to find the underlying operand.
   19756   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
   19757     Op = Op.getOperand(0);
   19758 
   19759   MVT VT = Op.getSimpleValueType();
   19760   if (!VT.isVector())
   19761     return false; // Bail if we hit a non-vector.
   19762 
   19763   assert(Root.getSimpleValueType().isVector() &&
   19764          "Shuffles operate on vector types!");
   19765   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
   19766          "Can only combine shuffles of the same vector register size.");
   19767 
   19768   if (!isTargetShuffle(Op.getOpcode()))
   19769     return false;
   19770   SmallVector<int, 16> OpMask;
   19771   bool IsUnary;
   19772   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
    19773   // We can only combine unary shuffles whose masks we can decode.
   19774   if (!HaveMask || !IsUnary)
   19775     return false;
   19776 
   19777   assert(VT.getVectorNumElements() == OpMask.size() &&
   19778          "Different mask size from vector size!");
   19779   assert(((RootMask.size() > OpMask.size() &&
   19780            RootMask.size() % OpMask.size() == 0) ||
   19781           (OpMask.size() > RootMask.size() &&
   19782            OpMask.size() % RootMask.size() == 0) ||
   19783           OpMask.size() == RootMask.size()) &&
   19784          "The smaller number of elements must divide the larger.");
   19785   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
   19786   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
   19787   assert(((RootRatio == 1 && OpRatio == 1) ||
   19788           (RootRatio == 1) != (OpRatio == 1)) &&
   19789          "Must not have a ratio for both incoming and op masks!");
   19790 
   19791   SmallVector<int, 16> Mask;
   19792   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
   19793 
   19794   // Merge this shuffle operation's mask into our accumulated mask. Note that
   19795   // this shuffle's mask will be the first applied to the input, followed by the
   19796   // root mask to get us all the way to the root value arrangement. The reason
   19797   // for this order is that we are recursing up the operation chain.
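            // As a concrete example (values chosen for illustration): with two
            // equal-sized masks, RootMask = {2, 3, 0, 1} composed over
            // OpMask = {1, 0, 3, 2} yields Mask[i] = OpMask[RootMask[i]], i.e.
            // {3, 2, 1, 0}.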
   19798   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
   19799     int RootIdx = i / RootRatio;
   19800     if (RootMask[RootIdx] < 0) {
   19801       // This is a zero or undef lane, we're done.
   19802       Mask.push_back(RootMask[RootIdx]);
   19803       continue;
   19804     }
   19805 
   19806     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
   19807     int OpIdx = RootMaskedIdx / OpRatio;
   19808     if (OpMask[OpIdx] < 0) {
    19809       // The incoming lanes are zero or undef; it doesn't matter which ones
    19810       // we are using.
   19811       Mask.push_back(OpMask[OpIdx]);
   19812       continue;
   19813     }
   19814 
   19815     // Ok, we have non-zero lanes, map them through.
   19816     Mask.push_back(OpMask[OpIdx] * OpRatio +
   19817                    RootMaskedIdx % OpRatio);
   19818   }
   19819 
   19820   // See if we can recurse into the operand to combine more things.
   19821   switch (Op.getOpcode()) {
   19822     case X86ISD::PSHUFB:
   19823       HasPSHUFB = true;
   19824     case X86ISD::PSHUFD:
   19825     case X86ISD::PSHUFHW:
   19826     case X86ISD::PSHUFLW:
   19827       if (Op.getOperand(0).hasOneUse() &&
   19828           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
   19829                                         HasPSHUFB, DAG, DCI, Subtarget))
   19830         return true;
   19831       break;
   19832 
   19833     case X86ISD::UNPCKL:
   19834     case X86ISD::UNPCKH:
    19835       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
    19836       // We can't check for a single use; we have to check that this shuffle is the only user.
   19837       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
   19838           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
   19839                                         HasPSHUFB, DAG, DCI, Subtarget))
   19840           return true;
   19841       break;
   19842   }
   19843 
   19844   // Minor canonicalization of the accumulated shuffle mask to make it easier
    19845   // to match below. All this does is detect masks with sequential pairs of
   19846   // elements, and shrink them to the half-width mask. It does this in a loop
   19847   // so it will reduce the size of the mask to the minimal width mask which
   19848   // performs an equivalent shuffle.
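            // For example, the identity mask {0, 1, 2, 3} widens to {0, 1} and then
            // to the single-element mask {0}.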
   19849   SmallVector<int, 16> WidenedMask;
   19850   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
   19851     Mask = std::move(WidenedMask);
   19852     WidenedMask.clear();
   19853   }
   19854 
   19855   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
   19856                                 Subtarget);
   19857 }
   19858 
   19859 /// \brief Get the PSHUF-style mask from PSHUF node.
   19860 ///
    19861 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
   19862 /// PSHUF-style masks that can be reused with such instructions.
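          ///
          /// For example (illustrative), a PSHUFHW whose full 8-element mask is
          /// <0, 1, 2, 3, 5, 4, 7, 6> is returned here as the v4 mask {1, 0, 3, 2},
          /// i.e. with the high-half offset of 4 removed.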
   19863 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
   19864   MVT VT = N.getSimpleValueType();
   19865   SmallVector<int, 4> Mask;
   19866   bool IsUnary;
   19867   bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
   19868   (void)HaveMask;
   19869   assert(HaveMask);
   19870 
   19871   // If we have more than 128-bits, only the low 128-bits of shuffle mask
   19872   // matter. Check that the upper masks are repeats and remove them.
   19873   if (VT.getSizeInBits() > 128) {
   19874     int LaneElts = 128 / VT.getScalarSizeInBits();
   19875 #ifndef NDEBUG
   19876     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
   19877       for (int j = 0; j < LaneElts; ++j)
   19878         assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
   19879                "Mask doesn't repeat in high 128-bit lanes!");
   19880 #endif
   19881     Mask.resize(LaneElts);
   19882   }
   19883 
   19884   switch (N.getOpcode()) {
   19885   case X86ISD::PSHUFD:
   19886     return Mask;
   19887   case X86ISD::PSHUFLW:
   19888     Mask.resize(4);
   19889     return Mask;
   19890   case X86ISD::PSHUFHW:
   19891     Mask.erase(Mask.begin(), Mask.begin() + 4);
   19892     for (int &M : Mask)
   19893       M -= 4;
   19894     return Mask;
   19895   default:
   19896     llvm_unreachable("No valid shuffle instruction found!");
   19897   }
   19898 }
   19899 
   19900 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
   19901 ///
   19902 /// We walk up the chain and look for a combinable shuffle, skipping over
   19903 /// shuffles that we could hoist this shuffle's transformation past without
   19904 /// altering anything.
   19905 static SDValue
   19906 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
   19907                              SelectionDAG &DAG,
   19908                              TargetLowering::DAGCombinerInfo &DCI) {
   19909   assert(N.getOpcode() == X86ISD::PSHUFD &&
    19910          "Called with something other than an x86 128-bit dword shuffle!");
   19911   SDLoc DL(N);
   19912 
   19913   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
   19914   // of the shuffles in the chain so that we can form a fresh chain to replace
   19915   // this one.
   19916   SmallVector<SDValue, 8> Chain;
   19917   SDValue V = N.getOperand(0);
   19918   for (; V.hasOneUse(); V = V.getOperand(0)) {
   19919     switch (V.getOpcode()) {
   19920     default:
   19921       return SDValue(); // Nothing combined!
   19922 
   19923     case ISD::BITCAST:
   19924       // Skip bitcasts as we always know the type for the target specific
   19925       // instructions.
   19926       continue;
   19927 
   19928     case X86ISD::PSHUFD:
   19929       // Found another dword shuffle.
   19930       break;
   19931 
   19932     case X86ISD::PSHUFLW:
   19933       // Check that the low words (being shuffled) are the identity in the
   19934       // dword shuffle, and the high words are self-contained.
   19935       if (Mask[0] != 0 || Mask[1] != 1 ||
   19936           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
   19937         return SDValue();
   19938 
   19939       Chain.push_back(V);
   19940       continue;
   19941 
   19942     case X86ISD::PSHUFHW:
   19943       // Check that the high words (being shuffled) are the identity in the
   19944       // dword shuffle, and the low words are self-contained.
   19945       if (Mask[2] != 2 || Mask[3] != 3 ||
   19946           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
   19947         return SDValue();
   19948 
   19949       Chain.push_back(V);
   19950       continue;
   19951 
   19952     case X86ISD::UNPCKL:
   19953     case X86ISD::UNPCKH:
   19954       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
   19955       // shuffle into a preceding word shuffle.
   19956       if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
   19957           V.getSimpleValueType().getScalarType() != MVT::i16)
   19958         return SDValue();
   19959 
   19960       // Search for a half-shuffle which we can combine with.
   19961       unsigned CombineOp =
   19962           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
   19963       if (V.getOperand(0) != V.getOperand(1) ||
   19964           !V->isOnlyUserOf(V.getOperand(0).getNode()))
   19965         return SDValue();
   19966       Chain.push_back(V);
   19967       V = V.getOperand(0);
   19968       do {
   19969         switch (V.getOpcode()) {
   19970         default:
   19971           return SDValue(); // Nothing to combine.
   19972 
   19973         case X86ISD::PSHUFLW:
   19974         case X86ISD::PSHUFHW:
   19975           if (V.getOpcode() == CombineOp)
   19976             break;
   19977 
   19978           Chain.push_back(V);
   19979 
   19980           // Fallthrough!
   19981         case ISD::BITCAST:
   19982           V = V.getOperand(0);
   19983           continue;
   19984         }
   19985         break;
   19986       } while (V.hasOneUse());
   19987       break;
   19988     }
   19989     // Break out of the loop if we break out of the switch.
   19990     break;
   19991   }
   19992 
   19993   if (!V.hasOneUse())
   19994     // We fell out of the loop without finding a viable combining instruction.
   19995     return SDValue();
   19996 
   19997   // Merge this node's mask and our incoming mask.
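            // For example (illustrative values): if the shuffle we found has
            // VMask = {0, 2, 1, 3} and our incoming Mask is {2, 3, 0, 1}, the merged
            // mask is {VMask[2], VMask[3], VMask[0], VMask[1]} = {1, 3, 0, 2}.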
   19998   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   19999   for (int &M : Mask)
   20000     M = VMask[M];
   20001   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
   20002                   getV4X86ShuffleImm8ForMask(Mask, DAG));
   20003 
   20004   // Rebuild the chain around this new shuffle.
   20005   while (!Chain.empty()) {
   20006     SDValue W = Chain.pop_back_val();
   20007 
   20008     if (V.getValueType() != W.getOperand(0).getValueType())
   20009       V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
   20010 
   20011     switch (W.getOpcode()) {
   20012     default:
   20013       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
   20014 
   20015     case X86ISD::UNPCKL:
   20016     case X86ISD::UNPCKH:
   20017       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
   20018       break;
   20019 
   20020     case X86ISD::PSHUFD:
   20021     case X86ISD::PSHUFLW:
   20022     case X86ISD::PSHUFHW:
   20023       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
   20024       break;
   20025     }
   20026   }
   20027   if (V.getValueType() != N.getValueType())
   20028     V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
   20029 
   20030   // Return the new chain to replace N.
   20031   return V;
   20032 }
   20033 
   20034 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
   20035 ///
   20036 /// We walk up the chain, skipping shuffles of the other half and looking
   20037 /// through shuffles which switch halves trying to find a shuffle of the same
   20038 /// pair of dwords.
   20039 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
   20040                                         SelectionDAG &DAG,
   20041                                         TargetLowering::DAGCombinerInfo &DCI) {
   20042   assert(
   20043       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
   20044       "Called with something other than an x86 128-bit half shuffle!");
   20045   SDLoc DL(N);
   20046   unsigned CombineOpcode = N.getOpcode();
   20047 
   20048   // Walk up a single-use chain looking for a combinable shuffle.
   20049   SDValue V = N.getOperand(0);
   20050   for (; V.hasOneUse(); V = V.getOperand(0)) {
   20051     switch (V.getOpcode()) {
   20052     default:
   20053       return false; // Nothing combined!
   20054 
   20055     case ISD::BITCAST:
   20056       // Skip bitcasts as we always know the type for the target specific
   20057       // instructions.
   20058       continue;
   20059 
   20060     case X86ISD::PSHUFLW:
   20061     case X86ISD::PSHUFHW:
   20062       if (V.getOpcode() == CombineOpcode)
   20063         break;
   20064 
   20065       // Other-half shuffles are no-ops.
   20066       continue;
   20067     }
   20068     // Break out of the loop if we break out of the switch.
   20069     break;
   20070   }
   20071 
   20072   if (!V.hasOneUse())
   20073     // We fell out of the loop without finding a viable combining instruction.
   20074     return false;
   20075 
   20076   // Combine away the bottom node as its shuffle will be accumulated into
   20077   // a preceding shuffle.
   20078   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
   20079 
   20080   // Record the old value.
   20081   SDValue Old = V;
   20082 
   20083   // Merge this node's mask and our incoming mask (adjusted to account for all
   20084   // the pshufd instructions encountered).
   20085   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   20086   for (int &M : Mask)
   20087     M = VMask[M];
   20088   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
   20089                   getV4X86ShuffleImm8ForMask(Mask, DAG));
   20090 
   20091   // Check that the shuffles didn't cancel each other out. If not, we need to
   20092   // combine to the new one.
   20093   if (Old != V)
   20094     // Replace the combinable shuffle with the combined one, updating all users
   20095     // so that we re-evaluate the chain here.
   20096     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
   20097 
   20098   return true;
   20099 }
   20100 
   20101 /// \brief Try to combine x86 target specific shuffles.
   20102 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
   20103                                            TargetLowering::DAGCombinerInfo &DCI,
   20104                                            const X86Subtarget *Subtarget) {
   20105   SDLoc DL(N);
   20106   MVT VT = N.getSimpleValueType();
   20107   SmallVector<int, 4> Mask;
   20108 
   20109   switch (N.getOpcode()) {
   20110   case X86ISD::PSHUFD:
   20111   case X86ISD::PSHUFLW:
   20112   case X86ISD::PSHUFHW:
   20113     Mask = getPSHUFShuffleMask(N);
   20114     assert(Mask.size() == 4);
   20115     break;
   20116   default:
   20117     return SDValue();
   20118   }
   20119 
   20120   // Nuke no-op shuffles that show up after combining.
   20121   if (isNoopShuffleMask(Mask))
   20122     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
   20123 
   20124   // Look for simplifications involving one or two shuffle instructions.
   20125   SDValue V = N.getOperand(0);
   20126   switch (N.getOpcode()) {
   20127   default:
   20128     break;
   20129   case X86ISD::PSHUFLW:
   20130   case X86ISD::PSHUFHW:
   20131     assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
   20132 
   20133     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
   20134       return SDValue(); // We combined away this shuffle, so we're done.
   20135 
   20136     // See if this reduces to a PSHUFD which is no more expensive and can
   20137     // combine with more operations. Note that it has to at least flip the
   20138     // dwords as otherwise it would have been removed as a no-op.
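              // For example, (pshuflw <2, 3, 0, 1>) performs the same operation as
              // (pshufd <1, 0, 2, 3>) on the v4i32 bitcast of its input.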
   20139     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
   20140       int DMask[] = {0, 1, 2, 3};
   20141       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
   20142       DMask[DOffset + 0] = DOffset + 1;
   20143       DMask[DOffset + 1] = DOffset + 0;
   20144       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
   20145       V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
   20146       DCI.AddToWorklist(V.getNode());
   20147       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
   20148                       getV4X86ShuffleImm8ForMask(DMask, DAG));
   20149       DCI.AddToWorklist(V.getNode());
   20150       return DAG.getNode(ISD::BITCAST, DL, VT, V);
   20151     }
   20152 
   20153     // Look for shuffle patterns which can be implemented as a single unpack.
   20154     // FIXME: This doesn't handle the location of the PSHUFD generically, and
   20155     // only works when we have a PSHUFD followed by two half-shuffles.
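              // For example (values chosen for illustration): a PSHUFD <0, 0, 1, 1>
              // followed by a PSHUFLW <0, 0, 1, 1> and a PSHUFHW <0, 0, 1, 1> maps to
              // the word mask {0, 0, 1, 1, 2, 2, 3, 3}, which is a single UNPCKL.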
   20156     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
   20157         (V.getOpcode() == X86ISD::PSHUFLW ||
   20158          V.getOpcode() == X86ISD::PSHUFHW) &&
   20159         V.getOpcode() != N.getOpcode() &&
   20160         V.hasOneUse()) {
   20161       SDValue D = V.getOperand(0);
   20162       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
   20163         D = D.getOperand(0);
   20164       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
   20165         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   20166         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
   20167         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   20168         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   20169         int WordMask[8];
   20170         for (int i = 0; i < 4; ++i) {
   20171           WordMask[i + NOffset] = Mask[i] + NOffset;
   20172           WordMask[i + VOffset] = VMask[i] + VOffset;
   20173         }
   20174         // Map the word mask through the DWord mask.
   20175         int MappedMask[8];
   20176         for (int i = 0; i < 8; ++i)
   20177           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
   20178         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
   20179             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
   20180           // We can replace all three shuffles with an unpack.
   20181           V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
   20182           DCI.AddToWorklist(V.getNode());
   20183           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
   20184                                                 : X86ISD::UNPCKH,
   20185                              DL, VT, V, V);
   20186         }
   20187       }
   20188     }
   20189 
   20190     break;
   20191 
   20192   case X86ISD::PSHUFD:
   20193     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
   20194       return NewN;
   20195 
   20196     break;
   20197   }
   20198 
   20199   return SDValue();
   20200 }
   20201 
   20202 /// \brief Try to combine a shuffle into a target-specific add-sub node.
   20203 ///
   20204 /// We combine this directly on the abstract vector shuffle nodes so it is
    20205 /// easier to generically match. We also insert dummy vector shuffle nodes for
    20206 /// the operands which explicitly discard the lanes unused by this operation,
    20207 /// so that the fact that they're unused can flow through the rest of the
    20208 /// combiner.
   20209 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
   20210   SDLoc DL(N);
   20211   EVT VT = N->getValueType(0);
   20212 
   20213   // We only handle target-independent shuffles.
   20214   // FIXME: It would be easy and harmless to use the target shuffle mask
   20215   // extraction tool to support more.
   20216   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
   20217     return SDValue();
   20218 
   20219   auto *SVN = cast<ShuffleVectorSDNode>(N);
   20220   ArrayRef<int> Mask = SVN->getMask();
   20221   SDValue V1 = N->getOperand(0);
   20222   SDValue V2 = N->getOperand(1);
   20223 
   20224   // We require the first shuffle operand to be the SUB node, and the second to
   20225   // be the ADD node.
   20226   // FIXME: We should support the commuted patterns.
   20227   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
   20228     return SDValue();
   20229 
   20230   // If there are other uses of these operations we can't fold them.
   20231   if (!V1->hasOneUse() || !V2->hasOneUse())
   20232     return SDValue();
   20233 
   20234   // Ensure that both operations have the same operands. Note that we can
   20235   // commute the FADD operands.
   20236   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
   20237   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
   20238       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
   20239     return SDValue();
   20240 
   20241   // We're looking for blends between FADD and FSUB nodes. We insist on these
   20242   // nodes being lined up in a specific expected pattern.
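            // For example, for v4f32 the blend mask <0, 5, 2, 7> takes lanes 0 and 2
            // from the FSUB node and lanes 1 and 3 from the FADD node, matching the
            // subtract-even / add-odd behavior of ADDSUB.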
   20243   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
   20244         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
   20245         isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
   20246     return SDValue();
   20247 
   20248   // Only specific types are legal at this point, assert so we notice if and
   20249   // when these change.
   20250   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
   20251           VT == MVT::v4f64) &&
   20252          "Unknown vector type encountered!");
   20253 
   20254   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
   20255 }
   20256 
   20257 /// PerformShuffleCombine - Performs several different shuffle combines.
   20258 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   20259                                      TargetLowering::DAGCombinerInfo &DCI,
   20260                                      const X86Subtarget *Subtarget) {
   20261   SDLoc dl(N);
   20262   SDValue N0 = N->getOperand(0);
   20263   SDValue N1 = N->getOperand(1);
   20264   EVT VT = N->getValueType(0);
   20265 
   20266   // Don't create instructions with illegal types after legalize types has run.
   20267   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   20268   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
   20269     return SDValue();
   20270 
   20271   // If we have legalized the vector types, look for blends of FADD and FSUB
   20272   // nodes that we can fuse into an ADDSUB node.
   20273   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
   20274     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
   20275       return AddSub;
   20276 
   20277   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
   20278   if (Subtarget->hasFp256() && VT.is256BitVector() &&
   20279       N->getOpcode() == ISD::VECTOR_SHUFFLE)
   20280     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
   20281 
   20282   // During Type Legalization, when promoting illegal vector types,
   20283   // the backend might introduce new shuffle dag nodes and bitcasts.
   20284   //
   20285   // This code performs the following transformation:
   20286   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
   20287   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
   20288   //
   20289   // We do this only if both the bitcast and the BINOP dag nodes have
   20290   // one use. Also, perform this transformation only if the new binary
   20291   // operation is legal. This is to avoid introducing dag nodes that
   20292   // potentially need to be further expanded (or custom lowered) into a
   20293   // less optimal sequence of dag nodes.
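            // For example (illustrative types), with VT = v8i16 and a v4i32 BINOP,
            // the shuffle mask accepted below is <0, 2, 4, 6, u, u, u, u>.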
   20294   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
   20295       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
   20296       N0.getOpcode() == ISD::BITCAST) {
   20297     SDValue BC0 = N0.getOperand(0);
   20298     EVT SVT = BC0.getValueType();
   20299     unsigned Opcode = BC0.getOpcode();
   20300     unsigned NumElts = VT.getVectorNumElements();
   20301 
   20302     if (BC0.hasOneUse() && SVT.isVector() &&
   20303         SVT.getVectorNumElements() * 2 == NumElts &&
   20304         TLI.isOperationLegal(Opcode, VT)) {
   20305       bool CanFold = false;
   20306       switch (Opcode) {
   20307       default : break;
   20308       case ISD::ADD :
   20309       case ISD::FADD :
   20310       case ISD::SUB :
   20311       case ISD::FSUB :
   20312       case ISD::MUL :
   20313       case ISD::FMUL :
   20314         CanFold = true;
   20315       }
   20316 
   20317       unsigned SVTNumElts = SVT.getVectorNumElements();
   20318       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   20319       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
   20320         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
   20321       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
   20322         CanFold = SVOp->getMaskElt(i) < 0;
   20323 
   20324       if (CanFold) {
   20325         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
   20326         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
   20327         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
   20328         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
   20329       }
   20330     }
   20331   }
   20332 
   20333   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
   20334   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
   20335   // consecutive, non-overlapping, and in the right order.
   20336   SmallVector<SDValue, 16> Elts;
   20337   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
   20338     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
   20339 
   20340   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
   20341   if (LD.getNode())
   20342     return LD;
   20343 
   20344   if (isTargetShuffle(N->getOpcode())) {
   20345     SDValue Shuffle =
   20346         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
   20347     if (Shuffle.getNode())
   20348       return Shuffle;
   20349 
   20350     // Try recursively combining arbitrary sequences of x86 shuffle
   20351     // instructions into higher-order shuffles. We do this after combining
   20352     // specific PSHUF instruction sequences into their minimal form so that we
   20353     // can evaluate how many specialized shuffle instructions are involved in
   20354     // a particular chain.
   20355     SmallVector<int, 1> NonceMask; // Just a placeholder.
   20356     NonceMask.push_back(0);
   20357     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
   20358                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
   20359                                       DCI, Subtarget))
   20360       return SDValue(); // This routine will use CombineTo to replace N.
   20361   }
   20362 
   20363   return SDValue();
   20364 }
   20365 
    20366 /// PerformTruncateCombine - Converts a truncate operation to
    20367 /// a sequence of vector shuffle operations.
    20368 /// This is possible when we truncate a 256-bit vector to a 128-bit vector.
   20369 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
   20370                                       TargetLowering::DAGCombinerInfo &DCI,
   20371                                       const X86Subtarget *Subtarget)  {
   20372   return SDValue();
   20373 }
   20374 
   20375 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
   20376 /// specific shuffle of a load can be folded into a single element load.
   20377 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
   20378 /// shuffles have been custom lowered so we need to handle those here.
   20379 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   20380                                          TargetLowering::DAGCombinerInfo &DCI) {
   20381   if (DCI.isBeforeLegalizeOps())
   20382     return SDValue();
   20383 
   20384   SDValue InVec = N->getOperand(0);
   20385   SDValue EltNo = N->getOperand(1);
   20386 
   20387   if (!isa<ConstantSDNode>(EltNo))
   20388     return SDValue();
   20389 
   20390   EVT OriginalVT = InVec.getValueType();
   20391 
   20392   if (InVec.getOpcode() == ISD::BITCAST) {
   20393     // Don't duplicate a load with other uses.
   20394     if (!InVec.hasOneUse())
   20395       return SDValue();
   20396     EVT BCVT = InVec.getOperand(0).getValueType();
   20397     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
   20398       return SDValue();
   20399     InVec = InVec.getOperand(0);
   20400   }
   20401 
   20402   EVT CurrentVT = InVec.getValueType();
   20403 
   20404   if (!isTargetShuffle(InVec.getOpcode()))
   20405     return SDValue();
   20406 
   20407   // Don't duplicate a load with other uses.
   20408   if (!InVec.hasOneUse())
   20409     return SDValue();
   20410 
   20411   SmallVector<int, 16> ShuffleMask;
   20412   bool UnaryShuffle;
   20413   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
   20414                             ShuffleMask, UnaryShuffle))
   20415     return SDValue();
   20416 
   20417   // Select the input vector, guarding against out of range extract vector.
   20418   unsigned NumElems = CurrentVT.getVectorNumElements();
   20419   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   20420   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
   20421   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
   20422                                          : InVec.getOperand(1);
   20423 
    20424   // If the inputs to the shuffle are the same for both operands, then allow 2 uses.
   20425   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
   20426                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
   20427 
   20428   if (LdNode.getOpcode() == ISD::BITCAST) {
   20429     // Don't duplicate a load with other uses.
   20430     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
   20431       return SDValue();
   20432 
   20433     AllowedUses = 1; // only allow 1 load use if we have a bitcast
   20434     LdNode = LdNode.getOperand(0);
   20435   }
   20436 
   20437   if (!ISD::isNormalLoad(LdNode.getNode()))
   20438     return SDValue();
   20439 
   20440   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
   20441 
    20442   if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
   20443     return SDValue();
   20444 
   20445   EVT EltVT = N->getValueType(0);
   20446   // If there's a bitcast before the shuffle, check if the load type and
   20447   // alignment is valid.
   20448   unsigned Align = LN0->getAlignment();
   20449   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   20450   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
   20451       EltVT.getTypeForEVT(*DAG.getContext()));
   20452 
   20453   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
   20454     return SDValue();
   20455 
    20456   // All checks match, so transform back to vector_shuffle so that the DAG
    20457   // combiner can finish the job.
   20458   SDLoc dl(N);
   20459 
    20460   // Create a shuffle node, taking into account the case that it's a unary shuffle.
   20461   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
   20462                                    : InVec.getOperand(1);
   20463   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
   20464                                  InVec.getOperand(0), Shuffle,
   20465                                  &ShuffleMask[0]);
   20466   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
   20467   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
   20468                      EltNo);
   20469 }
   20470 
    20471 /// \brief Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
   20472 /// special and don't usually play with other vector types, it's better to
   20473 /// handle them early to be sure we emit efficient code by avoiding
   20474 /// store-load conversions.
   20475 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
   20476   if (N->getValueType(0) != MVT::x86mmx ||
   20477       N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
   20478       N->getOperand(0)->getValueType(0) != MVT::v2i32)
   20479     return SDValue();
   20480 
   20481   SDValue V = N->getOperand(0);
   20482   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
   20483   if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
   20484     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
   20485                        N->getValueType(0), V.getOperand(0));
   20486 
   20487   return SDValue();
   20488 }
   20489 
   20490 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
   20491 /// generation and convert it from being a bunch of shuffles and extracts
   20492 /// into a somewhat faster sequence. For i686, the best sequence is apparently
   20493 /// storing the value and loading scalars back, while for x64 we should
   20494 /// use 64-bit extracts and shifts.
   20495 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   20496                                          TargetLowering::DAGCombinerInfo &DCI) {
   20497   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
   20498   if (NewOp.getNode())
   20499     return NewOp;
   20500 
   20501   SDValue InputVector = N->getOperand(0);
   20502 
   20503   // Detect mmx to i32 conversion through a v2i32 elt extract.
   20504   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
   20505       N->getValueType(0) == MVT::i32 &&
   20506       InputVector.getValueType() == MVT::v2i32) {
   20507 
   20508     // The bitcast source is a direct mmx result.
   20509     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
   20510     if (MMXSrc.getValueType() == MVT::x86mmx)
   20511       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
   20512                          N->getValueType(0),
   20513                          InputVector.getNode()->getOperand(0));
   20514 
   20515     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
   20516     SDValue MMXSrcOp = MMXSrc.getOperand(0);
   20517     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
   20518         MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
   20519         MMXSrcOp.getOpcode() == ISD::BITCAST &&
   20520         MMXSrcOp.getValueType() == MVT::v1i64 &&
   20521         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
   20522       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
   20523                          N->getValueType(0),
   20524                          MMXSrcOp.getOperand(0));
   20525   }
   20526 
   20527   // Only operate on vectors of 4 elements, where the alternative shuffling
   20528   // gets to be more expensive.
   20529   if (InputVector.getValueType() != MVT::v4i32)
   20530     return SDValue();
   20531 
   20532   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
   20533   // single use which is a sign-extend or zero-extend, and all elements are
   20534   // used.
   20535   SmallVector<SDNode *, 4> Uses;
   20536   unsigned ExtractedElements = 0;
   20537   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
   20538        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
   20539     if (UI.getUse().getResNo() != InputVector.getResNo())
   20540       return SDValue();
   20541 
   20542     SDNode *Extract = *UI;
   20543     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   20544       return SDValue();
   20545 
   20546     if (Extract->getValueType(0) != MVT::i32)
   20547       return SDValue();
   20548     if (!Extract->hasOneUse())
   20549       return SDValue();
   20550     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
   20551         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
   20552       return SDValue();
   20553     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
   20554       return SDValue();
   20555 
   20556     // Record which element was extracted.
   20557     ExtractedElements |=
   20558       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
   20559 
   20560     Uses.push_back(Extract);
   20561   }
   20562 
   20563   // If not all the elements were used, this may not be worthwhile.
   20564   if (ExtractedElements != 15)
   20565     return SDValue();
   20566 
   20567   // Ok, we've now decided to do the transformation.
   20568   // If 64-bit shifts are legal, use the extract-shift sequence,
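            // For example, when 64-bit shifts are legal (e.g. on x86-64), element 1
            // of the v4i32 input becomes
            // (trunc (sra (extract_elt (bitcast v2i64), 0), 32)).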
   20569   // otherwise bounce the vector off the cache.
   20570   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   20571   SDValue Vals[4];
   20572   SDLoc dl(InputVector);
   20573 
   20574   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
   20575     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
   20576     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
   20577     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
   20578       DAG.getConstant(0, VecIdxTy));
   20579     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
   20580       DAG.getConstant(1, VecIdxTy));
   20581 
   20582     SDValue ShAmt = DAG.getConstant(32,
   20583       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
   20584     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
   20585     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   20586       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
   20587     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
   20588     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
   20589       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
   20590   } else {
   20591     // Store the value to a temporary stack slot.
   20592     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
   20593     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
   20594       MachinePointerInfo(), false, false, 0);
   20595 
   20596     EVT ElementType = InputVector.getValueType().getVectorElementType();
   20597     unsigned EltSize = ElementType.getSizeInBits() / 8;
   20598 
   20599     // Replace each use (extract) with a load of the appropriate element.
   20600     for (unsigned i = 0; i < 4; ++i) {
   20601       uint64_t Offset = EltSize * i;
   20602       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
   20603 
   20604       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
   20605                                        StackPtr, OffsetVal);
   20606 
   20607       // Load the scalar.
   20608       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
   20609                             ScalarAddr, MachinePointerInfo(),
   20610                             false, false, false, 0);
   20611 
   20612     }
   20613   }
   20614 
   20615   // Replace the extracts
   20616   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
   20617     UE = Uses.end(); UI != UE; ++UI) {
   20618     SDNode *Extract = *UI;
   20619 
   20620     SDValue Idx = Extract->getOperand(1);
   20621     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   20622     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
   20623   }
   20624 
   20625   // The replacement was made in place; don't return anything.
   20626   return SDValue();
   20627 }
   20628 
    20629 /// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
   20630 static std::pair<unsigned, bool>
   20631 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
   20632                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
   20633   if (!VT.isVector())
   20634     return std::make_pair(0, false);
   20635 
   20636   bool NeedSplit = false;
   20637   switch (VT.getSimpleVT().SimpleTy) {
   20638   default: return std::make_pair(0, false);
   20639   case MVT::v4i64:
   20640   case MVT::v2i64:
   20641     if (!Subtarget->hasVLX())
   20642       return std::make_pair(0, false);
   20643     break;
   20644   case MVT::v64i8:
   20645   case MVT::v32i16:
   20646     if (!Subtarget->hasBWI())
   20647       return std::make_pair(0, false);
   20648     break;
   20649   case MVT::v16i32:
   20650   case MVT::v8i64:
   20651     if (!Subtarget->hasAVX512())
   20652       return std::make_pair(0, false);
   20653     break;
   20654   case MVT::v32i8:
   20655   case MVT::v16i16:
   20656   case MVT::v8i32:
   20657     if (!Subtarget->hasAVX2())
   20658       NeedSplit = true;
   20659     if (!Subtarget->hasAVX())
   20660       return std::make_pair(0, false);
   20661     break;
   20662   case MVT::v16i8:
   20663   case MVT::v8i16:
   20664   case MVT::v4i32:
   20665     if (!Subtarget->hasSSE2())
   20666       return std::make_pair(0, false);
   20667   }
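            // When only AVX (not AVX2) is available there is no single instruction
            // for a 256-bit integer min/max; NeedSplit tells the caller to split
            // the operation into two 128-bit halves (see PerformSELECTCombine).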
   20668 
   20669   // SSE2 has only a small subset of the operations.
   20670   bool hasUnsigned = Subtarget->hasSSE41() ||
   20671                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
   20672   bool hasSigned = Subtarget->hasSSE41() ||
   20673                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
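            // (SSE2 itself only provides PMINUB/PMAXUB for v16i8 and
            // PMINSW/PMAXSW for v8i16; SSE4.1 adds the remaining signed/unsigned
            // variants, which is what the two checks above encode.)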
   20674 
   20675   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   20676 
   20677   unsigned Opc = 0;
   20678   // Check for x CC y ? x : y.
   20679   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   20680       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   20681     switch (CC) {
   20682     default: break;
   20683     case ISD::SETULT:
   20684     case ISD::SETULE:
   20685       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
   20686     case ISD::SETUGT:
   20687     case ISD::SETUGE:
   20688       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
   20689     case ISD::SETLT:
   20690     case ISD::SETLE:
   20691       Opc = hasSigned ? X86ISD::SMIN : 0; break;
   20692     case ISD::SETGT:
   20693     case ISD::SETGE:
   20694       Opc = hasSigned ? X86ISD::SMAX : 0; break;
   20695     }
   20696   // Check for x CC y ? y : x -- a min/max with reversed arms.
   20697   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
   20698              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
   20699     switch (CC) {
   20700     default: break;
   20701     case ISD::SETULT:
   20702     case ISD::SETULE:
   20703       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
   20704     case ISD::SETUGT:
   20705     case ISD::SETUGE:
   20706       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
   20707     case ISD::SETLT:
   20708     case ISD::SETLE:
   20709       Opc = hasSigned ? X86ISD::SMAX : 0; break;
   20710     case ISD::SETGT:
   20711     case ISD::SETGE:
   20712       Opc = hasSigned ? X86ISD::SMIN : 0; break;
   20713     }
   20714   }
   20715 
   20716   return std::make_pair(Opc, NeedSplit);
   20717 }
   20718 
   20719 static SDValue
   20720 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
   20721                                       const X86Subtarget *Subtarget) {
   20722   SDLoc dl(N);
   20723   SDValue Cond = N->getOperand(0);
   20724   SDValue LHS = N->getOperand(1);
   20725   SDValue RHS = N->getOperand(2);
   20726 
   20727   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
   20728     SDValue CondSrc = Cond->getOperand(0);
   20729     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
   20730       Cond = CondSrc->getOperand(0);
   20731   }
   20732 
   20733   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
   20734     return SDValue();
   20735 
   20736   // A vselect where all conditions and data are constants can be optimized into
   20737   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   20738   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
   20739       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
   20740     return SDValue();
   20741 
   20742   unsigned MaskValue = 0;
   20743   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
   20744     return SDValue();
   20745 
   20746   MVT VT = N->getSimpleValueType(0);
   20747   unsigned NumElems = VT.getVectorNumElements();
   20748   SmallVector<int, 8> ShuffleMask(NumElems, -1);
   20749   for (unsigned i = 0; i < NumElems; ++i) {
   20750     // Be sure we emit undef where we can.
   20751     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
   20752       ShuffleMask[i] = -1;
   20753     else
   20754       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
   20755   }
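            // For example (illustrative values): with NumElems == 4 and
            // MaskValue == 0b0110 the mask becomes <0, 5, 6, 3>, i.e. lanes 1 and
            // 2 are taken from RHS and the remaining lanes from LHS.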
   20756 
   20757   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   20758   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
   20759     return SDValue();
   20760   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
   20761 }
   20762 
   20763 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
   20764 /// nodes.
   20765 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   20766                                     TargetLowering::DAGCombinerInfo &DCI,
   20767                                     const X86Subtarget *Subtarget) {
   20768   SDLoc DL(N);
   20769   SDValue Cond = N->getOperand(0);
   20770   // Get the LHS/RHS of the select.
   20771   SDValue LHS = N->getOperand(1);
   20772   SDValue RHS = N->getOperand(2);
   20773   EVT VT = LHS.getValueType();
   20774   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   20775 
   20776   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   20777   // instructions match the semantics of the common C idiom x<y?x:y but not
   20778   // x<=y?x:y, because of how they handle negative zero (which can be
   20779   // ignored in unsafe-math mode).
   20780   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
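            // (MINPS/MAXPS and their scalar/AVX forms return the second operand
            // when the inputs are unordered or are zeros of opposite sign, which
            // is why the operand order and the NaN/zero checks below matter.)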
   20781   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
   20782       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
   20783       (Subtarget->hasSSE2() ||
   20784        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
   20785     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   20786 
   20787     unsigned Opcode = 0;
   20788     // Check for x CC y ? x : y.
   20789     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   20790         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   20791       switch (CC) {
   20792       default: break;
   20793       case ISD::SETULT:
   20794         // Converting this to a min would handle NaNs incorrectly, and swapping
   20795         // the operands would cause it to handle comparisons between positive
   20796         // and negative zero incorrectly.
   20797         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   20798           if (!DAG.getTarget().Options.UnsafeFPMath &&
   20799               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   20800             break;
   20801           std::swap(LHS, RHS);
   20802         }
   20803         Opcode = X86ISD::FMIN;
   20804         break;
   20805       case ISD::SETOLE:
   20806         // Converting this to a min would handle comparisons between positive
   20807         // and negative zero incorrectly.
   20808         if (!DAG.getTarget().Options.UnsafeFPMath &&
   20809             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   20810           break;
   20811         Opcode = X86ISD::FMIN;
   20812         break;
   20813       case ISD::SETULE:
   20814         // Converting this to a min would handle both negative zeros and NaNs
   20815         // incorrectly, but we can swap the operands to fix both.
   20816         std::swap(LHS, RHS);
   20817       case ISD::SETOLT:
   20818       case ISD::SETLT:
   20819       case ISD::SETLE:
   20820         Opcode = X86ISD::FMIN;
   20821         break;
   20822 
   20823       case ISD::SETOGE:
   20824         // Converting this to a max would handle comparisons between positive
   20825         // and negative zero incorrectly.
   20826         if (!DAG.getTarget().Options.UnsafeFPMath &&
   20827             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
   20828           break;
   20829         Opcode = X86ISD::FMAX;
   20830         break;
   20831       case ISD::SETUGT:
   20832         // Converting this to a max would handle NaNs incorrectly, and swapping
   20833         // the operands would cause it to handle comparisons between positive
   20834         // and negative zero incorrectly.
   20835         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   20836           if (!DAG.getTarget().Options.UnsafeFPMath &&
   20837               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
   20838             break;
   20839           std::swap(LHS, RHS);
   20840         }
   20841         Opcode = X86ISD::FMAX;
   20842         break;
   20843       case ISD::SETUGE:
   20844         // Converting this to a max would handle both negative zeros and NaNs
   20845         // incorrectly, but we can swap the operands to fix both.
   20846         std::swap(LHS, RHS);
   20847       case ISD::SETOGT:
   20848       case ISD::SETGT:
   20849       case ISD::SETGE:
   20850         Opcode = X86ISD::FMAX;
   20851         break;
   20852       }
   20853     // Check for x CC y ? y : x -- a min/max with reversed arms.
   20854     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
   20855                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
   20856       switch (CC) {
   20857       default: break;
   20858       case ISD::SETOGE:
   20859         // Converting this to a min would handle comparisons between positive
   20860         // and negative zero incorrectly, and swapping the operands would
   20861         // cause it to handle NaNs incorrectly.
   20862         if (!DAG.getTarget().Options.UnsafeFPMath &&
   20863             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
   20864           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   20865             break;
   20866           std::swap(LHS, RHS);
   20867         }
   20868         Opcode = X86ISD::FMIN;
   20869         break;
   20870       case ISD::SETUGT:
   20871         // Converting this to a min would handle NaNs incorrectly.
   20872         if (!DAG.getTarget().Options.UnsafeFPMath &&
   20873             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
   20874           break;
   20875         Opcode = X86ISD::FMIN;
   20876         break;
   20877       case ISD::SETUGE:
   20878         // Converting this to a min would handle both negative zeros and NaNs
   20879         // incorrectly, but we can swap the operands to fix both.
   20880         std::swap(LHS, RHS);
   20881       case ISD::SETOGT:
   20882       case ISD::SETGT:
   20883       case ISD::SETGE:
   20884         Opcode = X86ISD::FMIN;
   20885         break;
   20886 
   20887       case ISD::SETULT:
   20888         // Converting this to a max would handle NaNs incorrectly.
   20889         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   20890           break;
   20891         Opcode = X86ISD::FMAX;
   20892         break;
   20893       case ISD::SETOLE:
   20894         // Converting this to a max would handle comparisons between positive
   20895         // and negative zero incorrectly, and swapping the operands would
   20896         // cause it to handle NaNs incorrectly.
   20897         if (!DAG.getTarget().Options.UnsafeFPMath &&
   20898             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
   20899           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   20900             break;
   20901           std::swap(LHS, RHS);
   20902         }
   20903         Opcode = X86ISD::FMAX;
   20904         break;
   20905       case ISD::SETULE:
   20906         // Converting this to a max would handle both negative zeros and NaNs
   20907         // incorrectly, but we can swap the operands to fix both.
   20908         std::swap(LHS, RHS);
   20909       case ISD::SETOLT:
   20910       case ISD::SETLT:
   20911       case ISD::SETLE:
   20912         Opcode = X86ISD::FMAX;
   20913         break;
   20914       }
   20915     }
   20916 
   20917     if (Opcode)
   20918       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   20919   }
   20920 
   20921   EVT CondVT = Cond.getValueType();
   20922   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
   20923       CondVT.getVectorElementType() == MVT::i1) {
    20924     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
    20925     // lowering on KNL. In this case we convert it to
    20926     // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
    20927     // The same applies to all 128- and 256-bit vectors of i8 and i16.
    20928     // Starting with SKX these selects have a proper lowering.
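              // (For example, on KNL a v32i8 vselect with a v32i1 condition is
              // re-issued below with a sign-extended v32i8 condition, which AVX2
              // can lower as a variable byte blend such as VPBLENDVB.)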
   20929     EVT OpVT = LHS.getValueType();
   20930     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
   20931         (OpVT.getVectorElementType() == MVT::i8 ||
   20932          OpVT.getVectorElementType() == MVT::i16) &&
   20933         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
   20934       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
   20935       DCI.AddToWorklist(Cond.getNode());
   20936       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
   20937     }
   20938   }
   20939   // If this is a select between two integer constants, try to do some
   20940   // optimizations.
   20941   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
   20942     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
   20943       // Don't do this for crazy integer types.
   20944       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
   20945         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
   20946         // so that TrueC (the true value) is larger than FalseC.
   20947         bool NeedsCondInvert = false;
   20948 
   20949         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
   20950             // Efficiently invertible.
   20951             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
   20952              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
   20953               isa<ConstantSDNode>(Cond.getOperand(1))))) {
   20954           NeedsCondInvert = true;
   20955           std::swap(TrueC, FalseC);
   20956         }
   20957 
   20958         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
   20959         if (FalseC->getAPIntValue() == 0 &&
   20960             TrueC->getAPIntValue().isPowerOf2()) {
   20961           if (NeedsCondInvert) // Invert the condition if needed.
   20962             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   20963                                DAG.getConstant(1, Cond.getValueType()));
   20964 
   20965           // Zero extend the condition if needed.
   20966           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
   20967 
   20968           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   20969           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
   20970                              DAG.getConstant(ShAmt, MVT::i8));
   20971         }
   20972 
    20973         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
   20974         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   20975           if (NeedsCondInvert) // Invert the condition if needed.
   20976             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   20977                                DAG.getConstant(1, Cond.getValueType()));
   20978 
   20979           // Zero extend the condition if needed.
   20980           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   20981                              FalseC->getValueType(0), Cond);
   20982           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   20983                              SDValue(FalseC, 0));
   20984         }
   20985 
   20986         // Optimize cases that will turn into an LEA instruction.  This requires
   20987         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   20988         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   20989           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   20990           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   20991 
   20992           bool isFastMultiplier = false;
   20993           if (Diff < 10) {
   20994             switch ((unsigned char)Diff) {
   20995               default: break;
   20996               case 1:  // result = add base, cond
   20997               case 2:  // result = lea base(    , cond*2)
   20998               case 3:  // result = lea base(cond, cond*2)
   20999               case 4:  // result = lea base(    , cond*4)
   21000               case 5:  // result = lea base(cond, cond*4)
   21001               case 8:  // result = lea base(    , cond*8)
   21002               case 9:  // result = lea base(cond, cond*8)
   21003                 isFastMultiplier = true;
   21004                 break;
   21005             }
   21006           }
   21007 
   21008           if (isFastMultiplier) {
   21009             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   21010             if (NeedsCondInvert) // Invert the condition if needed.
   21011               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
   21012                                  DAG.getConstant(1, Cond.getValueType()));
   21013 
   21014             // Zero extend the condition if needed.
   21015             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   21016                                Cond);
   21017             // Scale the condition by the difference.
   21018             if (Diff != 1)
   21019               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   21020                                  DAG.getConstant(Diff, Cond.getValueType()));
   21021 
   21022             // Add the base if non-zero.
   21023             if (FalseC->getAPIntValue() != 0)
   21024               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   21025                                  SDValue(FalseC, 0));
   21026             return Cond;
   21027           }
   21028         }
   21029       }
   21030   }
   21031 
   21032   // Canonicalize max and min:
   21033   // (x > y) ? x : y -> (x >= y) ? x : y
   21034   // (x < y) ? x : y -> (x <= y) ? x : y
   21035   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
    21036   // the need for an extra compare against zero, e.g.:
    21037   //
    21038   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
   21039   // subl   %esi, %edi
   21040   // testl  %edi, %edi
   21041   // movl   $0, %eax
   21042   // cmovgl %edi, %eax
   21043   // =>
   21044   // xorl   %eax, %eax
    21045   // subl   %esi, %edi
   21046   // cmovsl %eax, %edi
   21047   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
   21048       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   21049       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   21050     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   21051     switch (CC) {
   21052     default: break;
   21053     case ISD::SETLT:
   21054     case ISD::SETGT: {
   21055       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
   21056       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
   21057                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
   21058       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
   21059     }
   21060     }
   21061   }
   21062 
   21063   // Early exit check
   21064   if (!TLI.isTypeLegal(VT))
   21065     return SDValue();
   21066 
   21067   // Match VSELECTs into subs with unsigned saturation.
   21068   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
   21069       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
   21070       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
   21071        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
   21072     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   21073 
   21074     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
   21075     // left side invert the predicate to simplify logic below.
   21076     SDValue Other;
   21077     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
   21078       Other = RHS;
   21079       CC = ISD::getSetCCInverse(CC, true);
   21080     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
   21081       Other = LHS;
   21082     }
   21083 
   21084     if (Other.getNode() && Other->getNumOperands() == 2 &&
   21085         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
   21086       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
   21087       SDValue CondRHS = Cond->getOperand(1);
   21088 
   21089       // Look for a general sub with unsigned saturation first.
   21090       // x >= y ? x-y : 0 --> subus x, y
   21091       // x >  y ? x-y : 0 --> subus x, y
   21092       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
   21093           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
   21094         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
   21095 
   21096       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
   21097         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
   21098           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
   21099             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
   21100               // If the RHS is a constant we have to reverse the const
   21101               // canonicalization.
    21102               // x > C-1 ? x + (-C) : 0 --> subus x, C
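                        // (E.g. for v16i8 and C == 64 this matches
                        //  x >u 63 ? x + 192 : 0 and emits subus x, 64.)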
   21103               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
   21104                   CondRHSConst->getAPIntValue() ==
   21105                       (-OpRHSConst->getAPIntValue() - 1))
   21106                 return DAG.getNode(
   21107                     X86ISD::SUBUS, DL, VT, OpLHS,
   21108                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
   21109 
   21110           // Another special case: If C was a sign bit, the sub has been
   21111           // canonicalized into a xor.
   21112           // FIXME: Would it be better to use computeKnownBits to determine
   21113           //        whether it's safe to decanonicalize the xor?
   21114           // x s< 0 ? x^C : 0 --> subus x, C
   21115           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
   21116               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
   21117               OpRHSConst->getAPIntValue().isSignBit())
   21118             // Note that we have to rebuild the RHS constant here to ensure we
   21119             // don't rely on particular values of undef lanes.
   21120             return DAG.getNode(
   21121                 X86ISD::SUBUS, DL, VT, OpLHS,
   21122                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
   21123         }
   21124     }
   21125   }
   21126 
   21127   // Try to match a min/max vector operation.
   21128   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
   21129     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
   21130     unsigned Opc = ret.first;
   21131     bool NeedSplit = ret.second;
   21132 
   21133     if (Opc && NeedSplit) {
   21134       unsigned NumElems = VT.getVectorNumElements();
   21135       // Extract the LHS vectors
   21136       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
   21137       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
   21138 
   21139       // Extract the RHS vectors
   21140       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
   21141       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
   21142 
   21143       // Create min/max for each subvector
   21144       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
   21145       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
   21146 
   21147       // Merge the result
   21148       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
   21149     } else if (Opc)
   21150       return DAG.getNode(Opc, DL, VT, LHS, RHS);
   21151   }
   21152 
   21153   // Simplify vector selection if condition value type matches vselect
   21154   // operand type
   21155   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
   21156     assert(Cond.getValueType().isVector() &&
   21157            "vector select expects a vector selector!");
   21158 
   21159     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
   21160     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
   21161 
    21162     // Try to invert the condition if the true value is not all 1s and the
    21163     // false value is not all 0s.
   21164     if (!TValIsAllOnes && !FValIsAllZeros &&
   21165         // Check if the selector will be produced by CMPP*/PCMP*
   21166         Cond.getOpcode() == ISD::SETCC &&
   21167         // Check if SETCC has already been promoted
   21168         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
   21169       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
   21170       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
   21171 
   21172       if (TValIsAllZeros || FValIsAllOnes) {
   21173         SDValue CC = Cond.getOperand(2);
   21174         ISD::CondCode NewCC =
   21175           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
   21176                                Cond.getOperand(0).getValueType().isInteger());
   21177         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
   21178         std::swap(LHS, RHS);
   21179         TValIsAllOnes = FValIsAllOnes;
   21180         FValIsAllZeros = TValIsAllZeros;
   21181       }
   21182     }
   21183 
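              // (vselect Cond, all-ones, X) can be emitted as (or Cond, X) and
              // (vselect Cond, X, all-zeros) as (and Cond, X); both cases, as
              // well as (vselect Cond, all-ones, all-zeros) folding to Cond
              // itself, are handled below.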
   21184     if (TValIsAllOnes || FValIsAllZeros) {
   21185       SDValue Ret;
   21186 
   21187       if (TValIsAllOnes && FValIsAllZeros)
   21188         Ret = Cond;
   21189       else if (TValIsAllOnes)
   21190         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
   21191                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
   21192       else if (FValIsAllZeros)
   21193         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
   21194                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
   21195 
   21196       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
   21197     }
   21198   }
   21199 
   21200   // We should generate an X86ISD::BLENDI from a vselect if its argument
   21201   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
   21202   // constants. This specific pattern gets generated when we split a
   21203   // selector for a 512 bit vector in a machine without AVX512 (but with
   21204   // 256-bit vectors), during legalization:
   21205   //
   21206   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
   21207   //
   21208   // Iff we find this pattern and the build_vectors are built from
   21209   // constants, we translate the vselect into a shuffle_vector that we
   21210   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
   21211   if ((N->getOpcode() == ISD::VSELECT ||
   21212        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
   21213       !DCI.isBeforeLegalize()) {
   21214     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
   21215     if (Shuffle.getNode())
   21216       return Shuffle;
   21217   }
   21218 
   21219   // If this is a *dynamic* select (non-constant condition) and we can match
   21220   // this node with one of the variable blend instructions, restructure the
   21221   // condition so that the blends can use the high bit of each element and use
   21222   // SimplifyDemandedBits to simplify the condition operand.
   21223   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
   21224       !DCI.isBeforeLegalize() &&
   21225       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
   21226     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
   21227 
   21228     // Don't optimize vector selects that map to mask-registers.
   21229     if (BitWidth == 1)
   21230       return SDValue();
   21231 
   21232     // We can only handle the cases where VSELECT is directly legal on the
   21233     // subtarget. We custom lower VSELECT nodes with constant conditions and
   21234     // this makes it hard to see whether a dynamic VSELECT will correctly
   21235     // lower, so we both check the operation's status and explicitly handle the
   21236     // cases where a *dynamic* blend will fail even though a constant-condition
   21237     // blend could be custom lowered.
   21238     // FIXME: We should find a better way to handle this class of problems.
   21239     // Potentially, we should combine constant-condition vselect nodes
   21240     // pre-legalization into shuffles and not mark as many types as custom
   21241     // lowered.
   21242     if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
   21243       return SDValue();
   21244     // FIXME: We don't support i16-element blends currently. We could and
   21245     // should support them by making *all* the bits in the condition be set
   21246     // rather than just the high bit and using an i8-element blend.
   21247     if (VT.getScalarType() == MVT::i16)
   21248       return SDValue();
   21249     // Dynamic blending was only available from SSE4.1 onward.
   21250     if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
   21251       return SDValue();
   21252     // Byte blends are only available in AVX2
   21253     if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
   21254         !Subtarget->hasAVX2())
   21255       return SDValue();
   21256 
   21257     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
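              // The variable blend instructions (e.g. BLENDVPS/BLENDVPD/PBLENDVB)
              // only inspect the sign bit of each condition element, so only the
              // high bit of each lane is demanded here.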
   21258     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
   21259 
   21260     APInt KnownZero, KnownOne;
   21261     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
   21262                                           DCI.isBeforeLegalizeOps());
   21263     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
   21264         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
   21265                                  TLO)) {
   21266       // If we changed the computation somewhere in the DAG, this change
   21267       // will affect all users of Cond.
   21268       // Make sure it is fine and update all the nodes so that we do not
   21269       // use the generic VSELECT anymore. Otherwise, we may perform
   21270       // wrong optimizations as we messed up with the actual expectation
   21271       // for the vector boolean values.
   21272       if (Cond != TLO.Old) {
    21273         // Check all uses of the condition operand to see whether it will be
    21274         // consumed by non-BLEND instructions, which may depend on all bits
    21275         // being set properly.
   21276         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
   21277              I != E; ++I)
   21278           if (I->getOpcode() != ISD::VSELECT)
   21279             // TODO: Add other opcodes eventually lowered into BLEND.
   21280             return SDValue();
   21281 
   21282         // Update all the users of the condition, before committing the change,
   21283         // so that the VSELECT optimizations that expect the correct vector
   21284         // boolean value will not be triggered.
   21285         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
   21286              I != E; ++I)
   21287           DAG.ReplaceAllUsesOfValueWith(
   21288               SDValue(*I, 0),
   21289               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
   21290                           Cond, I->getOperand(1), I->getOperand(2)));
   21291         DCI.CommitTargetLoweringOpt(TLO);
   21292         return SDValue();
   21293       }
   21294       // At this point, only Cond is changed. Change the condition
   21295       // just for N to keep the opportunity to optimize all other
   21296       // users their own way.
   21297       DAG.ReplaceAllUsesOfValueWith(
   21298           SDValue(N, 0),
   21299           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
   21300                       TLO.New, N->getOperand(1), N->getOperand(2)));
   21301       return SDValue();
   21302     }
   21303   }
   21304 
   21305   return SDValue();
   21306 }
   21307 
   21308 // Check whether a boolean test is testing a boolean value generated by
   21309 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
   21310 // code.
   21311 //
   21312 // Simplify the following patterns:
   21313 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
   21314 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
   21315 // to (Op EFLAGS Cond)
   21316 //
   21317 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
   21318 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
   21319 // to (Op EFLAGS !Cond)
   21320 //
   21321 // where Op could be BRCOND or CMOV.
   21322 //
   21323 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   21324   // Quit if not CMP and SUB with its value result used.
   21325   if (Cmp.getOpcode() != X86ISD::CMP &&
   21326       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
   21327       return SDValue();
   21328 
   21329   // Quit if not used as a boolean value.
   21330   if (CC != X86::COND_E && CC != X86::COND_NE)
   21331     return SDValue();
   21332 
   21333   // Check CMP operands. One of them should be 0 or 1 and the other should be
    21334   // a SetCC or extended from it.
   21335   SDValue Op1 = Cmp.getOperand(0);
   21336   SDValue Op2 = Cmp.getOperand(1);
   21337 
   21338   SDValue SetCC;
   21339   const ConstantSDNode* C = nullptr;
   21340   bool needOppositeCond = (CC == X86::COND_E);
   21341   bool checkAgainstTrue = false; // Is it a comparison against 1?
   21342 
   21343   if ((C = dyn_cast<ConstantSDNode>(Op1)))
   21344     SetCC = Op2;
   21345   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
   21346     SetCC = Op1;
    21347   else // Quit if neither operand is a constant.
   21348     return SDValue();
   21349 
   21350   if (C->getZExtValue() == 1) {
   21351     needOppositeCond = !needOppositeCond;
   21352     checkAgainstTrue = true;
   21353   } else if (C->getZExtValue() != 0)
    21354     // Quit if the constant is neither 0 nor 1.
   21355     return SDValue();
   21356 
   21357   bool truncatedToBoolWithAnd = false;
   21358   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
   21359   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
   21360          SetCC.getOpcode() == ISD::TRUNCATE ||
   21361          SetCC.getOpcode() == ISD::AND) {
   21362     if (SetCC.getOpcode() == ISD::AND) {
   21363       int OpIdx = -1;
   21364       ConstantSDNode *CS;
   21365       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
   21366           CS->getZExtValue() == 1)
   21367         OpIdx = 1;
   21368       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
   21369           CS->getZExtValue() == 1)
   21370         OpIdx = 0;
   21371       if (OpIdx == -1)
   21372         break;
   21373       SetCC = SetCC.getOperand(OpIdx);
   21374       truncatedToBoolWithAnd = true;
   21375     } else
   21376       SetCC = SetCC.getOperand(0);
   21377   }
   21378 
   21379   switch (SetCC.getOpcode()) {
   21380   case X86ISD::SETCC_CARRY:
   21381     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
   21382     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
   21383     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
   21384     // truncated to i1 using 'and'.
   21385     if (checkAgainstTrue && !truncatedToBoolWithAnd)
   21386       break;
   21387     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
   21388            "Invalid use of SETCC_CARRY!");
   21389     // FALL THROUGH
   21390   case X86ISD::SETCC:
   21391     // Set the condition code or opposite one if necessary.
   21392     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
   21393     if (needOppositeCond)
   21394       CC = X86::GetOppositeBranchCondition(CC);
   21395     return SetCC.getOperand(1);
   21396   case X86ISD::CMOV: {
   21397     // Check whether false/true value has canonical one, i.e. 0 or 1.
   21398     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
   21399     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
   21400     // Quit if true value is not a constant.
   21401     if (!TVal)
   21402       return SDValue();
   21403     // Quit if false value is not a constant.
   21404     if (!FVal) {
   21405       SDValue Op = SetCC.getOperand(0);
   21406       // Skip 'zext' or 'trunc' node.
   21407       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
   21408           Op.getOpcode() == ISD::TRUNCATE)
   21409         Op = Op.getOperand(0);
   21410       // A special case for rdrand/rdseed, where 0 is set if false cond is
   21411       // found.
   21412       if ((Op.getOpcode() != X86ISD::RDRAND &&
   21413            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
   21414         return SDValue();
   21415     }
   21416     // Quit if false value is not the constant 0 or 1.
   21417     bool FValIsFalse = true;
   21418     if (FVal && FVal->getZExtValue() != 0) {
   21419       if (FVal->getZExtValue() != 1)
   21420         return SDValue();
   21421       // If FVal is 1, opposite cond is needed.
   21422       needOppositeCond = !needOppositeCond;
   21423       FValIsFalse = false;
   21424     }
   21425     // Quit if TVal is not the constant opposite of FVal.
   21426     if (FValIsFalse && TVal->getZExtValue() != 1)
   21427       return SDValue();
   21428     if (!FValIsFalse && TVal->getZExtValue() != 0)
   21429       return SDValue();
   21430     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
   21431     if (needOppositeCond)
   21432       CC = X86::GetOppositeBranchCondition(CC);
   21433     return SetCC.getOperand(3);
   21434   }
   21435   }
   21436 
   21437   return SDValue();
   21438 }
   21439 
   21440 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
   21441 /// Match:
   21442 ///   (X86or (X86setcc) (X86setcc))
   21443 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
   21444 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
   21445                                            X86::CondCode &CC1, SDValue &Flags,
   21446                                            bool &isAnd) {
   21447   if (Cond->getOpcode() == X86ISD::CMP) {
   21448     ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
   21449     if (!CondOp1C || !CondOp1C->isNullValue())
   21450       return false;
   21451 
   21452     Cond = Cond->getOperand(0);
   21453   }
   21454 
   21455   isAnd = false;
   21456 
   21457   SDValue SetCC0, SetCC1;
   21458   switch (Cond->getOpcode()) {
   21459   default: return false;
   21460   case ISD::AND:
   21461   case X86ISD::AND:
   21462     isAnd = true;
   21463     // fallthru
   21464   case ISD::OR:
   21465   case X86ISD::OR:
   21466     SetCC0 = Cond->getOperand(0);
   21467     SetCC1 = Cond->getOperand(1);
   21468     break;
    21469   }
   21470 
   21471   // Make sure we have SETCC nodes, using the same flags value.
   21472   if (SetCC0.getOpcode() != X86ISD::SETCC ||
   21473       SetCC1.getOpcode() != X86ISD::SETCC ||
   21474       SetCC0->getOperand(1) != SetCC1->getOperand(1))
   21475     return false;
   21476 
   21477   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
   21478   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
   21479   Flags = SetCC0->getOperand(1);
   21480   return true;
   21481 }
   21482 
   21483 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
   21484 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
   21485                                   TargetLowering::DAGCombinerInfo &DCI,
   21486                                   const X86Subtarget *Subtarget) {
   21487   SDLoc DL(N);
   21488 
   21489   // If the flag operand isn't dead, don't touch this CMOV.
   21490   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
   21491     return SDValue();
   21492 
   21493   SDValue FalseOp = N->getOperand(0);
   21494   SDValue TrueOp = N->getOperand(1);
   21495   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
   21496   SDValue Cond = N->getOperand(3);
   21497 
   21498   if (CC == X86::COND_E || CC == X86::COND_NE) {
   21499     switch (Cond.getOpcode()) {
   21500     default: break;
   21501     case X86ISD::BSR:
   21502     case X86ISD::BSF:
    21503       // If the operand of BSR / BSF is proven never zero, ZF cannot be set.
   21504       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
   21505         return (CC == X86::COND_E) ? FalseOp : TrueOp;
   21506     }
   21507   }
   21508 
   21509   SDValue Flags;
   21510 
   21511   Flags = checkBoolTestSetCCCombine(Cond, CC);
   21512   if (Flags.getNode() &&
   21513       // Extra check as FCMOV only supports a subset of X86 cond.
   21514       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
   21515     SDValue Ops[] = { FalseOp, TrueOp,
   21516                       DAG.getConstant(CC, MVT::i8), Flags };
   21517     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   21518   }
   21519 
   21520   // If this is a select between two integer constants, try to do some
   21521   // optimizations.  Note that the operands are ordered the opposite of SELECT
   21522   // operands.
   21523   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
   21524     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
   21525       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
   21526       // larger than FalseC (the false value).
   21527       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
   21528         CC = X86::GetOppositeBranchCondition(CC);
   21529         std::swap(TrueC, FalseC);
   21530         std::swap(TrueOp, FalseOp);
   21531       }
   21532 
   21533       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
   21534       // This is efficient for any integer data type (including i8/i16) and
   21535       // shift amount.
   21536       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
   21537         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   21538                            DAG.getConstant(CC, MVT::i8), Cond);
   21539 
   21540         // Zero extend the condition if needed.
   21541         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
   21542 
   21543         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   21544         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
   21545                            DAG.getConstant(ShAmt, MVT::i8));
   21546         if (N->getNumValues() == 2)  // Dead flag value?
   21547           return DCI.CombineTo(N, Cond, SDValue());
   21548         return Cond;
   21549       }
   21550 
    21551       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
   21552       // for any integer data type, including i8/i16.
   21553       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   21554         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   21555                            DAG.getConstant(CC, MVT::i8), Cond);
   21556 
   21557         // Zero extend the condition if needed.
   21558         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   21559                            FalseC->getValueType(0), Cond);
   21560         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   21561                            SDValue(FalseC, 0));
   21562 
   21563         if (N->getNumValues() == 2)  // Dead flag value?
   21564           return DCI.CombineTo(N, Cond, SDValue());
   21565         return Cond;
   21566       }
   21567 
   21568       // Optimize cases that will turn into an LEA instruction.  This requires
   21569       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   21570       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   21571         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   21572         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   21573 
   21574         bool isFastMultiplier = false;
   21575         if (Diff < 10) {
   21576           switch ((unsigned char)Diff) {
   21577           default: break;
   21578           case 1:  // result = add base, cond
   21579           case 2:  // result = lea base(    , cond*2)
   21580           case 3:  // result = lea base(cond, cond*2)
   21581           case 4:  // result = lea base(    , cond*4)
   21582           case 5:  // result = lea base(cond, cond*4)
   21583           case 8:  // result = lea base(    , cond*8)
   21584           case 9:  // result = lea base(cond, cond*8)
   21585             isFastMultiplier = true;
   21586             break;
   21587           }
   21588         }
   21589 
   21590         if (isFastMultiplier) {
   21591           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   21592           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
   21593                              DAG.getConstant(CC, MVT::i8), Cond);
   21594           // Zero extend the condition if needed.
   21595           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   21596                              Cond);
   21597           // Scale the condition by the difference.
   21598           if (Diff != 1)
   21599             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   21600                                DAG.getConstant(Diff, Cond.getValueType()));
   21601 
   21602           // Add the base if non-zero.
   21603           if (FalseC->getAPIntValue() != 0)
   21604             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   21605                                SDValue(FalseC, 0));
   21606           if (N->getNumValues() == 2)  // Dead flag value?
   21607             return DCI.CombineTo(N, Cond, SDValue());
   21608           return Cond;
   21609         }
   21610       }
   21611     }
   21612   }
   21613 
   21614   // Handle these cases:
    21615   //   (select (x != c), e, c) -> (select (x != c), e, x),
    21616   //   (select (x == c), c, e) -> (select (x == c), x, e)
   21617   // where the c is an integer constant, and the "select" is the combination
   21618   // of CMOV and CMP.
   21619   //
    21620   // The rationale for this change is that a conditional move from a constant
    21621   // needs two instructions, whereas a conditional move from a register needs
    21622   // only one instruction.
   21623   //
   21624   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
   21625   //  some instruction-combining opportunities. This opt needs to be
   21626   //  postponed as late as possible.
   21627   //
   21628   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    21629     // The DCI.xxxx conditions are provided to postpone the optimization as
   21630     // late as possible.
   21631 
   21632     ConstantSDNode *CmpAgainst = nullptr;
   21633     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
   21634         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
   21635         !isa<ConstantSDNode>(Cond.getOperand(0))) {
   21636 
   21637       if (CC == X86::COND_NE &&
   21638           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
   21639         CC = X86::GetOppositeBranchCondition(CC);
   21640         std::swap(TrueOp, FalseOp);
   21641       }
   21642 
   21643       if (CC == X86::COND_E &&
   21644           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
   21645         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
   21646                           DAG.getConstant(CC, MVT::i8), Cond };
    21647         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   21648       }
   21649     }
   21650   }
   21651 
   21652   // Fold and/or of setcc's to double CMOV:
   21653   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
   21654   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
   21655   //
   21656   // This combine lets us generate:
   21657   //   cmovcc1 (jcc1 if we don't have CMOV)
   21658   //   cmovcc2 (same)
   21659   // instead of:
   21660   //   setcc1
   21661   //   setcc2
   21662   //   and/or
   21663   //   cmovne (jne if we don't have CMOV)
   21664   // When we can't use the CMOV instruction, it might increase branch
   21665   // mispredicts.
   21666   // When we can use CMOV, or when there is no mispredict, this improves
   21667   // throughput and reduces register pressure.
   21668   //
   21669   if (CC == X86::COND_NE) {
   21670     SDValue Flags;
   21671     X86::CondCode CC0, CC1;
   21672     bool isAndSetCC;
   21673     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
   21674       if (isAndSetCC) {
   21675         std::swap(FalseOp, TrueOp);
   21676         CC0 = X86::GetOppositeBranchCondition(CC0);
   21677         CC1 = X86::GetOppositeBranchCondition(CC1);
   21678       }
   21679 
   21680       SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8),
   21681         Flags};
   21682       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
   21683       SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, MVT::i8), Flags};
   21684       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   21685       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
   21686       return CMOV;
   21687     }
   21688   }
   21689 
   21690   return SDValue();
   21691 }
   21692 
   21693 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
   21694                                                 const X86Subtarget *Subtarget) {
   21695   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
   21696   switch (IntNo) {
   21697   default: return SDValue();
   21698   // SSE/AVX/AVX2 blend intrinsics.
   21699   case Intrinsic::x86_avx2_pblendvb:
   21700     // Don't try to simplify this intrinsic if we don't have AVX2.
   21701     if (!Subtarget->hasAVX2())
   21702       return SDValue();
   21703     // FALL-THROUGH
   21704   case Intrinsic::x86_avx_blendv_pd_256:
   21705   case Intrinsic::x86_avx_blendv_ps_256:
   21706     // Don't try to simplify this intrinsic if we don't have AVX.
   21707     if (!Subtarget->hasAVX())
   21708       return SDValue();
   21709     // FALL-THROUGH
   21710   case Intrinsic::x86_sse41_blendvps:
   21711   case Intrinsic::x86_sse41_blendvpd:
   21712   case Intrinsic::x86_sse41_pblendvb: {
   21713     SDValue Op0 = N->getOperand(1);
   21714     SDValue Op1 = N->getOperand(2);
   21715     SDValue Mask = N->getOperand(3);
   21716 
   21717     // Don't try to simplify this intrinsic if we don't have SSE4.1.
   21718     if (!Subtarget->hasSSE41())
   21719       return SDValue();
   21720 
   21721     // fold (blend A, A, Mask) -> A
   21722     if (Op0 == Op1)
   21723       return Op0;
   21724     // fold (blend A, B, allZeros) -> A
   21725     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
   21726       return Op0;
   21727     // fold (blend A, B, allOnes) -> B
   21728     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
   21729       return Op1;
   21730 
   21731     // Simplify the case where the mask is a constant i32 value.
   21732     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
   21733       if (C->isNullValue())
   21734         return Op0;
   21735       if (C->isAllOnesValue())
   21736         return Op1;
   21737     }
   21738 
   21739     return SDValue();
   21740   }
   21741 
   21742   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
   21743   case Intrinsic::x86_sse2_psrai_w:
   21744   case Intrinsic::x86_sse2_psrai_d:
   21745   case Intrinsic::x86_avx2_psrai_w:
   21746   case Intrinsic::x86_avx2_psrai_d:
   21747   case Intrinsic::x86_sse2_psra_w:
   21748   case Intrinsic::x86_sse2_psra_d:
   21749   case Intrinsic::x86_avx2_psra_w:
   21750   case Intrinsic::x86_avx2_psra_d: {
   21751     SDValue Op0 = N->getOperand(1);
   21752     SDValue Op1 = N->getOperand(2);
   21753     EVT VT = Op0.getValueType();
   21754     assert(VT.isVector() && "Expected a vector type!");
   21755 
   21756     if (isa<BuildVectorSDNode>(Op1))
   21757       Op1 = Op1.getOperand(0);
   21758 
   21759     if (!isa<ConstantSDNode>(Op1))
   21760       return SDValue();
   21761 
   21762     EVT SVT = VT.getVectorElementType();
   21763     unsigned SVTBits = SVT.getSizeInBits();
   21764 
   21765     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
    21766     APInt C(SVTBits, CND->getAPIntValue().getZExtValue());
   21767     uint64_t ShAmt = C.getZExtValue();
   21768 
    21769     // Don't try to convert this shift into an ISD::SRA if the shift
   21770     // count is bigger than or equal to the element size.
   21771     if (ShAmt >= SVTBits)
   21772       return SDValue();
   21773 
   21774     // Trivial case: if the shift count is zero, then fold this
   21775     // into the first operand.
   21776     if (ShAmt == 0)
   21777       return Op0;
   21778 
   21779     // Replace this packed shift intrinsic with a target independent
   21780     // shift dag node.
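              // (E.g. a psrai.d of V by 3 becomes (sra V, <3, 3, 3, 3>), which
              // generic DAG combines can reason about.)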
   21781     SDValue Splat = DAG.getConstant(C, VT);
   21782     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
   21783   }
   21784   }
   21785 }
   21786 
    21787 /// PerformMulCombine - Optimize a single multiply by a constant into two
    21788 /// multiplies in order to implement it with two cheaper instructions, e.g.
    21789 /// LEA + SHL, LEA + LEA.
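          /// For illustration: x*24 can then be lowered as a multiply by 8 (a
          /// shift) and a multiply by 3 (an LEA), and x*45 as multiplies by 9 and
          /// 5 (two LEAs).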
   21790 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
   21791                                  TargetLowering::DAGCombinerInfo &DCI) {
   21792   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
   21793     return SDValue();
   21794 
   21795   EVT VT = N->getValueType(0);
   21796   if (VT != MVT::i64 && VT != MVT::i32)
   21797     return SDValue();
   21798 
   21799   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   21800   if (!C)
   21801     return SDValue();
   21802   uint64_t MulAmt = C->getZExtValue();
   21803   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
   21804     return SDValue();
   21805 
   21806   uint64_t MulAmt1 = 0;
   21807   uint64_t MulAmt2 = 0;
   21808   if ((MulAmt % 9) == 0) {
   21809     MulAmt1 = 9;
   21810     MulAmt2 = MulAmt / 9;
   21811   } else if ((MulAmt % 5) == 0) {
   21812     MulAmt1 = 5;
   21813     MulAmt2 = MulAmt / 5;
   21814   } else if ((MulAmt % 3) == 0) {
   21815     MulAmt1 = 3;
   21816     MulAmt2 = MulAmt / 3;
   21817   }
   21818   if (MulAmt2 &&
   21819       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
   21820     SDLoc DL(N);
   21821 
   21822     if (isPowerOf2_64(MulAmt2) &&
   21823         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
    21824       // If second multiplier is pow2, issue it first. We want the multiply by
   21825       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
   21826       // is an add.
   21827       std::swap(MulAmt1, MulAmt2);
   21828 
   21829     SDValue NewMul;
   21830     if (isPowerOf2_64(MulAmt1))
   21831       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   21832                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
   21833     else
   21834       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
   21835                            DAG.getConstant(MulAmt1, VT));
   21836 
   21837     if (isPowerOf2_64(MulAmt2))
   21838       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
   21839                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
   21840     else
   21841       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
   21842                            DAG.getConstant(MulAmt2, VT));
   21843 
   21844     // Do not add new nodes to DAG combiner worklist.
   21845     DCI.CombineTo(N, NewMul, false);
   21846   }
   21847   return SDValue();
   21848 }
   21849 
   21850 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
   21851   SDValue N0 = N->getOperand(0);
   21852   SDValue N1 = N->getOperand(1);
   21853   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   21854   EVT VT = N0.getValueType();
   21855 
   21856   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
    21857   // since the result of setcc_c is all zeros or all ones.
   21858   if (VT.isInteger() && !VT.isVector() &&
   21859       N1C && N0.getOpcode() == ISD::AND &&
   21860       N0.getOperand(1).getOpcode() == ISD::Constant) {
   21861     SDValue N00 = N0.getOperand(0);
   21862     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
   21863         ((N00.getOpcode() == ISD::ANY_EXTEND ||
   21864           N00.getOpcode() == ISD::ZERO_EXTEND) &&
   21865          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
   21866       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
   21867       APInt ShAmt = N1C->getAPIntValue();
   21868       Mask = Mask.shl(ShAmt);
   21869       if (Mask != 0)
   21870         return DAG.getNode(ISD::AND, SDLoc(N), VT,
   21871                            N00, DAG.getConstant(Mask, VT));
   21872     }
   21873   }
   21874 
    21875   // Hardware support for vector shifts is sparse, which makes us scalarize the
    21876   // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
    21877   // SHL.
   21878   // (shl V, 1) -> add V,V
   21879   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
   21880     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
   21881       assert(N0.getValueType().isVector() && "Invalid vector shift type");
   21882       // We shift all of the values by one. In many cases we do not have
   21883       // hardware support for this operation. This is better expressed as an ADD
   21884       // of two values.
   21885       if (N1SplatC->getZExtValue() == 1)
   21886         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
   21887     }
   21888 
   21889   return SDValue();
   21890 }
   21891 
    21892 /// \brief Returns a vector of 0s if the input node is a vector logical
    21893 /// shift by a constant amount that is known to be greater than or equal
    21894 /// to the vector element size in bits.
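          /// For example, (srl v4i32 X, <32, 32, 32, 32>) is known to produce
          /// all zeroes on SSE2/AVX2, so it can be replaced by a zero vector.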
   21895 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
   21896                                       const X86Subtarget *Subtarget) {
   21897   EVT VT = N->getValueType(0);
   21898 
   21899   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
   21900       (!Subtarget->hasInt256() ||
   21901        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
   21902     return SDValue();
   21903 
   21904   SDValue Amt = N->getOperand(1);
   21905   SDLoc DL(N);
   21906   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
   21907     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
   21908       APInt ShiftAmt = AmtSplat->getAPIntValue();
   21909       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
   21910 
    21911       // SSE2/AVX2 logical shifts always return a vector of 0s
    21912       // if the shift amount is greater than or equal to
    21913       // the element size. The constant shift amount will be
    21914       // encoded as an 8-bit immediate.
   21915       if (ShiftAmt.trunc(8).uge(MaxAmount))
   21916         return getZeroVector(VT, Subtarget, DAG, DL);
   21917     }
   21918 
   21919   return SDValue();
   21920 }
   21921 
   21922 /// PerformShiftCombine - Combine shifts.
   21923 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   21924                                    TargetLowering::DAGCombinerInfo &DCI,
   21925                                    const X86Subtarget *Subtarget) {
   21926   if (N->getOpcode() == ISD::SHL) {
   21927     SDValue V = PerformSHLCombine(N, DAG);
   21928     if (V.getNode()) return V;
   21929   }
   21930 
   21931   if (N->getOpcode() != ISD::SRA) {
   21932     // Try to fold this logical shift into a zero vector.
   21933     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
   21934     if (V.getNode()) return V;
   21935   }
   21936 
   21937   return SDValue();
   21938 }
   21939 
    21940 // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ...))
   21941 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
   21942 // and friends.  Likewise for OR -> CMPNEQSS.
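          // For example, on f32 the pair (setcc_e (cmp A, B)) and
          // (setcc_np (cmp A, B)) joined by an AND is rewritten as an FSETCC
          // with the 'eq' immediate, followed by extracting the low bit of the
          // resulting all-ones/all-zeroes mask.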
   21943 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
   21944                             TargetLowering::DAGCombinerInfo &DCI,
   21945                             const X86Subtarget *Subtarget) {
   21946   unsigned opcode;
   21947 
   21948   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
   21949   // we're requiring SSE2 for both.
   21950   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
   21951     SDValue N0 = N->getOperand(0);
   21952     SDValue N1 = N->getOperand(1);
   21953     SDValue CMP0 = N0->getOperand(1);
   21954     SDValue CMP1 = N1->getOperand(1);
   21955     SDLoc DL(N);
   21956 
   21957     // The SETCCs should both refer to the same CMP.
   21958     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
   21959       return SDValue();
   21960 
   21961     SDValue CMP00 = CMP0->getOperand(0);
   21962     SDValue CMP01 = CMP0->getOperand(1);
   21963     EVT     VT    = CMP00.getValueType();
   21964 
   21965     if (VT == MVT::f32 || VT == MVT::f64) {
   21966       bool ExpectingFlags = false;
   21967       // Check for any users that want flags:
   21968       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
   21969            !ExpectingFlags && UI != UE; ++UI)
   21970         switch (UI->getOpcode()) {
   21971         default:
   21972         case ISD::BR_CC:
   21973         case ISD::BRCOND:
   21974         case ISD::SELECT:
   21975           ExpectingFlags = true;
   21976           break;
   21977         case ISD::CopyToReg:
   21978         case ISD::SIGN_EXTEND:
   21979         case ISD::ZERO_EXTEND:
   21980         case ISD::ANY_EXTEND:
   21981           break;
   21982         }
   21983 
   21984       if (!ExpectingFlags) {
   21985         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
   21986         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
   21987 
   21988         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
   21989           X86::CondCode tmp = cc0;
   21990           cc0 = cc1;
   21991           cc1 = tmp;
   21992         }
   21993 
   21994         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
   21995             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
   21996           // FIXME: need symbolic constants for these magic numbers.
   21997           // See X86ATTInstPrinter.cpp:printSSECC().
   21998           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
   21999           if (Subtarget->hasAVX512()) {
   22000             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
   22001                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
   22002             if (N->getValueType(0) != MVT::i1)
   22003               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
   22004                                  FSetCC);
   22005             return FSetCC;
   22006           }
   22007           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
   22008                                               CMP00.getValueType(), CMP00, CMP01,
   22009                                               DAG.getConstant(x86cc, MVT::i8));
   22010 
   22011           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
   22012           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
   22013 
   22014           if (is64BitFP && !Subtarget->is64Bit()) {
   22015             // On a 32-bit target, we cannot bitcast the 64-bit float to a
   22016             // 64-bit integer, since that's not a legal type. Since
    22017             // OnesOrZeroesF is all ones or all zeroes, so we don't need all the
   22018             // bits, but can do this little dance to extract the lowest 32 bits
   22019             // and work with those going forward.
   22020             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
   22021                                            OnesOrZeroesF);
   22022             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
   22023                                            Vector64);
   22024             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
   22025                                         Vector32, DAG.getIntPtrConstant(0));
   22026             IntVT = MVT::i32;
   22027           }
   22028 
   22029           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
   22030           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
   22031                                       DAG.getConstant(1, IntVT));
   22032           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
   22033           return OneBitOfTruth;
   22034         }
   22035       }
   22036     }
   22037   }
   22038   return SDValue();
   22039 }
   22040 
    22041 /// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones vector
   22042 /// so it can be folded inside ANDNP.
   22043 static bool CanFoldXORWithAllOnes(const SDNode *N) {
   22044   EVT VT = N->getValueType(0);
   22045 
   22046   // Match direct AllOnes for 128 and 256-bit vectors
   22047   if (ISD::isBuildVectorAllOnes(N))
   22048     return true;
   22049 
   22050   // Look through a bit convert.
   22051   if (N->getOpcode() == ISD::BITCAST)
   22052     N = N->getOperand(0).getNode();
   22053 
    22054   // Sometimes the operand may come from an insert_subvector building a 256-bit
    22055   // all-ones vector.
   22056   if (VT.is256BitVector() &&
   22057       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
   22058     SDValue V1 = N->getOperand(0);
   22059     SDValue V2 = N->getOperand(1);
   22060 
   22061     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
   22062         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
   22063         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
   22064         ISD::isBuildVectorAllOnes(V2.getNode()))
   22065       return true;
   22066   }
   22067 
   22068   return false;
   22069 }
   22070 
    22071 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
    22072 // register. In most cases we actually compare or select YMM-sized registers,
    22073 // and mixing the two types creates horrible code. This method optimizes
    22074 // some of the transition sequences.
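          // For example, (sign_extend (xor (trunc X), (trunc Y))) where X and Y
          // are 256-bit values is rebuilt as a sign_extend_inreg of (xor X, Y),
          // keeping the logical operation in the wide register class.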
   22075 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
   22076                                  TargetLowering::DAGCombinerInfo &DCI,
   22077                                  const X86Subtarget *Subtarget) {
   22078   EVT VT = N->getValueType(0);
   22079   if (!VT.is256BitVector())
   22080     return SDValue();
   22081 
   22082   assert((N->getOpcode() == ISD::ANY_EXTEND ||
   22083           N->getOpcode() == ISD::ZERO_EXTEND ||
   22084           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
   22085 
   22086   SDValue Narrow = N->getOperand(0);
   22087   EVT NarrowVT = Narrow->getValueType(0);
   22088   if (!NarrowVT.is128BitVector())
   22089     return SDValue();
   22090 
   22091   if (Narrow->getOpcode() != ISD::XOR &&
   22092       Narrow->getOpcode() != ISD::AND &&
   22093       Narrow->getOpcode() != ISD::OR)
   22094     return SDValue();
   22095 
   22096   SDValue N0  = Narrow->getOperand(0);
   22097   SDValue N1  = Narrow->getOperand(1);
   22098   SDLoc DL(Narrow);
   22099 
   22100   // The Left side has to be a trunc.
   22101   if (N0.getOpcode() != ISD::TRUNCATE)
   22102     return SDValue();
   22103 
   22104   // The type of the truncated inputs.
   22105   EVT WideVT = N0->getOperand(0)->getValueType(0);
   22106   if (WideVT != VT)
   22107     return SDValue();
   22108 
   22109   // The right side has to be a 'trunc' or a constant vector.
   22110   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
   22111   ConstantSDNode *RHSConstSplat = nullptr;
   22112   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
   22113     RHSConstSplat = RHSBV->getConstantSplatNode();
   22114   if (!RHSTrunc && !RHSConstSplat)
   22115     return SDValue();
   22116 
   22117   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   22118 
   22119   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
   22120     return SDValue();
   22121 
   22122   // Set N0 and N1 to hold the inputs to the new wide operation.
   22123   N0 = N0->getOperand(0);
   22124   if (RHSConstSplat) {
   22125     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
   22126                      SDValue(RHSConstSplat, 0));
   22127     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
   22128     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
   22129   } else if (RHSTrunc) {
   22130     N1 = N1->getOperand(0);
   22131   }
   22132 
   22133   // Generate the wide operation.
   22134   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
   22135   unsigned Opcode = N->getOpcode();
   22136   switch (Opcode) {
   22137   case ISD::ANY_EXTEND:
   22138     return Op;
   22139   case ISD::ZERO_EXTEND: {
   22140     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
   22141     APInt Mask = APInt::getAllOnesValue(InBits);
   22142     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
   22143     return DAG.getNode(ISD::AND, DL, VT,
   22144                        Op, DAG.getConstant(Mask, VT));
   22145   }
   22146   case ISD::SIGN_EXTEND:
   22147     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
   22148                        Op, DAG.getValueType(NarrowVT));
   22149   default:
   22150     llvm_unreachable("Unexpected opcode");
   22151   }
   22152 }
   22153 
   22154 static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
   22155                                  TargetLowering::DAGCombinerInfo &DCI,
   22156                                  const X86Subtarget *Subtarget) {
   22157   SDValue N0 = N->getOperand(0);
   22158   SDValue N1 = N->getOperand(1);
   22159   SDLoc DL(N);
   22160 
   22161   // A vector zext_in_reg may be represented as a shuffle,
    22162   // feeding into a bitcast (this represents an anyext) feeding into
   22163   // an and with a mask.
   22164   // We'd like to try to combine that into a shuffle with zero
   22165   // plus a bitcast, removing the and.
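            // For example, a v16i8-to-v4i32 zext_in_reg may appear as
            //   (and (bitcast (shuffle X, undef, <0,u,u,u,1,u,u,u,...>)), splat(0xff))
            // and is rewritten as
            //   (bitcast (shuffle X, zero, <0,16,16,16,1,16,16,16,...>)).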
   22166   if (N0.getOpcode() != ISD::BITCAST ||
   22167       N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
   22168     return SDValue();
   22169 
   22170   // The other side of the AND should be a splat of 2^C, where C
   22171   // is the number of bits in the source type.
   22172   if (N1.getOpcode() == ISD::BITCAST)
   22173     N1 = N1.getOperand(0);
   22174   if (N1.getOpcode() != ISD::BUILD_VECTOR)
   22175     return SDValue();
   22176   BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
   22177 
   22178   ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
   22179   EVT SrcType = Shuffle->getValueType(0);
   22180 
   22181   // We expect a single-source shuffle
   22182   if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
   22183     return SDValue();
   22184 
   22185   unsigned SrcSize = SrcType.getScalarSizeInBits();
   22186 
   22187   APInt SplatValue, SplatUndef;
   22188   unsigned SplatBitSize;
   22189   bool HasAnyUndefs;
   22190   if (!Vector->isConstantSplat(SplatValue, SplatUndef,
   22191                                 SplatBitSize, HasAnyUndefs))
   22192     return SDValue();
   22193 
   22194   unsigned ResSize = N1.getValueType().getScalarSizeInBits();
   22195   // Make sure the splat matches the mask we expect
   22196   if (SplatBitSize > ResSize ||
   22197       (SplatValue + 1).exactLogBase2() != (int)SrcSize)
   22198     return SDValue();
   22199 
   22200   // Make sure the input and output size make sense
   22201   if (SrcSize >= ResSize || ResSize % SrcSize)
   22202     return SDValue();
   22203 
   22204   // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
    22205   // The number of u's between consecutive values depends on the ratio between
    22206   // the source and destination types.
   22207   unsigned ZextRatio = ResSize / SrcSize;
   22208   bool IsZext = true;
   22209   for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
   22210     if (i % ZextRatio) {
   22211       if (Shuffle->getMaskElt(i) > 0) {
   22212         // Expected undef
   22213         IsZext = false;
   22214         break;
   22215       }
   22216     } else {
   22217       if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
   22218         // Expected element number
   22219         IsZext = false;
   22220         break;
   22221       }
   22222     }
   22223   }
   22224 
   22225   if (!IsZext)
   22226     return SDValue();
   22227 
   22228   // Ok, perform the transformation - replace the shuffle with
   22229   // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
   22230   // (instead of undef) where the k elements come from the zero vector.
   22231   SmallVector<int, 8> Mask;
   22232   unsigned NumElems = SrcType.getVectorNumElements();
   22233   for (unsigned i = 0; i < NumElems; ++i)
   22234     if (i % ZextRatio)
   22235       Mask.push_back(NumElems);
   22236     else
   22237       Mask.push_back(i / ZextRatio);
   22238 
   22239   SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
   22240     Shuffle->getOperand(0), DAG.getConstant(0, SrcType), Mask);
   22241   return DAG.getNode(ISD::BITCAST, DL,  N0.getValueType(), NewShuffle);
   22242 }
   22243 
   22244 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   22245                                  TargetLowering::DAGCombinerInfo &DCI,
   22246                                  const X86Subtarget *Subtarget) {
   22247   if (DCI.isBeforeLegalizeOps())
   22248     return SDValue();
   22249 
   22250   if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
   22251     return Zext;
   22252 
   22253   if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
   22254     return R;
   22255 
   22256   EVT VT = N->getValueType(0);
   22257   SDValue N0 = N->getOperand(0);
   22258   SDValue N1 = N->getOperand(1);
   22259   SDLoc DL(N);
   22260 
   22261   // Create BEXTR instructions
   22262   // BEXTR is ((X >> imm) & (2**size-1))
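            // For example, (and (srl X, 8), 0xFFF) becomes a BEXTR with the
            // control value (8 | (12 << 8)), i.e. start bit 8 and length 12.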
   22263   if (VT == MVT::i32 || VT == MVT::i64) {
   22264     // Check for BEXTR.
   22265     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
   22266         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
   22267       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
   22268       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   22269       if (MaskNode && ShiftNode) {
   22270         uint64_t Mask = MaskNode->getZExtValue();
   22271         uint64_t Shift = ShiftNode->getZExtValue();
   22272         if (isMask_64(Mask)) {
   22273           uint64_t MaskSize = countPopulation(Mask);
   22274           if (Shift + MaskSize <= VT.getSizeInBits())
   22275             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
   22276                                DAG.getConstant(Shift | (MaskSize << 8), VT));
   22277         }
   22278       }
   22279     } // BEXTR
   22280 
   22281     return SDValue();
   22282   }
   22283 
   22284   // Want to form ANDNP nodes:
   22285   // 1) In the hopes of then easily combining them with OR and AND nodes
   22286   //    to form PBLEND/PSIGN.
   22287   // 2) To match ANDN packed intrinsics
   22288   if (VT != MVT::v2i64 && VT != MVT::v4i64)
   22289     return SDValue();
   22290 
   22291   // Check LHS for vnot
   22292   if (N0.getOpcode() == ISD::XOR &&
   22293       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
   22294       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
   22295     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
   22296 
   22297   // Check RHS for vnot
   22298   if (N1.getOpcode() == ISD::XOR &&
   22299       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
   22300       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
   22301     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
   22302 
   22303   return SDValue();
   22304 }
   22305 
   22306 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   22307                                 TargetLowering::DAGCombinerInfo &DCI,
   22308                                 const X86Subtarget *Subtarget) {
   22309   if (DCI.isBeforeLegalizeOps())
   22310     return SDValue();
   22311 
   22312   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
   22313   if (R.getNode())
   22314     return R;
   22315 
   22316   SDValue N0 = N->getOperand(0);
   22317   SDValue N1 = N->getOperand(1);
   22318   EVT VT = N->getValueType(0);
   22319 
   22320   // look for psign/blend
   22321   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
   22322     if (!Subtarget->hasSSSE3() ||
   22323         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
   22324       return SDValue();
   22325 
   22326     // Canonicalize pandn to RHS
   22327     if (N0.getOpcode() == X86ISD::ANDNP)
   22328       std::swap(N0, N1);
   22329     // or (and (m, y), (pandn m, x))
   22330     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
   22331       SDValue Mask = N1.getOperand(0);
   22332       SDValue X    = N1.getOperand(1);
   22333       SDValue Y;
   22334       if (N0.getOperand(0) == Mask)
   22335         Y = N0.getOperand(1);
   22336       if (N0.getOperand(1) == Mask)
   22337         Y = N0.getOperand(0);
   22338 
    22339       // Check to see if the mask appeared in both the AND and the ANDNP.
   22340       if (!Y.getNode())
   22341         return SDValue();
   22342 
    22343       // Validate that X, Y, and Mask are bitcasts, and see through them.
   22344       // Look through mask bitcast.
   22345       if (Mask.getOpcode() == ISD::BITCAST)
   22346         Mask = Mask.getOperand(0);
   22347       if (X.getOpcode() == ISD::BITCAST)
   22348         X = X.getOperand(0);
   22349       if (Y.getOpcode() == ISD::BITCAST)
   22350         Y = Y.getOperand(0);
   22351 
   22352       EVT MaskVT = Mask.getValueType();
   22353 
   22354       // Validate that the Mask operand is a vector sra node.
   22355       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
   22356       // there is no psrai.b
   22357       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
   22358       unsigned SraAmt = ~0;
   22359       if (Mask.getOpcode() == ISD::SRA) {
   22360         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
   22361           if (auto *AmtConst = AmtBV->getConstantSplatNode())
   22362             SraAmt = AmtConst->getZExtValue();
   22363       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
   22364         SDValue SraC = Mask.getOperand(1);
   22365         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
   22366       }
   22367       if ((SraAmt + 1) != EltBits)
   22368         return SDValue();
   22369 
   22370       SDLoc DL(N);
   22371 
    22372       // Now we know we at least have a pblendvb with the mask val.  See if
   22373       // we can form a psignb/w/d.
   22374       // psign = x.type == y.type == mask.type && y = sub(0, x);
   22375       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
   22376           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
   22377           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
   22378         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
   22379                "Unsupported VT for PSIGN");
   22380         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
   22381         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
   22382       }
    22383       // PBLENDVB is only available on SSE 4.1.
   22384       if (!Subtarget->hasSSE41())
   22385         return SDValue();
   22386 
   22387       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
   22388 
   22389       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
   22390       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
   22391       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
   22392       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
   22393       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
   22394     }
   22395   }
   22396 
   22397   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
   22398     return SDValue();
   22399 
   22400   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
   22401   MachineFunction &MF = DAG.getMachineFunction();
   22402   bool OptForSize =
   22403       MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
   22404 
   22405   // SHLD/SHRD instructions have lower register pressure, but on some
   22406   // platforms they have higher latency than the equivalent
   22407   // series of shifts/or that would otherwise be generated.
   22408   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
   22409   // have higher latencies and we are not optimizing for size.
   22410   if (!OptForSize && Subtarget->isSHLDSlow())
   22411     return SDValue();
   22412 
   22413   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
   22414     std::swap(N0, N1);
   22415   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
   22416     return SDValue();
   22417   if (!N0.hasOneUse() || !N1.hasOneUse())
   22418     return SDValue();
   22419 
   22420   SDValue ShAmt0 = N0.getOperand(1);
   22421   if (ShAmt0.getValueType() != MVT::i8)
   22422     return SDValue();
   22423   SDValue ShAmt1 = N1.getOperand(1);
   22424   if (ShAmt1.getValueType() != MVT::i8)
   22425     return SDValue();
   22426   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
   22427     ShAmt0 = ShAmt0.getOperand(0);
   22428   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
   22429     ShAmt1 = ShAmt1.getOperand(0);
   22430 
   22431   SDLoc DL(N);
   22432   unsigned Opc = X86ISD::SHLD;
   22433   SDValue Op0 = N0.getOperand(0);
   22434   SDValue Op1 = N1.getOperand(0);
   22435   if (ShAmt0.getOpcode() == ISD::SUB) {
   22436     Opc = X86ISD::SHRD;
   22437     std::swap(Op0, Op1);
   22438     std::swap(ShAmt0, ShAmt1);
   22439   }
   22440 
   22441   unsigned Bits = VT.getSizeInBits();
   22442   if (ShAmt1.getOpcode() == ISD::SUB) {
   22443     SDValue Sum = ShAmt1.getOperand(0);
   22444     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
   22445       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
   22446       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
   22447         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
   22448       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
   22449         return DAG.getNode(Opc, DL, VT,
   22450                            Op0, Op1,
   22451                            DAG.getNode(ISD::TRUNCATE, DL,
   22452                                        MVT::i8, ShAmt0));
   22453     }
   22454   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
   22455     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
   22456     if (ShAmt0C &&
   22457         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
   22458       return DAG.getNode(Opc, DL, VT,
   22459                          N0.getOperand(0), N1.getOperand(0),
   22460                          DAG.getNode(ISD::TRUNCATE, DL,
   22461                                        MVT::i8, ShAmt0));
   22462   }
   22463 
   22464   return SDValue();
   22465 }
   22466 
   22467 // Generate NEG and CMOV for integer abs.
   22468 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
   22469   EVT VT = N->getValueType(0);
   22470 
   22471   // Since X86 does not have CMOV for 8-bit integer, we don't convert
   22472   // 8-bit integer abs to NEG and CMOV.
   22473   if (VT.isInteger() && VT.getSizeInBits() == 8)
   22474     return SDValue();
   22475 
   22476   SDValue N0 = N->getOperand(0);
   22477   SDValue N1 = N->getOperand(1);
   22478   SDLoc DL(N);
   22479 
   22480   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
   22481   // and change it to SUB and CMOV.
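            // This is the canonical abs expansion, e.g. for i32:
            //   abs(X) = (X + (X >>s 31)) ^ (X >>s 31)
            // which is turned into (sub 0, X) plus a CMOV on the GE condition
            // selecting between X and the negated value.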
   22482   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
   22483       N0.getOpcode() == ISD::ADD &&
   22484       N0.getOperand(1) == N1 &&
   22485       N1.getOpcode() == ISD::SRA &&
   22486       N1.getOperand(0) == N0.getOperand(0))
   22487     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
   22488       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
   22489         // Generate SUB & CMOV.
   22490         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
   22491                                   DAG.getConstant(0, VT), N0.getOperand(0));
   22492 
   22493         SDValue Ops[] = { N0.getOperand(0), Neg,
   22494                           DAG.getConstant(X86::COND_GE, MVT::i8),
   22495                           SDValue(Neg.getNode(), 1) };
   22496         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
   22497       }
   22498   return SDValue();
   22499 }
   22500 
   22501 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
   22502 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
   22503                                  TargetLowering::DAGCombinerInfo &DCI,
   22504                                  const X86Subtarget *Subtarget) {
   22505   if (DCI.isBeforeLegalizeOps())
   22506     return SDValue();
   22507 
   22508   if (Subtarget->hasCMov()) {
   22509     SDValue RV = performIntegerAbsCombine(N, DAG);
   22510     if (RV.getNode())
   22511       return RV;
   22512   }
   22513 
   22514   return SDValue();
   22515 }
   22516 
   22517 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
   22518 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   22519                                   TargetLowering::DAGCombinerInfo &DCI,
   22520                                   const X86Subtarget *Subtarget) {
   22521   LoadSDNode *Ld = cast<LoadSDNode>(N);
   22522   EVT RegVT = Ld->getValueType(0);
   22523   EVT MemVT = Ld->getMemoryVT();
   22524   SDLoc dl(Ld);
   22525   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   22526 
   22527   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
   22528   // into two 16-byte operations.
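            // For example, an unaligned 32-byte v8f32 load is split into two
            // 16-byte loads at Ptr and Ptr+16, whose results are reassembled
            // into the low and high halves of a single 256-bit value.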
   22529   ISD::LoadExtType Ext = Ld->getExtensionType();
   22530   unsigned Alignment = Ld->getAlignment();
   22531   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
   22532   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
   22533       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
   22534     unsigned NumElems = RegVT.getVectorNumElements();
   22535     if (NumElems < 2)
   22536       return SDValue();
   22537 
   22538     SDValue Ptr = Ld->getBasePtr();
   22539     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
   22540 
   22541     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   22542                                   NumElems/2);
   22543     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
   22544                                 Ld->getPointerInfo(), Ld->isVolatile(),
   22545                                 Ld->isNonTemporal(), Ld->isInvariant(),
   22546                                 Alignment);
   22547     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   22548     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
   22549                                 Ld->getPointerInfo(), Ld->isVolatile(),
   22550                                 Ld->isNonTemporal(), Ld->isInvariant(),
   22551                                 std::min(16U, Alignment));
   22552     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   22553                              Load1.getValue(1),
   22554                              Load2.getValue(1));
   22555 
   22556     SDValue NewVec = DAG.getUNDEF(RegVT);
   22557     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
   22558     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
   22559     return DCI.CombineTo(N, NewVec, TF, true);
   22560   }
   22561 
   22562   return SDValue();
   22563 }
   22564 
   22565 /// PerformMLOADCombine - Resolve extending loads
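          /// For example, a masked (v8i32 sextload v8i16) is widened to a plain
          /// masked load of v16i16 (the memory type widened to the full register
          /// width), with Src0 and the mask widened via shuffles/concatenation,
          /// followed by an X86ISD::VSEXT back to v8i32.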
   22566 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
   22567                                    TargetLowering::DAGCombinerInfo &DCI,
   22568                                    const X86Subtarget *Subtarget) {
   22569   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
   22570   if (Mld->getExtensionType() != ISD::SEXTLOAD)
   22571     return SDValue();
   22572 
   22573   EVT VT = Mld->getValueType(0);
   22574   unsigned NumElems = VT.getVectorNumElements();
   22575   EVT LdVT = Mld->getMemoryVT();
   22576   SDLoc dl(Mld);
   22577 
   22578   assert(LdVT != VT && "Cannot extend to the same type");
   22579   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
   22580   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
    22581   // The From/To sizes and the element count must be powers of two.
    22582   assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
    22583          "Unexpected size for extending masked load");
   22584 
   22585   unsigned SizeRatio  = ToSz / FromSz;
   22586   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
   22587 
   22588   // Create a type on which we perform the shuffle
   22589   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   22590           LdVT.getScalarType(), NumElems*SizeRatio);
   22591   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   22592 
   22593   // Convert Src0 value
   22594   SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
   22595   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
   22596     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   22597     for (unsigned i = 0; i != NumElems; ++i)
   22598       ShuffleVec[i] = i * SizeRatio;
   22599 
   22600     // Can't shuffle using an illegal type.
    22601     assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
    22602            "WideVecVT should be legal");
   22603     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
   22604                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
   22605   }
   22606   // Prepare the new mask
   22607   SDValue NewMask;
   22608   SDValue Mask = Mld->getMask();
   22609   if (Mask.getValueType() == VT) {
   22610     // Mask and original value have the same type
   22611     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
   22612     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   22613     for (unsigned i = 0; i != NumElems; ++i)
   22614       ShuffleVec[i] = i * SizeRatio;
   22615     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
   22616       ShuffleVec[i] = NumElems*SizeRatio;
   22617     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
   22618                                    DAG.getConstant(0, WideVecVT),
   22619                                    &ShuffleVec[0]);
   22620   }
   22621   else {
   22622     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
   22623     unsigned WidenNumElts = NumElems*SizeRatio;
   22624     unsigned MaskNumElts = VT.getVectorNumElements();
   22625     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
   22626                                      WidenNumElts);
   22627 
   22628     unsigned NumConcat = WidenNumElts / MaskNumElts;
   22629     SmallVector<SDValue, 16> Ops(NumConcat);
   22630     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
   22631     Ops[0] = Mask;
   22632     for (unsigned i = 1; i != NumConcat; ++i)
   22633       Ops[i] = ZeroVal;
   22634 
   22635     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
   22636   }
   22637 
   22638   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
   22639                                      Mld->getBasePtr(), NewMask, WideSrc0,
   22640                                      Mld->getMemoryVT(), Mld->getMemOperand(),
   22641                                      ISD::NON_EXTLOAD);
   22642   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
   22643   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
    22644 }
    22645 
    22646 /// PerformMSTORECombine - Resolve truncating stores
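          /// For example, a masked truncating store of v8i32 to v8i16 is
          /// rewritten as a shuffle that packs the truncated elements into the
          /// low half of a v16i16, followed by a regular (non-truncating) masked
          /// store of the memory type.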
   22647 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
   22648                                     const X86Subtarget *Subtarget) {
   22649   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
   22650   if (!Mst->isTruncatingStore())
   22651     return SDValue();
   22652 
   22653   EVT VT = Mst->getValue().getValueType();
   22654   unsigned NumElems = VT.getVectorNumElements();
   22655   EVT StVT = Mst->getMemoryVT();
   22656   SDLoc dl(Mst);
   22657 
   22658   assert(StVT != VT && "Cannot truncate to the same type");
   22659   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
   22660   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
   22661 
    22662   // The From/To sizes and the element count must be powers of two.
    22663   assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
    22664          "Unexpected size for truncating masked store");
    22665   // We are going to use the original vector elt for storing.
    22666   // Accumulated smaller vector elements must be a multiple of the store size.
    22667   assert(((NumElems * FromSz) % ToSz) == 0 &&
    22668          "Unexpected ratio for truncating masked store");
   22669 
   22670   unsigned SizeRatio  = FromSz / ToSz;
   22671   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   22672 
   22673   // Create a type on which we perform the shuffle
   22674   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   22675           StVT.getScalarType(), NumElems*SizeRatio);
   22676 
   22677   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   22678 
   22679   SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
   22680   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   22681   for (unsigned i = 0; i != NumElems; ++i)
   22682     ShuffleVec[i] = i * SizeRatio;
   22683 
   22684   // Can't shuffle using an illegal type.
    22685   assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
    22686          "WideVecVT should be legal");
   22687 
   22688   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   22689                                         DAG.getUNDEF(WideVecVT),
   22690                                         &ShuffleVec[0]);
   22691 
   22692   SDValue NewMask;
   22693   SDValue Mask = Mst->getMask();
   22694   if (Mask.getValueType() == VT) {
   22695     // Mask and original value have the same type
   22696     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
   22697     for (unsigned i = 0; i != NumElems; ++i)
   22698       ShuffleVec[i] = i * SizeRatio;
   22699     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
   22700       ShuffleVec[i] = NumElems*SizeRatio;
   22701     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
   22702                                    DAG.getConstant(0, WideVecVT),
   22703                                    &ShuffleVec[0]);
   22704   }
   22705   else {
   22706     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
   22707     unsigned WidenNumElts = NumElems*SizeRatio;
   22708     unsigned MaskNumElts = VT.getVectorNumElements();
   22709     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
   22710                                      WidenNumElts);
   22711 
   22712     unsigned NumConcat = WidenNumElts / MaskNumElts;
   22713     SmallVector<SDValue, 16> Ops(NumConcat);
   22714     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
   22715     Ops[0] = Mask;
   22716     for (unsigned i = 1; i != NumConcat; ++i)
   22717       Ops[i] = ZeroVal;
   22718 
   22719     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
   22720   }
   22721 
   22722   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
   22723                             NewMask, StVT, Mst->getMemOperand(), false);
    22724 }

    22725 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
   22726 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   22727                                    const X86Subtarget *Subtarget) {
   22728   StoreSDNode *St = cast<StoreSDNode>(N);
   22729   EVT VT = St->getValue().getValueType();
   22730   EVT StVT = St->getMemoryVT();
   22731   SDLoc dl(St);
   22732   SDValue StoredVal = St->getOperand(1);
   22733   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   22734 
   22735   // If we are saving a concatenation of two XMM registers and 32-byte stores
   22736   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
   22737   unsigned Alignment = St->getAlignment();
   22738   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
   22739   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
   22740       StVT == VT && !IsAligned) {
   22741     unsigned NumElems = VT.getVectorNumElements();
   22742     if (NumElems < 2)
   22743       return SDValue();
   22744 
   22745     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
   22746     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
   22747 
   22748     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
   22749     SDValue Ptr0 = St->getBasePtr();
   22750     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
   22751 
   22752     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
   22753                                 St->getPointerInfo(), St->isVolatile(),
   22754                                 St->isNonTemporal(), Alignment);
   22755     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
   22756                                 St->getPointerInfo(), St->isVolatile(),
   22757                                 St->isNonTemporal(),
   22758                                 std::min(16U, Alignment));
   22759     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   22760   }
   22761 
   22762   // Optimize trunc store (of multiple scalars) to shuffle and store.
   22763   // First, pack all of the elements in one place. Next, store to memory
   22764   // in fewer chunks.
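            // For example, a v8i16-to-v8i8 truncating store becomes a shuffle
            // that packs the eight low bytes into one 64-bit lane, which is then
            // written with a single i64 store (or an f64 store on 32-bit targets
            // with SSE2).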
   22765   if (St->isTruncatingStore() && VT.isVector()) {
   22766     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   22767     unsigned NumElems = VT.getVectorNumElements();
   22768     assert(StVT != VT && "Cannot truncate to the same type");
   22769     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
   22770     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
   22771 
    22772     // The From/To sizes and the element count must be powers of two.
   22773     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
   22774     // We are going to use the original vector elt for storing.
   22775     // Accumulated smaller vector elements must be a multiple of the store size.
   22776     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
   22777 
   22778     unsigned SizeRatio  = FromSz / ToSz;
   22779 
   22780     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   22781 
   22782     // Create a type on which we perform the shuffle
   22783     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   22784             StVT.getScalarType(), NumElems*SizeRatio);
   22785 
   22786     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   22787 
   22788     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
   22789     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   22790     for (unsigned i = 0; i != NumElems; ++i)
   22791       ShuffleVec[i] = i * SizeRatio;
   22792 
   22793     // Can't shuffle using an illegal type.
   22794     if (!TLI.isTypeLegal(WideVecVT))
   22795       return SDValue();
   22796 
   22797     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   22798                                          DAG.getUNDEF(WideVecVT),
   22799                                          &ShuffleVec[0]);
   22800     // At this point all of the data is stored at the bottom of the
   22801     // register. We now need to save it to mem.
   22802 
   22803     // Find the largest store unit
   22804     MVT StoreType = MVT::i8;
   22805     for (MVT Tp : MVT::integer_valuetypes()) {
   22806       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
   22807         StoreType = Tp;
   22808     }
   22809 
    22810     // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
   22811     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
   22812         (64 <= NumElems * ToSz))
   22813       StoreType = MVT::f64;
   22814 
   22815     // Bitcast the original vector into a vector of store-size units
   22816     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
   22817             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
   22818     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
   22819     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
   22820     SmallVector<SDValue, 8> Chains;
   22821     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
   22822                                         TLI.getPointerTy());
   22823     SDValue Ptr = St->getBasePtr();
   22824 
   22825     // Perform one or more big stores into memory.
   22826     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
   22827       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   22828                                    StoreType, ShuffWide,
   22829                                    DAG.getIntPtrConstant(i));
   22830       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
   22831                                 St->getPointerInfo(), St->isVolatile(),
   22832                                 St->isNonTemporal(), St->getAlignment());
   22833       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   22834       Chains.push_back(Ch);
   22835     }
   22836 
   22837     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   22838   }
   22839 
   22840   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   22841   // the FP state in cases where an emms may be missing.
   22842   // A preferable solution to the general problem is to figure out the right
   22843   // places to insert EMMS.  This qualifies as a quick hack.
   22844 
   22845   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
   22846   if (VT.getSizeInBits() != 64)
   22847     return SDValue();
   22848 
   22849   const Function *F = DAG.getMachineFunction().getFunction();
   22850   bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
   22851   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
   22852                      && Subtarget->hasSSE2();
   22853   if ((VT.isVector() ||
   22854        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
   22855       isa<LoadSDNode>(St->getValue()) &&
   22856       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
   22857       St->getChain().hasOneUse() && !St->isVolatile()) {
   22858     SDNode* LdVal = St->getValue().getNode();
   22859     LoadSDNode *Ld = nullptr;
   22860     int TokenFactorIndex = -1;
   22861     SmallVector<SDValue, 8> Ops;
   22862     SDNode* ChainVal = St->getChain().getNode();
   22863     // Must be a store of a load.  We currently handle two cases:  the load
    22864     // is a direct child, or it's behind an intervening TokenFactor.  It is
   22865     // possible to dig deeper under nested TokenFactors.
   22866     if (ChainVal == LdVal)
   22867       Ld = cast<LoadSDNode>(St->getChain());
   22868     else if (St->getValue().hasOneUse() &&
   22869              ChainVal->getOpcode() == ISD::TokenFactor) {
   22870       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
   22871         if (ChainVal->getOperand(i).getNode() == LdVal) {
   22872           TokenFactorIndex = i;
   22873           Ld = cast<LoadSDNode>(St->getValue());
   22874         } else
   22875           Ops.push_back(ChainVal->getOperand(i));
   22876       }
   22877     }
   22878 
   22879     if (!Ld || !ISD::isNormalLoad(Ld))
   22880       return SDValue();
   22881 
   22882     // If this is not the MMX case, i.e. we are just turning i64 load/store
   22883     // into f64 load/store, avoid the transformation if there are multiple
   22884     // uses of the loaded value.
   22885     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
   22886       return SDValue();
   22887 
   22888     SDLoc LdDL(Ld);
   22889     SDLoc StDL(N);
   22890     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
   22891     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
   22892     // pair instead.
   22893     if (Subtarget->is64Bit() || F64IsLegal) {
   22894       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
   22895       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
   22896                                   Ld->getPointerInfo(), Ld->isVolatile(),
   22897                                   Ld->isNonTemporal(), Ld->isInvariant(),
   22898                                   Ld->getAlignment());
   22899       SDValue NewChain = NewLd.getValue(1);
   22900       if (TokenFactorIndex != -1) {
   22901         Ops.push_back(NewChain);
   22902         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
   22903       }
   22904       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
   22905                           St->getPointerInfo(),
   22906                           St->isVolatile(), St->isNonTemporal(),
   22907                           St->getAlignment());
   22908     }
   22909 
   22910     // Otherwise, lower to two pairs of 32-bit loads / stores.
   22911     SDValue LoAddr = Ld->getBasePtr();
   22912     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
   22913                                  DAG.getConstant(4, MVT::i32));
   22914 
   22915     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
   22916                                Ld->getPointerInfo(),
   22917                                Ld->isVolatile(), Ld->isNonTemporal(),
   22918                                Ld->isInvariant(), Ld->getAlignment());
   22919     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
   22920                                Ld->getPointerInfo().getWithOffset(4),
   22921                                Ld->isVolatile(), Ld->isNonTemporal(),
   22922                                Ld->isInvariant(),
   22923                                MinAlign(Ld->getAlignment(), 4));
   22924 
   22925     SDValue NewChain = LoLd.getValue(1);
   22926     if (TokenFactorIndex != -1) {
   22927       Ops.push_back(LoLd);
   22928       Ops.push_back(HiLd);
   22929       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
   22930     }
   22931 
   22932     LoAddr = St->getBasePtr();
   22933     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
   22934                          DAG.getConstant(4, MVT::i32));
   22935 
   22936     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
   22937                                 St->getPointerInfo(),
   22938                                 St->isVolatile(), St->isNonTemporal(),
   22939                                 St->getAlignment());
   22940     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
   22941                                 St->getPointerInfo().getWithOffset(4),
   22942                                 St->isVolatile(),
   22943                                 St->isNonTemporal(),
   22944                                 MinAlign(St->getAlignment(), 4));
   22945     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   22946   }
   22947   return SDValue();
   22948 }
   22949 
   22950 /// Return 'true' if this vector operation is "horizontal"
   22951 /// and return the operands for the horizontal operation in LHS and RHS.  A
   22952 /// horizontal operation performs the binary operation on successive elements
   22953 /// of its first operand, then on successive elements of its second operand,
   22954 /// returning the resulting values in a vector.  For example, if
   22955 ///   A = < float a0, float a1, float a2, float a3 >
   22956 /// and
   22957 ///   B = < float b0, float b1, float b2, float b3 >
   22958 /// then the result of doing a horizontal operation on A and B is
   22959 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
   22960 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
   22961 /// A horizontal-op B, for some already available A and B, and if so then LHS is
   22962 /// set to A, RHS to B, and the routine returns 'true'.
   22963 /// Note that the binary operation should have the property that if one of the
   22964 /// operands is UNDEF then the result is UNDEF.
   22965 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   22966   // Look for the following pattern: if
   22967   //   A = < float a0, float a1, float a2, float a3 >
   22968   //   B = < float b0, float b1, float b2, float b3 >
   22969   // and
   22970   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
   22971   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
   22972   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
   22973   // which is A horizontal-op B.
   22974 
   22975   // At least one of the operands should be a vector shuffle.
   22976   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
   22977       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
   22978     return false;
   22979 
   22980   MVT VT = LHS.getSimpleValueType();
   22981 
   22982   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   22983          "Unsupported vector type for horizontal add/sub");
   22984 
   22985   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
   22986   // operate independently on 128-bit lanes.
   22987   unsigned NumElts = VT.getVectorNumElements();
   22988   unsigned NumLanes = VT.getSizeInBits()/128;
   22989   unsigned NumLaneElts = NumElts / NumLanes;
   22990   assert((NumLaneElts % 2 == 0) &&
   22991          "Vector type should have an even number of elements in each lane");
   22992   unsigned HalfLaneElts = NumLaneElts/2;
   22993 
   22994   // View LHS in the form
   22995   //   LHS = VECTOR_SHUFFLE A, B, LMask
   22996   // If LHS is not a shuffle then pretend it is the shuffle
   22997   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
   22998   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
   22999   // type VT.
   23000   SDValue A, B;
   23001   SmallVector<int, 16> LMask(NumElts);
   23002   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   23003     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
   23004       A = LHS.getOperand(0);
   23005     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
   23006       B = LHS.getOperand(1);
   23007     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
   23008     std::copy(Mask.begin(), Mask.end(), LMask.begin());
   23009   } else {
   23010     if (LHS.getOpcode() != ISD::UNDEF)
   23011       A = LHS;
   23012     for (unsigned i = 0; i != NumElts; ++i)
   23013       LMask[i] = i;
   23014   }
   23015 
   23016   // Likewise, view RHS in the form
   23017   //   RHS = VECTOR_SHUFFLE C, D, RMask
   23018   SDValue C, D;
   23019   SmallVector<int, 16> RMask(NumElts);
   23020   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   23021     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
   23022       C = RHS.getOperand(0);
   23023     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
   23024       D = RHS.getOperand(1);
   23025     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
   23026     std::copy(Mask.begin(), Mask.end(), RMask.begin());
   23027   } else {
   23028     if (RHS.getOpcode() != ISD::UNDEF)
   23029       C = RHS;
   23030     for (unsigned i = 0; i != NumElts; ++i)
   23031       RMask[i] = i;
   23032   }
   23033 
   23034   // Check that the shuffles are both shuffling the same vectors.
   23035   if (!(A == C && B == D) && !(A == D && B == C))
   23036     return false;
   23037 
   23038   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
   23039   if (!A.getNode() && !B.getNode())
   23040     return false;
   23041 
   23042   // If A and B occur in reverse order in RHS, then "swap" them (which means
   23043   // rewriting the mask).
   23044   if (A != C)
   23045     ShuffleVectorSDNode::commuteMask(RMask);
   23046 
   23047   // At this point LHS and RHS are equivalent to
   23048   //   LHS = VECTOR_SHUFFLE A, B, LMask
   23049   //   RHS = VECTOR_SHUFFLE A, B, RMask
   23050   // Check that the masks correspond to performing a horizontal operation.
   23051   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   23052     for (unsigned i = 0; i != NumLaneElts; ++i) {
   23053       int LIdx = LMask[i+l], RIdx = RMask[i+l];
   23054 
   23055       // Ignore any UNDEF components.
   23056       if (LIdx < 0 || RIdx < 0 ||
   23057           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
   23058           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
   23059         continue;
   23060 
   23061       // Check that successive elements are being operated on.  If not, this is
   23062       // not a horizontal operation.
   23063       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
   23064       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
   23065       if (!(LIdx == Index && RIdx == Index + 1) &&
   23066           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
   23067         return false;
   23068     }
   23069   }
   23070 
   23071   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   23072   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
   23073   return true;
   23074 }
   23075 
   23076 /// Do target-specific dag combines on floating point adds.
   23077 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
   23078                                   const X86Subtarget *Subtarget) {
   23079   EVT VT = N->getValueType(0);
   23080   SDValue LHS = N->getOperand(0);
   23081   SDValue RHS = N->getOperand(1);
   23082 
   23083   // Try to synthesize horizontal adds from adds of shuffles.
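            // Illustrative example (a sketch, not from the source): with
            //   LHS = vector_shuffle<0,2,4,6> A, B
            //   RHS = vector_shuffle<1,3,5,7> A, B
            // a v4f32 (fadd LHS, RHS) computes < a0+a1, a2+a3, b0+b1, b2+b3 >,
            // which is exactly (X86ISD::FHADD A, B), i.e. haddps on SSE3.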
   23084   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   23085        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   23086       isHorizontalBinOp(LHS, RHS, true))
   23087     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
   23088   return SDValue();
   23089 }
   23090 
   23091 /// Do target-specific dag combines on floating point subs.
   23092 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
   23093                                   const X86Subtarget *Subtarget) {
   23094   EVT VT = N->getValueType(0);
   23095   SDValue LHS = N->getOperand(0);
   23096   SDValue RHS = N->getOperand(1);
   23097 
   23098   // Try to synthesize horizontal subs from subs of shuffles.
   23099   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   23100        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   23101       isHorizontalBinOp(LHS, RHS, false))
   23102     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
   23103   return SDValue();
   23104 }
   23105 
   23106 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
   23107 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
   23108   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
   23109 
   23110   // F[X]OR(0.0, x) -> x
   23111   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   23112     if (C->getValueAPF().isPosZero())
   23113       return N->getOperand(1);
   23114 
   23115   // F[X]OR(x, 0.0) -> x
   23116   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   23117     if (C->getValueAPF().isPosZero())
   23118       return N->getOperand(0);
   23119   return SDValue();
   23120 }
   23121 
   23122 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
   23123 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
   23124   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
   23125 
   23126   // Only perform optimizations if UnsafeMath is used.
   23127   if (!DAG.getTarget().Options.UnsafeFPMath)
   23128     return SDValue();
   23129 
    23130   // If we run in unsafe-math mode, convert the FMIN and FMAX nodes
    23131   // into FMINC and FMAXC, which are commutative operations.
   23132   unsigned NewOp = 0;
   23133   switch (N->getOpcode()) {
   23134     default: llvm_unreachable("unknown opcode");
   23135     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
   23136     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
   23137   }
   23138 
   23139   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
   23140                      N->getOperand(0), N->getOperand(1));
   23141 }
   23142 
   23143 /// Do target-specific dag combines on X86ISD::FAND nodes.
   23144 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   23145   // FAND(0.0, x) -> 0.0
   23146   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   23147     if (C->getValueAPF().isPosZero())
   23148       return N->getOperand(0);
   23149 
   23150   // FAND(x, 0.0) -> 0.0
   23151   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   23152     if (C->getValueAPF().isPosZero())
   23153       return N->getOperand(1);
   23154 
   23155   return SDValue();
   23156 }
   23157 
   23158 /// Do target-specific dag combines on X86ISD::FANDN nodes
   23159 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
   23160   // FANDN(0.0, x) -> x
   23161   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
   23162     if (C->getValueAPF().isPosZero())
   23163       return N->getOperand(1);
   23164 
   23165   // FANDN(x, 0.0) -> 0.0
   23166   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
   23167     if (C->getValueAPF().isPosZero())
   23168       return N->getOperand(1);
   23169 
   23170   return SDValue();
   23171 }
   23172 
   23173 static SDValue PerformBTCombine(SDNode *N,
   23174                                 SelectionDAG &DAG,
   23175                                 TargetLowering::DAGCombinerInfo &DCI) {
   23176   // BT ignores high bits in the bit index operand.
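            // For example (illustrative): with a 32-bit bit-index operand only
            // the low Log2(32) = 5 bits are demanded, so an index computation
            // such as (and x, 0xff) can be simplified by SimplifyDemandedBits.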
   23177   SDValue Op1 = N->getOperand(1);
   23178   if (Op1.hasOneUse()) {
   23179     unsigned BitWidth = Op1.getValueSizeInBits();
   23180     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
   23181     APInt KnownZero, KnownOne;
   23182     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   23183                                           !DCI.isBeforeLegalizeOps());
   23184     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   23185     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
   23186         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
   23187       DCI.CommitTargetLoweringOpt(TLO);
   23188   }
   23189   return SDValue();
   23190 }
   23191 
   23192 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
   23193   SDValue Op = N->getOperand(0);
   23194   if (Op.getOpcode() == ISD::BITCAST)
   23195     Op = Op.getOperand(0);
   23196   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
   23197   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
   23198       VT.getVectorElementType().getSizeInBits() ==
   23199       OpVT.getVectorElementType().getSizeInBits()) {
   23200     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
   23201   }
   23202   return SDValue();
   23203 }
   23204 
   23205 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
   23206                                                const X86Subtarget *Subtarget) {
   23207   EVT VT = N->getValueType(0);
   23208   if (!VT.isVector())
   23209     return SDValue();
   23210 
   23211   SDValue N0 = N->getOperand(0);
   23212   SDValue N1 = N->getOperand(1);
   23213   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
   23214   SDLoc dl(N);
   23215 
    23216   // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
    23217   // SSE and AVX2 since there is no sign-extended shift right
    23218   // operation on a vector with 64-bit elements.
    23219   // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
    23220   //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
   23221   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
   23222       N0.getOpcode() == ISD::SIGN_EXTEND)) {
   23223     SDValue N00 = N0.getOperand(0);
   23224 
   23225     // EXTLOAD has a better solution on AVX2,
   23226     // it may be replaced with X86ISD::VSEXT node.
   23227     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
   23228       if (!ISD::isNormalLoad(N00.getNode()))
   23229         return SDValue();
   23230 
   23231     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
    23232       SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
    23233                                 N00, N1);
   23234       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
   23235     }
   23236   }
   23237   return SDValue();
   23238 }
   23239 
   23240 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
   23241                                   TargetLowering::DAGCombinerInfo &DCI,
   23242                                   const X86Subtarget *Subtarget) {
   23243   SDValue N0 = N->getOperand(0);
   23244   EVT VT = N->getValueType(0);
   23245 
    23246   // (i8,i32 sext (sdivrem (i8 x, i8 y))) ->
    23247   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)))
   23248   // This exposes the sext to the sdivrem lowering, so that it directly extends
   23249   // from AH (which we otherwise need to do contortions to access).
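            // Illustrative C-level source (an assumption, not a test case):
            //   int r = (signed char)a % (signed char)b;
            // yields (sext (i8 srem ...)) to i32; SDIVREM8_SEXT_HREG lets the
            // remainder be sign-extended straight out of AH (e.g. via movsx)
            // instead of being shifted out of AX first.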
   23250   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
   23251       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
   23252     SDLoc dl(N);
   23253     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
   23254     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
   23255                             N0.getOperand(0), N0.getOperand(1));
   23256     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
   23257     return R.getValue(1);
   23258   }
   23259 
   23260   if (!DCI.isBeforeLegalizeOps())
   23261     return SDValue();
   23262 
   23263   if (!Subtarget->hasFp256())
   23264     return SDValue();
   23265 
   23266   if (VT.isVector() && VT.getSizeInBits() == 256) {
   23267     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
   23268     if (R.getNode())
   23269       return R;
   23270   }
   23271 
   23272   return SDValue();
   23273 }
   23274 
   23275 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
   23276                                  const X86Subtarget* Subtarget) {
   23277   SDLoc dl(N);
   23278   EVT VT = N->getValueType(0);
   23279 
   23280   // Let legalize expand this if it isn't a legal type yet.
   23281   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   23282     return SDValue();
   23283 
   23284   EVT ScalarVT = VT.getScalarType();
   23285   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
   23286       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
   23287     return SDValue();
   23288 
   23289   SDValue A = N->getOperand(0);
   23290   SDValue B = N->getOperand(1);
   23291   SDValue C = N->getOperand(2);
   23292 
   23293   bool NegA = (A.getOpcode() == ISD::FNEG);
   23294   bool NegB = (B.getOpcode() == ISD::FNEG);
   23295   bool NegC = (C.getOpcode() == ISD::FNEG);
   23296 
    23297   // The multiplication is negated when exactly one of A and B is negated.
   23298   bool NegMul = (NegA != NegB);
   23299   if (NegA)
   23300     A = A.getOperand(0);
   23301   if (NegB)
   23302     B = B.getOperand(0);
   23303   if (NegC)
   23304     C = C.getOperand(0);
   23305 
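            // Illustrative mapping of the cases handled below (sketch):
            //   fma( a, b,  c) -> FMADD      fma( a, b, -c) -> FMSUB
            //   fma(-a, b,  c) -> FNMADD     fma(-a, b, -c) -> FNMSUB
            // Negating both A and B cancels out, so only NegA xor NegB matters.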
   23306   unsigned Opcode;
   23307   if (!NegMul)
   23308     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
   23309   else
   23310     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
   23311 
   23312   return DAG.getNode(Opcode, dl, VT, A, B, C);
   23313 }
   23314 
   23315 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
   23316                                   TargetLowering::DAGCombinerInfo &DCI,
   23317                                   const X86Subtarget *Subtarget) {
   23318   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
   23319   //           (and (i32 x86isd::setcc_carry), 1)
   23320   // This eliminates the zext. This transformation is necessary because
   23321   // ISD::SETCC is always legalized to i8.
   23322   SDLoc dl(N);
   23323   SDValue N0 = N->getOperand(0);
   23324   EVT VT = N->getValueType(0);
   23325 
   23326   if (N0.getOpcode() == ISD::AND &&
   23327       N0.hasOneUse() &&
   23328       N0.getOperand(0).hasOneUse()) {
   23329     SDValue N00 = N0.getOperand(0);
   23330     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   23331       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   23332       if (!C || C->getZExtValue() != 1)
   23333         return SDValue();
   23334       return DAG.getNode(ISD::AND, dl, VT,
   23335                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   23336                                      N00.getOperand(0), N00.getOperand(1)),
   23337                          DAG.getConstant(1, VT));
   23338     }
   23339   }
   23340 
   23341   if (N0.getOpcode() == ISD::TRUNCATE &&
   23342       N0.hasOneUse() &&
   23343       N0.getOperand(0).hasOneUse()) {
   23344     SDValue N00 = N0.getOperand(0);
   23345     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   23346       return DAG.getNode(ISD::AND, dl, VT,
   23347                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   23348                                      N00.getOperand(0), N00.getOperand(1)),
   23349                          DAG.getConstant(1, VT));
   23350     }
   23351   }
   23352   if (VT.is256BitVector()) {
   23353     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
   23354     if (R.getNode())
   23355       return R;
   23356   }
   23357 
    23358   // (i8,i32 zext (udivrem (i8 x, i8 y))) ->
    23359   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)))
   23360   // This exposes the zext to the udivrem lowering, so that it directly extends
   23361   // from AH (which we otherwise need to do contortions to access).
   23362   if (N0.getOpcode() == ISD::UDIVREM &&
   23363       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
   23364       (VT == MVT::i32 || VT == MVT::i64)) {
   23365     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
   23366     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
   23367                             N0.getOperand(0), N0.getOperand(1));
   23368     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
   23369     return R.getValue(1);
   23370   }
   23371 
   23372   return SDValue();
   23373 }
   23374 
   23375 // Optimize x == -y --> x+y == 0
   23376 //          x != -y --> x+y != 0
   23377 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   23378                                       const X86Subtarget* Subtarget) {
   23379   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
   23380   SDValue LHS = N->getOperand(0);
   23381   SDValue RHS = N->getOperand(1);
   23382   EVT VT = N->getValueType(0);
   23383   SDLoc DL(N);
   23384 
   23385   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
   23386     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
   23387       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
   23388         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS,
   23389                                    LHS.getOperand(1));
   23390         return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
   23391                             DAG.getConstant(0, addV.getValueType()), CC);
   23392       }
   23393   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
   23394     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
   23395       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
   23396         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS,
   23397                                    RHS.getOperand(1));
   23398         return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
   23399                             DAG.getConstant(0, addV.getValueType()), CC);
   23400       }
   23401 
   23402   if (VT.getScalarType() == MVT::i1 &&
   23403       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
   23404     bool IsSEXT0 =
   23405         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
   23406         (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
   23407     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
   23408 
   23409     if (!IsSEXT0 || !IsVZero1) {
   23410       // Swap the operands and update the condition code.
   23411       std::swap(LHS, RHS);
   23412       CC = ISD::getSetCCSwappedOperands(CC);
   23413 
   23414       IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
   23415                 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
   23416       IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
   23417     }
   23418 
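              // Illustrative reasoning (sketch): each lane of (sext x) for an
              // i1 x is 0 or -1, so a signed compare against zero reduces to:
              //   setgt -> always 0, setle -> always 1,
              //   seteq/setge -> not x, setne/setlt -> x.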
   23419     if (IsSEXT0 && IsVZero1) {
   23420       assert(VT == LHS.getOperand(0).getValueType() &&
    23421              "Unexpected operand type");
   23422       if (CC == ISD::SETGT)
   23423         return DAG.getConstant(0, VT);
   23424       if (CC == ISD::SETLE)
   23425         return DAG.getConstant(1, VT);
   23426       if (CC == ISD::SETEQ || CC == ISD::SETGE)
   23427         return DAG.getNOT(DL, LHS.getOperand(0), VT);
   23428 
   23429       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
   23430              "Unexpected condition code!");
   23431       return LHS.getOperand(0);
   23432     }
   23433   }
   23434 
   23435   return SDValue();
   23436 }
   23437 
   23438 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
   23439                                          SelectionDAG &DAG) {
   23440   SDLoc dl(Load);
   23441   MVT VT = Load->getSimpleValueType(0);
   23442   MVT EVT = VT.getVectorElementType();
   23443   SDValue Addr = Load->getOperand(1);
   23444   SDValue NewAddr = DAG.getNode(
   23445       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
   23446       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
   23447 
   23448   SDValue NewLoad =
   23449       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
   23450                   DAG.getMachineFunction().getMachineMemOperand(
   23451                       Load->getMemOperand(), 0, EVT.getStoreSize()));
   23452   return NewLoad;
   23453 }
   23454 
   23455 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
   23456                                       const X86Subtarget *Subtarget) {
   23457   SDLoc dl(N);
   23458   MVT VT = N->getOperand(1)->getSimpleValueType(0);
   23459   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
   23460          "X86insertps is only defined for v4x32");
   23461 
   23462   SDValue Ld = N->getOperand(1);
   23463   if (MayFoldLoad(Ld)) {
   23464     // Extract the countS bits from the immediate so we can get the proper
   23465     // address when narrowing the vector load to a specific element.
   23466     // When the second source op is a memory address, insertps doesn't use
   23467     // countS and just gets an f32 from that address.
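              // Illustrative example (sketch): for
              //   (X86ISD::INSERTPS %dst, (load %ptr), imm) with countS = 2,
              // only the f32 at %ptr + 8 is needed, so the load is narrowed to
              // that element and the memory form of insertps (which ignores
              // countS) consumes it directly.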
   23468     unsigned DestIndex =
   23469         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
   23470 
   23471     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
   23472 
   23473     // Create this as a scalar to vector to match the instruction pattern.
   23474     SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
   23475     // countS bits are ignored when loading from memory on insertps, which
   23476     // means we don't need to explicitly set them to 0.
   23477     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
   23478                        LoadScalarToVector, N->getOperand(2));
   23479   }
   23480   return SDValue();
   23481 }
   23482 
   23483 static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
   23484   SDValue V0 = N->getOperand(0);
   23485   SDValue V1 = N->getOperand(1);
   23486   SDLoc DL(N);
   23487   EVT VT = N->getValueType(0);
   23488 
   23489   // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
   23490   // operands and changing the mask to 1. This saves us a bunch of
   23491   // pattern-matching possibilities related to scalar math ops in SSE/AVX.
    23492   // X86InstrInfo knows how to commute this back after instruction selection
   23493   // if it would help register allocation.
   23494 
   23495   // TODO: If optimizing for size or a processor that doesn't suffer from
   23496   // partial register update stalls, this should be transformed into a MOVSD
   23497   // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
   23498 
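            // Illustrative example (assuming mask bit i selects element i from
            // the second operand):
            //   (X86ISD::BLENDI V0, V1, 2) == < V0[0], V1[1] >
            //   (X86ISD::BLENDI V1, V0, 1) == < V0[0], V1[1] >
            // so the swapped form with mask 1 is equivalent and lines up with
            // the scalar-math patterns more directly.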
   23499   if (VT == MVT::v2f64)
   23500     if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
   23501       if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
   23502         SDValue NewMask = DAG.getConstant(1, MVT::i8);
   23503         return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
   23504       }
   23505 
   23506   return SDValue();
   23507 }
   23508 
    23509 // Helper function of PerformSETCCCombine. It materializes "setb reg" as
    23510 // "sbb reg,reg", since the latter can be extended without a zext and
    23511 // produces an all-ones value, which is more useful than 0/1 in some cases.
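          // Illustrative x86 sketch (assumed, not actual emitted output):
          //   setb %al ; movzbl %al, %eax      (setcc followed by a zext)
          //   sbb %eax, %eax ; andl $1, %eax   (the form used here)
          // The intermediate sbb result is all-ones when the carry is set, which
          // other combines can sometimes use without the final mask.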
   23512 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
   23513                                MVT VT) {
   23514   if (VT == MVT::i8)
   23515     return DAG.getNode(ISD::AND, DL, VT,
   23516                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
   23517                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
   23518                        DAG.getConstant(1, VT));
    23519   assert (VT == MVT::i1 && "Unexpected type for SETCC node");
   23520   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
   23521                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
   23522                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
   23523 }
   23524 
   23525 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
   23526 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
   23527                                    TargetLowering::DAGCombinerInfo &DCI,
   23528                                    const X86Subtarget *Subtarget) {
   23529   SDLoc DL(N);
   23530   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
   23531   SDValue EFLAGS = N->getOperand(1);
   23532 
   23533   if (CC == X86::COND_A) {
   23534     // Try to convert COND_A into COND_B in an attempt to facilitate
   23535     // materializing "setb reg".
   23536     //
    23537     // Do not flip "e > c", where "c" is a constant, because the CMP
    23538     // instruction cannot take an immediate as its first operand.
   23539     //
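              // For example (illustrative): "a >u b" checked as COND_A of
              // (sub a, b) is the same predicate as COND_B of (sub b, a),
              // and the COND_B form can then be materialized with MaterializeSETB.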
   23540     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
   23541         EFLAGS.getValueType().isInteger() &&
   23542         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
   23543       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
   23544                                    EFLAGS.getNode()->getVTList(),
   23545                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
   23546       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
   23547       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
   23548     }
   23549   }
   23550 
   23551   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
   23552   // a zext and produces an all-ones bit which is more useful than 0/1 in some
   23553   // cases.
   23554   if (CC == X86::COND_B)
   23555     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
   23556 
   23557   SDValue Flags;
   23558 
   23559   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
   23560   if (Flags.getNode()) {
   23561     SDValue Cond = DAG.getConstant(CC, MVT::i8);
   23562     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
   23563   }
   23564 
   23565   return SDValue();
   23566 }
   23567 
   23568 // Optimize branch condition evaluation.
   23569 //
   23570 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
   23571                                     TargetLowering::DAGCombinerInfo &DCI,
   23572                                     const X86Subtarget *Subtarget) {
   23573   SDLoc DL(N);
   23574   SDValue Chain = N->getOperand(0);
   23575   SDValue Dest = N->getOperand(1);
   23576   SDValue EFLAGS = N->getOperand(3);
   23577   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
   23578 
   23579   SDValue Flags;
   23580 
   23581   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
   23582   if (Flags.getNode()) {
   23583     SDValue Cond = DAG.getConstant(CC, MVT::i8);
   23584     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
   23585                        Flags);
   23586   }
   23587 
   23588   return SDValue();
   23589 }
   23590 
   23591 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   23592                                                          SelectionDAG &DAG) {
   23593   // Take advantage of vector comparisons producing 0 or -1 in each lane to
   23594   // optimize away operation when it's from a constant.
   23595   //
   23596   // The general transformation is:
   23597   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
   23598   //       AND(VECTOR_CMP(x,y), constant2)
   23599   //    constant2 = UNARYOP(constant)
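            // Illustrative instance (a sketch, assuming a v4i32 constant):
            //   (sint_to_fp (and (setcc x, y, cc), <1, 1, 1, 1>))
            // becomes
            //   (bitcast (and (setcc x, y, cc), (bitcast <1.0, 1.0, 1.0, 1.0>)))
            // because each setcc lane is 0 or -1, so the AND either keeps the
            // converted lane or produces the all-zero bit pattern (0.0).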
   23600 
   23601   // Early exit if this isn't a vector operation, the operand of the
   23602   // unary operation isn't a bitwise AND, or if the sizes of the operations
   23603   // aren't the same.
   23604   EVT VT = N->getValueType(0);
   23605   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
   23606       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
   23607       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
   23608     return SDValue();
   23609 
   23610   // Now check that the other operand of the AND is a constant. We could
    23611   // make the transformation for non-constant splats as well, but it's unclear
    23612   // that it would be beneficial as it would not eliminate any operations, just
   23613   // perform one more step in scalar code before moving to the vector unit.
   23614   if (BuildVectorSDNode *BV =
   23615           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
   23616     // Bail out if the vector isn't a constant.
   23617     if (!BV->isConstant())
   23618       return SDValue();
   23619 
   23620     // Everything checks out. Build up the new and improved node.
   23621     SDLoc DL(N);
   23622     EVT IntVT = BV->getValueType(0);
   23623     // Create a new constant of the appropriate type for the transformed
   23624     // DAG.
   23625     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
   23626     // The AND node needs bitcasts to/from an integer vector type around it.
   23627     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
   23628     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
   23629                                  N->getOperand(0)->getOperand(0), MaskConst);
   23630     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
   23631     return Res;
   23632   }
   23633 
   23634   return SDValue();
   23635 }
   23636 
   23637 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
   23638                                         const X86Subtarget *Subtarget) {
   23639   // First try to optimize away the conversion entirely when it's
   23640   // conditionally from a constant. Vectors only.
   23641   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
   23642   if (Res != SDValue())
   23643     return Res;
   23644 
   23645   // Now move on to more general possibilities.
   23646   SDValue Op0 = N->getOperand(0);
   23647   EVT InVT = Op0->getValueType(0);
   23648 
    23649   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)), for v4i8 and v8i8
   23650   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
   23651     SDLoc dl(N);
   23652     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
   23653     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
   23654     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
   23655   }
   23656 
   23657   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
   23658   // a 32-bit target where SSE doesn't support i64->FP operations.
   23659   if (Op0.getOpcode() == ISD::LOAD) {
   23660     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
   23661     EVT VT = Ld->getValueType(0);
   23662 
   23663     // This transformation is not supported if the result type is f16
   23664     if (N->getValueType(0) == MVT::f16)
   23665       return SDValue();
   23666 
   23667     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
   23668         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
   23669         !Subtarget->is64Bit() && VT == MVT::i64) {
   23670       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
   23671           SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
   23672       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
   23673       return FILDChain;
   23674     }
   23675   }
   23676   return SDValue();
   23677 }
   23678 
   23679 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
   23680 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
   23681                                  X86TargetLowering::DAGCombinerInfo &DCI) {
   23682   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
   23683   // the result is either zero or one (depending on the input carry bit).
   23684   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
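            // Illustrative example: (X86ISD::ADC 0, 0, EFLAGS) computes
            // 0 + 0 + CF, i.e. exactly the carry bit, which is also what
            // (and (setcc_carry EFLAGS), 1) produces.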
   23685   if (X86::isZeroNode(N->getOperand(0)) &&
   23686       X86::isZeroNode(N->getOperand(1)) &&
   23687       // We don't have a good way to replace an EFLAGS use, so only do this when
   23688       // dead right now.
   23689       SDValue(N, 1).use_empty()) {
   23690     SDLoc DL(N);
   23691     EVT VT = N->getValueType(0);
   23692     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
   23693     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
   23694                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   23695                                            DAG.getConstant(X86::COND_B,MVT::i8),
   23696                                            N->getOperand(2)),
   23697                                DAG.getConstant(1, VT));
   23698     return DCI.CombineTo(N, Res1, CarryOut);
   23699   }
   23700 
   23701   return SDValue();
   23702 }
   23703 
   23704 // fold (add Y, (sete  X, 0)) -> adc  0, Y
   23705 //      (add Y, (setne X, 0)) -> sbb -1, Y
   23706 //      (sub (sete  X, 0), Y) -> sbb  0, Y
   23707 //      (sub (setne X, 0), Y) -> adc -1, Y
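          // Illustrative source-level example (an assumption): for
          //   y += (x == 0);
          // we emit (cmp x, 1), which sets the carry flag exactly when x == 0
          // (unsigned x < 1), and then a single (adc 0, y) folds the compare
          // result into the add.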
   23708 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
   23709   SDLoc DL(N);
   23710 
   23711   // Look through ZExts.
   23712   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
   23713   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
   23714     return SDValue();
   23715 
   23716   SDValue SetCC = Ext.getOperand(0);
   23717   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
   23718     return SDValue();
   23719 
   23720   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
   23721   if (CC != X86::COND_E && CC != X86::COND_NE)
   23722     return SDValue();
   23723 
   23724   SDValue Cmp = SetCC.getOperand(1);
   23725   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
   23726       !X86::isZeroNode(Cmp.getOperand(1)) ||
   23727       !Cmp.getOperand(0).getValueType().isInteger())
   23728     return SDValue();
   23729 
   23730   SDValue CmpOp0 = Cmp.getOperand(0);
   23731   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
   23732                                DAG.getConstant(1, CmpOp0.getValueType()));
   23733 
   23734   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
   23735   if (CC == X86::COND_NE)
   23736     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
   23737                        DL, OtherVal.getValueType(), OtherVal,
   23738                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
   23739   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
   23740                      DL, OtherVal.getValueType(), OtherVal,
   23741                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
   23742 }
   23743 
    23744 /// PerformAddCombine - Do target-specific dag combines on integer adds.
   23745 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
   23746                                  const X86Subtarget *Subtarget) {
   23747   EVT VT = N->getValueType(0);
   23748   SDValue Op0 = N->getOperand(0);
   23749   SDValue Op1 = N->getOperand(1);
   23750 
   23751   // Try to synthesize horizontal adds from adds of shuffles.
   23752   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
   23753        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
   23754       isHorizontalBinOp(Op0, Op1, true))
   23755     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
   23756 
   23757   return OptimizeConditionalInDecrement(N, DAG);
   23758 }
   23759 
   23760 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
   23761                                  const X86Subtarget *Subtarget) {
   23762   SDValue Op0 = N->getOperand(0);
   23763   SDValue Op1 = N->getOperand(1);
   23764 
   23765   // X86 can't encode an immediate LHS of a sub. See if we can push the
   23766   // negation into a preceding instruction.
   23767   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
   23768     // If the RHS of the sub is a XOR with one use and a constant, invert the
   23769     // immediate. Then add one to the LHS of the sub so we can turn
   23770     // X-Y -> X+~Y+1, saving one register.
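              // For example (illustrative): (sub 5, (xor x, 3)) becomes
              // (add (xor x, ~3), 6), since ~(x ^ 3) == (x ^ ~3) and
              // 5 - v == 5 + ~v + 1.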
   23771     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
   23772         isa<ConstantSDNode>(Op1.getOperand(1))) {
   23773       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
   23774       EVT VT = Op0.getValueType();
   23775       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
   23776                                    Op1.getOperand(0),
   23777                                    DAG.getConstant(~XorC, VT));
   23778       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
   23779                          DAG.getConstant(C->getAPIntValue()+1, VT));
   23780     }
   23781   }
   23782 
    23783   // Try to synthesize horizontal subs from subs of shuffles.
   23784   EVT VT = N->getValueType(0);
   23785   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
   23786        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
   23787       isHorizontalBinOp(Op0, Op1, true))
   23788     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
   23789 
   23790   return OptimizeConditionalInDecrement(N, DAG);
   23791 }
   23792 
    23793 /// performVZEXTCombine - Performs dag combines on X86ISD::VZEXT nodes.
   23794 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
   23795                                    TargetLowering::DAGCombinerInfo &DCI,
   23796                                    const X86Subtarget *Subtarget) {
   23797   SDLoc DL(N);
   23798   MVT VT = N->getSimpleValueType(0);
   23799   SDValue Op = N->getOperand(0);
   23800   MVT OpVT = Op.getSimpleValueType();
   23801   MVT OpEltVT = OpVT.getVectorElementType();
   23802   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
   23803 
   23804   // (vzext (bitcast (vzext (x)) -> (vzext x)
   23805   SDValue V = Op;
   23806   while (V.getOpcode() == ISD::BITCAST)
   23807     V = V.getOperand(0);
   23808 
   23809   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
   23810     MVT InnerVT = V.getSimpleValueType();
   23811     MVT InnerEltVT = InnerVT.getVectorElementType();
   23812 
   23813     // If the element sizes match exactly, we can just do one larger vzext. This
   23814     // is always an exact type match as vzext operates on integer types.
   23815     if (OpEltVT == InnerEltVT) {
   23816       assert(OpVT == InnerVT && "Types must match for vzext!");
   23817       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
   23818     }
   23819 
   23820     // The only other way we can combine them is if only a single element of the
   23821     // inner vzext is used in the input to the outer vzext.
   23822     if (InnerEltVT.getSizeInBits() < InputBits)
   23823       return SDValue();
   23824 
   23825     // In this case, the inner vzext is completely dead because we're going to
   23826     // only look at bits inside of the low element. Just do the outer vzext on
   23827     // a bitcast of the input to the inner.
   23828     return DAG.getNode(X86ISD::VZEXT, DL, VT,
   23829                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
   23830   }
   23831 
   23832   // Check if we can bypass extracting and re-inserting an element of an input
    23833   // vector. Essentially:
   23834   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
   23835   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   23836       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   23837       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
   23838     SDValue ExtractedV = V.getOperand(0);
   23839     SDValue OrigV = ExtractedV.getOperand(0);
   23840     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
   23841       if (ExtractIdx->getZExtValue() == 0) {
   23842         MVT OrigVT = OrigV.getSimpleValueType();
   23843         // Extract a subvector if necessary...
   23844         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
   23845           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
   23846           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
   23847                                     OrigVT.getVectorNumElements() / Ratio);
   23848           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
   23849                               DAG.getIntPtrConstant(0));
   23850         }
   23851         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
   23852         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
   23853       }
   23854   }
   23855 
   23856   return SDValue();
   23857 }
   23858 
   23859 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   23860                                              DAGCombinerInfo &DCI) const {
   23861   SelectionDAG &DAG = DCI.DAG;
   23862   switch (N->getOpcode()) {
   23863   default: break;
   23864   case ISD::EXTRACT_VECTOR_ELT:
   23865     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
   23866   case ISD::VSELECT:
   23867   case ISD::SELECT:
   23868   case X86ISD::SHRUNKBLEND:
   23869     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
   23870   case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
   23871   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   23872   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   23873   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
   23874   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
   23875   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
   23876   case ISD::SHL:
   23877   case ISD::SRA:
   23878   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
   23879   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   23880   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   23881   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
   23882   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
   23883   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
   23884   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   23885   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
   23886   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
   23887   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   23888   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
   23889   case X86ISD::FXOR:
   23890   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   23891   case X86ISD::FMIN:
   23892   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
   23893   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
   23894   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
   23895   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   23896   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   23897   case ISD::ANY_EXTEND:
   23898   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
   23899   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
   23900   case ISD::SIGN_EXTEND_INREG:
   23901     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
   23902   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
   23903   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
   23904   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
   23905   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
   23906   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   23907   case X86ISD::SHUFP:       // Handle all target specific shuffles
   23908   case X86ISD::PALIGNR:
   23909   case X86ISD::UNPCKH:
   23910   case X86ISD::UNPCKL:
   23911   case X86ISD::MOVHLPS:
   23912   case X86ISD::MOVLHPS:
   23913   case X86ISD::PSHUFB:
   23914   case X86ISD::PSHUFD:
   23915   case X86ISD::PSHUFHW:
   23916   case X86ISD::PSHUFLW:
   23917   case X86ISD::MOVSS:
   23918   case X86ISD::MOVSD:
   23919   case X86ISD::VPERMILPI:
   23920   case X86ISD::VPERM2X128:
   23921   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   23922   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
   23923   case ISD::INTRINSIC_WO_CHAIN:
   23924     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
   23925   case X86ISD::INSERTPS: {
   23926     if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
   23927       return PerformINSERTPSCombine(N, DAG, Subtarget);
   23928     break;
   23929   }
   23930   case X86ISD::BLENDI:    return PerformBLENDICombine(N, DAG);
   23931   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
   23932   }
   23933 
   23934   return SDValue();
   23935 }
   23936 
   23937 /// isTypeDesirableForOp - Return true if the target has native support for
   23938 /// the specified value type and it is 'desirable' to use the type for the
   23939 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
   23940 /// instruction encodings are longer and some i16 instructions are slow.
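          /// For example (illustrative): a register-register "addw %bx, %ax" is
          /// the 32-bit "addl %ebx, %eax" encoding plus a 0x66 operand-size
          /// prefix, so the promoted 32-bit form is no larger here and sidesteps
          /// the slower 16-bit instructions.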
   23941 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   23942   if (!isTypeLegal(VT))
   23943     return false;
   23944   if (VT != MVT::i16)
   23945     return true;
   23946 
   23947   switch (Opc) {
   23948   default:
   23949     return true;
   23950   case ISD::LOAD:
   23951   case ISD::SIGN_EXTEND:
   23952   case ISD::ZERO_EXTEND:
   23953   case ISD::ANY_EXTEND:
   23954   case ISD::SHL:
   23955   case ISD::SRL:
   23956   case ISD::SUB:
   23957   case ISD::ADD:
   23958   case ISD::MUL:
   23959   case ISD::AND:
   23960   case ISD::OR:
   23961   case ISD::XOR:
   23962     return false;
   23963   }
   23964 }
   23965 
    23966 /// IsDesirableToPromoteOp - This method queries the target whether it is
    23967 /// beneficial for the dag combiner to promote the specified node. If true, it
   23968 /// should return the desired promotion type by reference.
   23969 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
   23970   EVT VT = Op.getValueType();
   23971   if (VT != MVT::i16)
   23972     return false;
   23973 
   23974   bool Promote = false;
   23975   bool Commute = false;
   23976   switch (Op.getOpcode()) {
   23977   default: break;
   23978   case ISD::LOAD: {
   23979     LoadSDNode *LD = cast<LoadSDNode>(Op);
   23980     // If the non-extending load has a single use and it's not live out, then it
   23981     // might be folded.
   23982     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
   23983                                                      Op.hasOneUse()*/) {
   23984       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   23985              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
    23986         // The only case where we'd want to promote LOAD (rather than having it
    23987         // promoted as an operand) is when its only use is live-out.
   23988         if (UI->getOpcode() != ISD::CopyToReg)
   23989           return false;
   23990       }
   23991     }
   23992     Promote = true;
   23993     break;
   23994   }
   23995   case ISD::SIGN_EXTEND:
   23996   case ISD::ZERO_EXTEND:
   23997   case ISD::ANY_EXTEND:
   23998     Promote = true;
   23999     break;
   24000   case ISD::SHL:
   24001   case ISD::SRL: {
   24002     SDValue N0 = Op.getOperand(0);
   24003     // Look out for (store (shl (load), x)).
   24004     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
   24005       return false;
   24006     Promote = true;
   24007     break;
   24008   }
   24009   case ISD::ADD:
   24010   case ISD::MUL:
   24011   case ISD::AND:
   24012   case ISD::OR:
   24013   case ISD::XOR:
   24014     Commute = true;
   24015     // fallthrough
   24016   case ISD::SUB: {
   24017     SDValue N0 = Op.getOperand(0);
   24018     SDValue N1 = Op.getOperand(1);
   24019     if (!Commute && MayFoldLoad(N1))
   24020       return false;
   24021     // Avoid disabling potential load folding opportunities.
   24022     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
   24023       return false;
   24024     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
   24025       return false;
   24026     Promote = true;
   24027   }
   24028   }
   24029 
   24030   PVT = MVT::i32;
   24031   return Promote;
   24032 }
   24033 
   24034 //===----------------------------------------------------------------------===//
   24035 //                           X86 Inline Assembly Support
   24036 //===----------------------------------------------------------------------===//
   24037 
    24038 // Helper to match a string against a sequence of whitespace-separated pieces.
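          // For example (illustrative):
          //   matchAsm("bswap $0",  {"bswap", "$0"}) -> true
          //   matchAsm("bswapl $0", {"bswap", "$0"}) -> false, since "bswap"
          //   only matches a prefix of the "bswapl" token.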
   24039 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
   24040   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
   24041 
   24042   for (StringRef Piece : Pieces) {
   24043     if (!S.startswith(Piece)) // Check if the piece matches.
   24044       return false;
   24045 
   24046     S = S.substr(Piece.size());
   24047     StringRef::size_type Pos = S.find_first_not_of(" \t");
   24048     if (Pos == 0) // We matched a prefix.
   24049       return false;
   24050 
   24051     S = S.substr(Pos);
   24052   }
   24053 
   24054   return S.empty();
   24055 }
   24056 
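          // Returns true if the split constraint list consists only of the usual
          // x86 flag clobbers: ~{cc}, ~{flags}, ~{fpsr}, and optionally
          // ~{dirflag}, with nothing else present.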
   24057 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
   24058 
   24059   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
   24060     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
   24061         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
   24062         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
   24063 
   24064       if (AsmPieces.size() == 3)
   24065         return true;
   24066       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
   24067         return true;
   24068     }
   24069   }
   24070   return false;
   24071 }
   24072 
   24073 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   24074   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
   24075 
   24076   std::string AsmStr = IA->getAsmString();
   24077 
   24078   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
   24079   if (!Ty || Ty->getBitWidth() % 16 != 0)
   24080     return false;
   24081 
   24082   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
   24083   SmallVector<StringRef, 4> AsmPieces;
   24084   SplitString(AsmStr, AsmPieces, ";\n");
   24085 
   24086   switch (AsmPieces.size()) {
   24087   default: return false;
   24088   case 1:
   24089     // FIXME: this should verify that we are targeting a 486 or better.  If not,
   24090     // we will turn this bswap into something that will be lowered to logical
   24091     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
   24092     // lower so don't worry about this.
   24093     // bswap $0
   24094     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
   24095         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
   24096         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
   24097         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
   24098         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
   24099         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
   24100       // No need to check constraints, nothing other than the equivalent of
   24101       // "=r,0" would be valid here.
   24102       return IntrinsicLowering::LowerToByteSwap(CI);
   24103     }
   24104 
   24105     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
   24106     if (CI->getType()->isIntegerTy(16) &&
   24107         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   24108         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
   24109          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
   24110       AsmPieces.clear();
   24111       const std::string &ConstraintsStr = IA->getConstraintString();
   24112       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   24113       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   24114       if (clobbersFlagRegisters(AsmPieces))
   24115         return IntrinsicLowering::LowerToByteSwap(CI);
   24116     }
   24117     break;
   24118   case 3:
   24119     if (CI->getType()->isIntegerTy(32) &&
   24120         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   24121         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
   24122         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
   24123         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
   24124       AsmPieces.clear();
   24125       const std::string &ConstraintsStr = IA->getConstraintString();
   24126       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   24127       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   24128       if (clobbersFlagRegisters(AsmPieces))
   24129         return IntrinsicLowering::LowerToByteSwap(CI);
   24130     }
   24131 
   24132     if (CI->getType()->isIntegerTy(64)) {
   24133       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
   24134       if (Constraints.size() >= 2 &&
   24135           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
   24136           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
   24137         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
   24138         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
   24139             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
   24140             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
   24141           return IntrinsicLowering::LowerToByteSwap(CI);
   24142       }
   24143     }
   24144     break;
   24145   }
   24146   return false;
   24147 }
   24148 
   24149 /// getConstraintType - Given a constraint letter, return the type of
   24150 /// constraint it is for this target.
   24151 X86TargetLowering::ConstraintType
   24152 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
   24153   if (Constraint.size() == 1) {
   24154     switch (Constraint[0]) {
   24155     case 'R':
   24156     case 'q':
   24157     case 'Q':
   24158     case 'f':
   24159     case 't':
   24160     case 'u':
   24161     case 'y':
   24162     case 'x':
   24163     case 'Y':
   24164     case 'l':
   24165       return C_RegisterClass;
   24166     case 'a':
   24167     case 'b':
   24168     case 'c':
   24169     case 'd':
   24170     case 'S':
   24171     case 'D':
   24172     case 'A':
   24173       return C_Register;
   24174     case 'I':
   24175     case 'J':
   24176     case 'K':
   24177     case 'L':
   24178     case 'M':
   24179     case 'N':
   24180     case 'G':
   24181     case 'C':
   24182     case 'e':
   24183     case 'Z':
   24184       return C_Other;
   24185     default:
   24186       break;
   24187     }
   24188   }
   24189   return TargetLowering::getConstraintType(Constraint);
   24190 }
   24191 
   24192 /// Examine constraint type and operand type and determine a weight value.
   24193 /// This object must already have been set up with the operand type
   24194 /// and the current alternative constraint selected.
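         /// Illustrative example: for constraint 'K' with a call operand that is
         /// the constant 100 (it fits in a signed 8-bit immediate), the result is
         /// CW_Constant; for a non-constant operand the weight stays CW_Invalid.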
   24195 TargetLowering::ConstraintWeight
   24196   X86TargetLowering::getSingleConstraintMatchWeight(
   24197     AsmOperandInfo &info, const char *constraint) const {
   24198   ConstraintWeight weight = CW_Invalid;
   24199   Value *CallOperandVal = info.CallOperandVal;
   24200   // If we don't have a value, we can't do a match,
   24201   // but allow it at the lowest weight.
   24202   if (!CallOperandVal)
   24203     return CW_Default;
   24204   Type *type = CallOperandVal->getType();
   24205   // Look at the constraint type.
   24206   switch (*constraint) {
   24207   default:
   24208     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
             // FALL THROUGH.
   24209   case 'R':
   24210   case 'q':
   24211   case 'Q':
   24212   case 'a':
   24213   case 'b':
   24214   case 'c':
   24215   case 'd':
   24216   case 'S':
   24217   case 'D':
   24218   case 'A':
   24219     if (type->isIntegerTy())
   24220       weight = CW_SpecificReg;
   24221     break;
   24222   case 'f':
   24223   case 't':
   24224   case 'u':
   24225     if (type->isFloatingPointTy())
   24226       weight = CW_SpecificReg;
   24227     break;
   24228   case 'y':
   24229     if (type->isX86_MMXTy() && Subtarget->hasMMX())
   24230       weight = CW_SpecificReg;
   24231     break;
   24232   case 'x':
   24233   case 'Y':
   24234     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
   24235         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
   24236       weight = CW_Register;
   24237     break;
   24238   case 'I':
   24239     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   24240       if (C->getZExtValue() <= 31)
   24241         weight = CW_Constant;
   24242     }
   24243     break;
   24244   case 'J':
   24245     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   24246       if (C->getZExtValue() <= 63)
   24247         weight = CW_Constant;
   24248     }
   24249     break;
   24250   case 'K':
   24251     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   24252       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
   24253         weight = CW_Constant;
   24254     }
   24255     break;
   24256   case 'L':
   24257     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   24258       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
   24259         weight = CW_Constant;
   24260     }
   24261     break;
   24262   case 'M':
   24263     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   24264       if (C->getZExtValue() <= 3)
   24265         weight = CW_Constant;
   24266     }
   24267     break;
   24268   case 'N':
   24269     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   24270       if (C->getZExtValue() <= 0xff)
   24271         weight = CW_Constant;
   24272     }
   24273     break;
   24274   case 'G':
   24275   case 'C':
   24276     if (isa<ConstantFP>(CallOperandVal)) {
   24277       weight = CW_Constant;
   24278     }
   24279     break;
   24280   case 'e':
   24281     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   24282       if ((C->getSExtValue() >= -0x80000000LL) &&
   24283           (C->getSExtValue() <= 0x7fffffffLL))
   24284         weight = CW_Constant;
   24285     }
   24286     break;
   24287   case 'Z':
   24288     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   24289       if (C->getZExtValue() <= 0xffffffff)
   24290         weight = CW_Constant;
   24291     }
   24292     break;
   24293   }
   24294   return weight;
   24295 }
   24296 
   24297 /// LowerXConstraint - try to replace an X constraint, which matches anything,
   24298 /// with another that has more specific requirements based on the type of the
   24299 /// corresponding operand.
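         /// Illustrative example: an "X" constraint on a floating-point operand is
         /// rewritten to "Y" when SSE2 is available and to "x" when only SSE1 is.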
   24300 const char *X86TargetLowering::
   24301 LowerXConstraint(EVT ConstraintVT) const {
   24302   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   24303   // 'f' like normal targets.
   24304   if (ConstraintVT.isFloatingPoint()) {
   24305     if (Subtarget->hasSSE2())
   24306       return "Y";
   24307     if (Subtarget->hasSSE1())
   24308       return "x";
   24309   }
   24310 
   24311   return TargetLowering::LowerXConstraint(ConstraintVT);
   24312 }
   24313 
   24314 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
   24315 /// vector.  If it is invalid, don't add anything to Ops.
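         /// Illustrative example: constraint 'N' only accepts constants no larger
         /// than 255, so a ConstantSDNode of 200 is pushed onto Ops as a target
         /// constant while a value of 300 leaves Ops untouched.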
   24316 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   24317                                                      std::string &Constraint,
   24318                                                      std::vector<SDValue>&Ops,
   24319                                                      SelectionDAG &DAG) const {
   24320   SDValue Result;
   24321 
   24322   // Only support length 1 constraints for now.
   24323   if (Constraint.length() > 1) return;
   24324 
   24325   char ConstraintLetter = Constraint[0];
   24326   switch (ConstraintLetter) {
   24327   default: break;
   24328   case 'I':
   24329     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   24330       if (C->getZExtValue() <= 31) {
   24331         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   24332         break;
   24333       }
   24334     }
   24335     return;
   24336   case 'J':
   24337     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   24338       if (C->getZExtValue() <= 63) {
   24339         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   24340         break;
   24341       }
   24342     }
   24343     return;
   24344   case 'K':
   24345     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   24346       if (isInt<8>(C->getSExtValue())) {
   24347         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   24348         break;
   24349       }
   24350     }
   24351     return;
   24352   case 'L':
   24353     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   24354       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
   24355           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
   24356         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
   24357         break;
   24358       }
   24359     }
   24360     return;
   24361   case 'M':
   24362     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   24363       if (C->getZExtValue() <= 3) {
   24364         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   24365         break;
   24366       }
   24367     }
   24368     return;
   24369   case 'N':
   24370     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   24371       if (C->getZExtValue() <= 255) {
   24372         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   24373         break;
   24374       }
   24375     }
   24376     return;
   24377   case 'O':
   24378     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   24379       if (C->getZExtValue() <= 127) {
   24380         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   24381         break;
   24382       }
   24383     }
   24384     return;
   24385   case 'e': {
   24386     // 32-bit signed value
   24387     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   24388       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   24389                                            C->getSExtValue())) {
   24390         // Widen to 64 bits here to get it sign extended.
   24391         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
   24392         break;
   24393       }
   24394       // FIXME gcc accepts some relocatable values here too, but only in
   24395       // certain memory models; it's complicated.
   24396     }
   24397     return;
   24398   }
   24399   case 'Z': {
   24400     // 32-bit unsigned value
   24401     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   24402       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   24403                                            C->getZExtValue())) {
   24404         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
   24405         break;
   24406       }
   24407     }
   24408     // FIXME gcc accepts some relocatable values here too, but only in certain
   24409     // memory models; it's complicated.
   24410     return;
   24411   }
   24412   case 'i': {
   24413     // Literal immediates are always ok.
   24414     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
   24415       // Widen to 64 bits here to get it sign extended.
   24416       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
   24417       break;
   24418     }
   24419 
   24420     // In any sort of PIC mode addresses need to be computed at runtime by
   24421     // adding in a register or some sort of table lookup.  These can't
   24422     // be used as immediates.
   24423     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
   24424       return;
   24425 
   24426     // If we are in non-pic codegen mode, we allow the address of a global (with
   24427     // an optional displacement) to be used with 'i'.
   24428     GlobalAddressSDNode *GA = nullptr;
   24429     int64_t Offset = 0;
   24430 
   24431     // Match either (GA), (GA+C), (GA+C1+C2), etc.
   24432     while (1) {
   24433       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
   24434         Offset += GA->getOffset();
   24435         break;
   24436       } else if (Op.getOpcode() == ISD::ADD) {
   24437         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   24438           Offset += C->getZExtValue();
   24439           Op = Op.getOperand(0);
   24440           continue;
   24441         }
   24442       } else if (Op.getOpcode() == ISD::SUB) {
   24443         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   24444           Offset += -C->getZExtValue();
   24445           Op = Op.getOperand(0);
   24446           continue;
   24447         }
   24448       }
   24449 
   24450       // Otherwise, this isn't something we can handle, reject it.
   24451       return;
   24452     }
   24453 
   24454     const GlobalValue *GV = GA->getGlobal();
   24455     // If we require an extra load to get this address, as in PIC mode, we
   24456     // can't accept it.
   24457     if (isGlobalStubReference(
   24458             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
   24459       return;
   24460 
   24461     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
   24462                                         GA->getValueType(0), Offset);
   24463     break;
   24464   }
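           // Illustrative example for the 'i' case above, in non-PIC codegen: an
           // operand of the form (add (GlobalAddress @gv), 4), where @gv is a
           // hypothetical global, becomes a TargetGlobalAddress of @gv with
           // offset 4; under GOT or stub PIC styles it is rejected because the
           // address would need an extra load.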
   24465   }
   24466 
   24467   if (Result.getNode()) {
   24468     Ops.push_back(Result);
   24469     return;
   24470   }
   24471   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   24472 }
   24473 
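         /// Illustrative examples (not exhaustive) of the mapping below: "r" with
         /// MVT::i32 maps to GR32RegClass, "x" with MVT::v4f32 maps to
         /// VR128RegClass, and "Q" with MVT::i8 maps to GR8_ABCD_LRegClass.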
   24474 std::pair<unsigned, const TargetRegisterClass *>
   24475 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   24476                                                 const std::string &Constraint,
   24477                                                 MVT VT) const {
   24478   // First, see if this is a constraint that directly corresponds to an LLVM
   24479   // register class.
   24480   if (Constraint.size() == 1) {
   24481     // GCC Constraint Letters
   24482     switch (Constraint[0]) {
   24483     default: break;
   24484       // TODO: Slight differences here in allocation order and leaving
   24485       // RIP in the class. Do they matter any more here than they do
   24486       // in the normal allocation?
   24487     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
   24488       if (Subtarget->is64Bit()) {
   24489         if (VT == MVT::i32 || VT == MVT::f32)
   24490           return std::make_pair(0U, &X86::GR32RegClass);
   24491         if (VT == MVT::i16)
   24492           return std::make_pair(0U, &X86::GR16RegClass);
   24493         if (VT == MVT::i8 || VT == MVT::i1)
   24494           return std::make_pair(0U, &X86::GR8RegClass);
   24495         if (VT == MVT::i64 || VT == MVT::f64)
   24496           return std::make_pair(0U, &X86::GR64RegClass);
   24497         break;
   24498       }
   24499       // 32-bit fallthrough
   24500     case 'Q':   // Q_REGS
   24501       if (VT == MVT::i32 || VT == MVT::f32)
   24502         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
   24503       if (VT == MVT::i16)
   24504         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
   24505       if (VT == MVT::i8 || VT == MVT::i1)
   24506         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
   24507       if (VT == MVT::i64)
   24508         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
   24509       break;
   24510     case 'r':   // GENERAL_REGS
   24511     case 'l':   // INDEX_REGS
   24512       if (VT == MVT::i8 || VT == MVT::i1)
   24513         return std::make_pair(0U, &X86::GR8RegClass);
   24514       if (VT == MVT::i16)
   24515         return std::make_pair(0U, &X86::GR16RegClass);
   24516       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
   24517         return std::make_pair(0U, &X86::GR32RegClass);
   24518       return std::make_pair(0U, &X86::GR64RegClass);
   24519     case 'R':   // LEGACY_REGS
   24520       if (VT == MVT::i8 || VT == MVT::i1)
   24521         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
   24522       if (VT == MVT::i16)
   24523         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
   24524       if (VT == MVT::i32 || !Subtarget->is64Bit())
   24525         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
   24526       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
   24527     case 'f':  // FP Stack registers.
   24528       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
   24529       // value to the correct fpstack register class.
   24530       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
   24531         return std::make_pair(0U, &X86::RFP32RegClass);
   24532       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
   24533         return std::make_pair(0U, &X86::RFP64RegClass);
   24534       return std::make_pair(0U, &X86::RFP80RegClass);
   24535     case 'y':   // MMX_REGS if MMX allowed.
   24536       if (!Subtarget->hasMMX()) break;
   24537       return std::make_pair(0U, &X86::VR64RegClass);
   24538     case 'Y':   // SSE_REGS if SSE2 allowed
   24539       if (!Subtarget->hasSSE2()) break;
   24540       // FALL THROUGH.
   24541     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
   24542       if (!Subtarget->hasSSE1()) break;
   24543 
   24544       switch (VT.SimpleTy) {
   24545       default: break;
   24546       // Scalar SSE types.
   24547       case MVT::f32:
   24548       case MVT::i32:
   24549         return std::make_pair(0U, &X86::FR32RegClass);
   24550       case MVT::f64:
   24551       case MVT::i64:
   24552         return std::make_pair(0U, &X86::FR64RegClass);
   24553       // Vector types.
   24554       case MVT::v16i8:
   24555       case MVT::v8i16:
   24556       case MVT::v4i32:
   24557       case MVT::v2i64:
   24558       case MVT::v4f32:
   24559       case MVT::v2f64:
   24560         return std::make_pair(0U, &X86::VR128RegClass);
   24561       // AVX types.
   24562       case MVT::v32i8:
   24563       case MVT::v16i16:
   24564       case MVT::v8i32:
   24565       case MVT::v4i64:
   24566       case MVT::v8f32:
   24567       case MVT::v4f64:
   24568         return std::make_pair(0U, &X86::VR256RegClass);
   24569       case MVT::v8f64:
   24570       case MVT::v16f32:
   24571       case MVT::v16i32:
   24572       case MVT::v8i64:
   24573         return std::make_pair(0U, &X86::VR512RegClass);
   24574       }
   24575       break;
   24576     }
   24577   }
   24578 
   24579   // Use the default implementation in TargetLowering to convert the register
   24580   // constraint into a member of a register class.
   24581   std::pair<unsigned, const TargetRegisterClass*> Res;
   24582   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
   24583 
   24584   // Not found as a standard register?
   24585   if (!Res.second) {
   24586     // Map "st(N)", for N in 0..7, to the corresponding FP stack register.
   24587     if (Constraint.size() == 7 && Constraint[0] == '{' &&
   24588         tolower(Constraint[1]) == 's' &&
   24589         tolower(Constraint[2]) == 't' &&
   24590         Constraint[3] == '(' &&
   24591         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
   24592         Constraint[5] == ')' &&
   24593         Constraint[6] == '}') {
   24594 
   24595       Res.first = X86::FP0+Constraint[4]-'0';
   24596       Res.second = &X86::RFP80RegClass;
   24597       return Res;
   24598     }
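             // Illustrative example: the constraint "{st(3)}" passes the check
             // above and maps to register X86::FP0+3 in RFP80RegClass.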
   24599 
   24600     // GCC allows "st(0)" to be called just plain "st".
   24601     if (StringRef("{st}").equals_lower(Constraint)) {
   24602       Res.first = X86::FP0;
   24603       Res.second = &X86::RFP80RegClass;
   24604       return Res;
   24605     }
   24606 
   24607     // flags -> EFLAGS
   24608     if (StringRef("{flags}").equals_lower(Constraint)) {
   24609       Res.first = X86::EFLAGS;
   24610       Res.second = &X86::CCRRegClass;
   24611       return Res;
   24612     }
   24613 
   24614     // 'A' means EAX + EDX.
   24615     if (Constraint == "A") {
   24616       Res.first = X86::EAX;
   24617       Res.second = &X86::GR32_ADRegClass;
   24618       return Res;
   24619     }
   24620     return Res;
   24621   }
   24622 
   24623   // Otherwise, check to see if this is a register class of the wrong value
   24624   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
   24625   // turn into {ax},{dx}.
   24626   if (Res.second->hasType(VT))
   24627     return Res;   // Correct type already, nothing to do.
   24628 
   24629   // All of the single-register GCC register classes map their values onto
   24630   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
   24631   // really want an 8-bit or 32-bit register, map to the appropriate register
   24632   // class and return the appropriate register.
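           // Illustrative example: "{ax}" with an i32 operand first resolves to
           // (X86::AX, GR16RegClass) and is remapped below to
           // (X86::EAX, GR32RegClass).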
   24633   if (Res.second == &X86::GR16RegClass) {
   24634     if (VT == MVT::i8 || VT == MVT::i1) {
   24635       unsigned DestReg = 0;
   24636       switch (Res.first) {
   24637       default: break;
   24638       case X86::AX: DestReg = X86::AL; break;
   24639       case X86::DX: DestReg = X86::DL; break;
   24640       case X86::CX: DestReg = X86::CL; break;
   24641       case X86::BX: DestReg = X86::BL; break;
   24642       }
   24643       if (DestReg) {
   24644         Res.first = DestReg;
   24645         Res.second = &X86::GR8RegClass;
   24646       }
   24647     } else if (VT == MVT::i32 || VT == MVT::f32) {
   24648       unsigned DestReg = 0;
   24649       switch (Res.first) {
   24650       default: break;
   24651       case X86::AX: DestReg = X86::EAX; break;
   24652       case X86::DX: DestReg = X86::EDX; break;
   24653       case X86::CX: DestReg = X86::ECX; break;
   24654       case X86::BX: DestReg = X86::EBX; break;
   24655       case X86::SI: DestReg = X86::ESI; break;
   24656       case X86::DI: DestReg = X86::EDI; break;
   24657       case X86::BP: DestReg = X86::EBP; break;
   24658       case X86::SP: DestReg = X86::ESP; break;
   24659       }
   24660       if (DestReg) {
   24661         Res.first = DestReg;
   24662         Res.second = &X86::GR32RegClass;
   24663       }
   24664     } else if (VT == MVT::i64 || VT == MVT::f64) {
   24665       unsigned DestReg = 0;
   24666       switch (Res.first) {
   24667       default: break;
   24668       case X86::AX: DestReg = X86::RAX; break;
   24669       case X86::DX: DestReg = X86::RDX; break;
   24670       case X86::CX: DestReg = X86::RCX; break;
   24671       case X86::BX: DestReg = X86::RBX; break;
   24672       case X86::SI: DestReg = X86::RSI; break;
   24673       case X86::DI: DestReg = X86::RDI; break;
   24674       case X86::BP: DestReg = X86::RBP; break;
   24675       case X86::SP: DestReg = X86::RSP; break;
   24676       }
   24677       if (DestReg) {
   24678         Res.first = DestReg;
   24679         Res.second = &X86::GR64RegClass;
   24680       }
   24681     }
   24682   } else if (Res.second == &X86::FR32RegClass ||
   24683              Res.second == &X86::FR64RegClass ||
   24684              Res.second == &X86::VR128RegClass ||
   24685              Res.second == &X86::VR256RegClass ||
   24686              Res.second == &X86::FR32XRegClass ||
   24687              Res.second == &X86::FR64XRegClass ||
   24688              Res.second == &X86::VR128XRegClass ||
   24689              Res.second == &X86::VR256XRegClass ||
   24690              Res.second == &X86::VR512RegClass) {
   24691     // Handle references to XMM physical registers that got mapped into the
   24692     // wrong class.  This can happen with constraints like {xmm0} where the
   24693     // target independent register mapper will just pick the first match it can
   24694     // find, ignoring the required type.
   24695 
   24696     if (VT == MVT::f32 || VT == MVT::i32)
   24697       Res.second = &X86::FR32RegClass;
   24698     else if (VT == MVT::f64 || VT == MVT::i64)
   24699       Res.second = &X86::FR64RegClass;
   24700     else if (X86::VR128RegClass.hasType(VT))
   24701       Res.second = &X86::VR128RegClass;
   24702     else if (X86::VR256RegClass.hasType(VT))
   24703       Res.second = &X86::VR256RegClass;
   24704     else if (X86::VR512RegClass.hasType(VT))
   24705       Res.second = &X86::VR512RegClass;
   24706   }
   24707 
   24708   return Res;
   24709 }
   24710 
   24711 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
   24712                                             Type *Ty) const {
   24713   // Scaling factors are not free at all.
   24714   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
   24715   // will take 2 allocations in the out-of-order engine instead of 1
   24716   // for plain addressing mode, i.e. inst (reg1).
   24717   // E.g.,
   24718   // vaddps (%rsi,%rdx), %ymm0, %ymm1
   24719   // Requires two allocations (one for the load, one for the computation)
   24720   // whereas:
   24721   // vaddps (%rsi), %ymm0, %ymm1
   24722   // Requires just 1 allocation, freeing an allocation slot for other
   24723   // operations and issuing fewer micro-operations.
   24724   //
   24725   // For some X86 architectures, this is even worse because for instance for
   24726   // stores, the complex addressing mode forces the instruction to use the
   24727   // "load" ports instead of the dedicated "store" port.
   24728   // E.g., on Haswell:
   24729   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
   24730   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
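           // Illustrative outcomes of the check below: a legal mode with
           // Scale == 0 (base register only) costs 0, a legal mode with
           // Scale != 0 (base + index*scale) costs 1, and an illegal addressing
           // mode returns -1.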
   24731   if (isLegalAddressingMode(AM, Ty))
   24732     // Scale represents reg2 * scale, thus account for 1
   24733     // as soon as we use a second register.
   24734     return AM.Scale != 0;
   24735   return -1;
   24736 }
   24737 
   24738 bool X86TargetLowering::isTargetFTOL() const {
   24739   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
   24740 }
   24741