Home | History | Annotate | Download | only in X86
      1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the interfaces that X86 uses to lower LLVM code into a
     11 // selection DAG.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "X86ISelLowering.h"
     16 #include "Utils/X86ShuffleDecode.h"
     17 #include "X86CallingConv.h"
     18 #include "X86FrameLowering.h"
     19 #include "X86InstrBuilder.h"
     20 #include "X86IntrinsicsInfo.h"
     21 #include "X86MachineFunctionInfo.h"
     22 #include "X86ShuffleDecodeConstantPool.h"
     23 #include "X86TargetMachine.h"
     24 #include "X86TargetObjectFile.h"
     25 #include "llvm/ADT/SmallBitVector.h"
     26 #include "llvm/ADT/SmallSet.h"
     27 #include "llvm/ADT/Statistic.h"
     28 #include "llvm/ADT/StringExtras.h"
     29 #include "llvm/ADT/StringSwitch.h"
     30 #include "llvm/Analysis/EHPersonalities.h"
     31 #include "llvm/CodeGen/IntrinsicLowering.h"
     32 #include "llvm/CodeGen/MachineFrameInfo.h"
     33 #include "llvm/CodeGen/MachineFunction.h"
     34 #include "llvm/CodeGen/MachineInstrBuilder.h"
     35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     36 #include "llvm/CodeGen/MachineModuleInfo.h"
     37 #include "llvm/CodeGen/MachineRegisterInfo.h"
     38 #include "llvm/CodeGen/TargetLowering.h"
     39 #include "llvm/CodeGen/WinEHFuncInfo.h"
     40 #include "llvm/IR/CallSite.h"
     41 #include "llvm/IR/CallingConv.h"
     42 #include "llvm/IR/Constants.h"
     43 #include "llvm/IR/DerivedTypes.h"
     44 #include "llvm/IR/DiagnosticInfo.h"
     45 #include "llvm/IR/Function.h"
     46 #include "llvm/IR/GlobalAlias.h"
     47 #include "llvm/IR/GlobalVariable.h"
     48 #include "llvm/IR/Instructions.h"
     49 #include "llvm/IR/Intrinsics.h"
     50 #include "llvm/MC/MCAsmInfo.h"
     51 #include "llvm/MC/MCContext.h"
     52 #include "llvm/MC/MCExpr.h"
     53 #include "llvm/MC/MCSymbol.h"
     54 #include "llvm/Support/CommandLine.h"
     55 #include "llvm/Support/Debug.h"
     56 #include "llvm/Support/ErrorHandling.h"
     57 #include "llvm/Support/KnownBits.h"
     58 #include "llvm/Support/MathExtras.h"
     59 #include "llvm/Target/TargetOptions.h"
     60 #include <algorithm>
     61 #include <bitset>
     62 #include <cctype>
     63 #include <numeric>
     64 using namespace llvm;
     65 
     66 #define DEBUG_TYPE "x86-isel"
     67 
     68 STATISTIC(NumTailCalls, "Number of tail calls");
     69 
     70 static cl::opt<bool> ExperimentalVectorWideningLegalization(
     71     "x86-experimental-vector-widening-legalization", cl::init(false),
     72     cl::desc("Enable an experimental vector type legalization through widening "
     73              "rather than promotion."),
     74     cl::Hidden);
     75 
     76 static cl::opt<int> ExperimentalPrefLoopAlignment(
     77     "x86-experimental-pref-loop-alignment", cl::init(4),
     78     cl::desc("Sets the preferable loop alignment for experiments "
     79              "(the last x86-experimental-pref-loop-alignment bits"
     80              " of the loop header PC will be 0)."),
     81     cl::Hidden);
     82 
     83 static cl::opt<bool> MulConstantOptimization(
     84     "mul-constant-optimization", cl::init(true),
     85     cl::desc("Replace 'mul x, Const' with more effective instructions like "
     86              "SHIFT, LEA, etc."),
     87     cl::Hidden);
     88 
     89 /// Call this when the user attempts to do something unsupported, like
     90 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
     91 /// report_fatal_error, so calling code should attempt to recover without
     92 /// crashing.
     93 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
     94                              const char *Msg) {
     95   MachineFunction &MF = DAG.getMachineFunction();
     96   DAG.getContext()->diagnose(
     97       DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
     98 }
     99 
    100 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    101                                      const X86Subtarget &STI)
    102     : TargetLowering(TM), Subtarget(STI) {
    103   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
    104   X86ScalarSSEf64 = Subtarget.hasSSE2();
    105   X86ScalarSSEf32 = Subtarget.hasSSE1();
    106   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
    107 
    108   // Set up the TargetLowering object.
    109 
    110   // X86 is weird. It always uses i8 for shift amounts and setcc results.
    111   setBooleanContents(ZeroOrOneBooleanContent);
    112   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    113   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
    114 
    115   // For 64-bit, since we have so many registers, use the ILP scheduler.
    116   // For 32-bit, use the register pressure specific scheduling.
    117   // For Atom, always use ILP scheduling.
    118   if (Subtarget.isAtom())
    119     setSchedulingPreference(Sched::ILP);
    120   else if (Subtarget.is64Bit())
    121     setSchedulingPreference(Sched::ILP);
    122   else
    123     setSchedulingPreference(Sched::RegPressure);
    124   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    125   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
    126 
    127   // Bypass expensive divides and use cheaper ones.
    128   if (TM.getOptLevel() >= CodeGenOpt::Default) {
    129     if (Subtarget.hasSlowDivide32())
    130       addBypassSlowDiv(32, 8);
    131     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
    132       addBypassSlowDiv(64, 32);
    133   }
    134 
    135   if (Subtarget.isTargetKnownWindowsMSVC() ||
    136       Subtarget.isTargetWindowsItanium()) {
    137     // Setup Windows compiler runtime calls.
    138     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    139     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    140     setLibcallName(RTLIB::SREM_I64, "_allrem");
    141     setLibcallName(RTLIB::UREM_I64, "_aullrem");
    142     setLibcallName(RTLIB::MUL_I64, "_allmul");
    143     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    144     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    145     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    146     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    147     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
    148   }
    149 
    150   if (Subtarget.isTargetDarwin()) {
    151     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    152     setUseUnderscoreSetJmp(false);
    153     setUseUnderscoreLongJmp(false);
    154   } else if (Subtarget.isTargetWindowsGNU()) {
    155     // MS runtime is weird: it exports _setjmp, but longjmp!
    156     setUseUnderscoreSetJmp(true);
    157     setUseUnderscoreLongJmp(false);
    158   } else {
    159     setUseUnderscoreSetJmp(true);
    160     setUseUnderscoreLongJmp(true);
    161   }
    162 
    163   // Set up the register classes.
    164   addRegisterClass(MVT::i8, &X86::GR8RegClass);
    165   addRegisterClass(MVT::i16, &X86::GR16RegClass);
    166   addRegisterClass(MVT::i32, &X86::GR32RegClass);
    167   if (Subtarget.is64Bit())
    168     addRegisterClass(MVT::i64, &X86::GR64RegClass);
    169 
    170   for (MVT VT : MVT::integer_valuetypes())
    171     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    172 
    173   // We don't accept any truncstore of integer registers.
    174   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    175   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    176   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
    177   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
    178   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
    179   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
    180 
    181   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    182 
    183   // SETOEQ and SETUNE require checking two conditions.
    184   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
    185   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
    186   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
    187   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
    188   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
    189   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
    190 
    191   // Integer absolute.
    192   if (Subtarget.hasCMov()) {
    193     setOperationAction(ISD::ABS            , MVT::i16  , Custom);
    194     setOperationAction(ISD::ABS            , MVT::i32  , Custom);
    195     if (Subtarget.is64Bit())
    196       setOperationAction(ISD::ABS          , MVT::i64  , Custom);
    197   }
    198 
    199   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    200   // operation.
    201   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
    202   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
    203   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
    204 
    205   if (Subtarget.is64Bit()) {
    206     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
    207       // f32/f64 are legal, f80 is custom.
    208       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
    209     else
    210       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
    211     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    212   } else if (!Subtarget.useSoftFloat()) {
    213     // We have an algorithm for SSE2->double, and we turn this into a
    214     // 64-bit FILD followed by conditional FADD for other targets.
    215     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    216     // We have an algorithm for SSE2, and we turn this into a 64-bit
    217     // FILD or VCVTUSI2SS/SD for other targets.
    218     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
    219   } else {
    220     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Expand);
    221   }
    222 
    223   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    224   // this operation.
    225   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
    226   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
    227 
    228   if (!Subtarget.useSoftFloat()) {
    229     // SSE has no i16 to fp conversion, only i32.
    230     if (X86ScalarSSEf32) {
    231       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    232       // f32 and f64 cases are Legal, f80 case is not
    233       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    234     } else {
    235       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
    236       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
    237     }
    238   } else {
    239     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    240     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Expand);
    241   }
    242 
    243   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
    244   // this operation.
    245   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
    246   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
    247 
    248   if (!Subtarget.useSoftFloat()) {
    249     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
    250     // are Legal, f80 is custom lowered.
    251     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
    252     setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
    253 
    254     if (X86ScalarSSEf32) {
    255       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    256       // f32 and f64 cases are Legal, f80 case is not
    257       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    258     } else {
    259       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    260       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
    261     }
    262   } else {
    263     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    264     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
    265     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
    266   }
    267 
    268   // Handle FP_TO_UINT by promoting the destination to a larger signed
    269   // conversion.
    270   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
    271   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
    272   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
    273 
    274   if (Subtarget.is64Bit()) {
    275     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    276       // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
    277       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    278       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
    279     } else {
    280       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
    281       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
    282     }
    283   } else if (!Subtarget.useSoftFloat()) {
    284     // Since AVX is a superset of SSE3, only check for SSE here.
    285     if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
    286       // Expand FP_TO_UINT into a select.
    287       // FIXME: We would like to use a Custom expander here eventually to do
    288       // the optimal thing for SSE vs. the default expansion in the legalizer.
    289       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    290     else
    291       // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
    292       // With SSE3 we can use fisttpll to convert to a signed i64; without
    293       // SSE, we're stuck with a fistpll.
    294       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
    295 
    296     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
    297   }
    298 
    299   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
    300   if (!X86ScalarSSEf64) {
    301     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    302     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    303     if (Subtarget.is64Bit()) {
    304       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
    305       // Without SSE, i64->f64 goes through memory.
    306       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    307     }
    308   } else if (!Subtarget.is64Bit())
    309     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
    310 
    311   // Scalar integer divide and remainder are lowered to use operations that
    312   // produce two results, to match the available instructions. This exposes
    313   // the two-result form to trivial CSE, which is able to combine x/y and x%y
    314   // into a single instruction.
    315   //
    316   // Scalar integer multiply-high is also lowered to use two-result
    317   // operations, to match the available instructions. However, plain multiply
    318   // (low) operations are left as Legal, as there are single-result
    319   // instructions for this in x86. Using the two-result multiply instructions
    320   // when both high and low results are needed must be arranged by dagcombine.
    321   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    322     setOperationAction(ISD::MULHS, VT, Expand);
    323     setOperationAction(ISD::MULHU, VT, Expand);
    324     setOperationAction(ISD::SDIV, VT, Expand);
    325     setOperationAction(ISD::UDIV, VT, Expand);
    326     setOperationAction(ISD::SREM, VT, Expand);
    327     setOperationAction(ISD::UREM, VT, Expand);
    328   }
    329 
    330   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
    331   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
    332   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
    333                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
    334     setOperationAction(ISD::BR_CC,     VT, Expand);
    335     setOperationAction(ISD::SELECT_CC, VT, Expand);
    336   }
    337   if (Subtarget.is64Bit())
    338     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    339   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
    340   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
    341   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
    342   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
    343 
    344   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
    345   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
    346   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    347   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
    348 
    349   // Promote the i8 variants and force them on up to i32 which has a shorter
    350   // encoding.
    351   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
    352   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    353   if (!Subtarget.hasBMI()) {
    354     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    355     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    356     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
    357     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
    358     if (Subtarget.is64Bit()) {
    359       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
    360       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    361     }
    362   }
    363 
    364   if (Subtarget.hasLZCNT()) {
    365     // When promoting the i8 variants, force them to i32 for a shorter
    366     // encoding.
    367     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
    368     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    369   } else {
    370     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    371     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    372     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    373     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    374     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    375     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    376     if (Subtarget.is64Bit()) {
    377       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
    378       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    379     }
    380   }
    381 
    382   // Special handling for half-precision floating point conversions.
    383   // If we don't have F16C support, then lower half float conversions
    384   // into library calls.
    385   if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    386     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    387     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    388   }
    389 
    390   // There's never any support for operations beyond MVT::f32.
    391   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
    392   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
    393   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    394   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
    395 
    396   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
    397   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
    398   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
    399   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
    400   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
    401   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
    402 
    403   if (Subtarget.hasPOPCNT()) {
    404     setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
    405   } else {
    406     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    407     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    408     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    409     if (Subtarget.is64Bit())
    410       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    411   }
    412 
    413   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
    414 
    415   if (!Subtarget.hasMOVBE())
    416     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
    417 
    418   // These should be promoted to a larger select which is supported.
    419   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
    420   // X86 wants to expand cmov itself.
    421   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    422     setOperationAction(ISD::SELECT, VT, Custom);
    423     setOperationAction(ISD::SETCC, VT, Custom);
    424   }
    425   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    426     if (VT == MVT::i64 && !Subtarget.is64Bit())
    427       continue;
    428     setOperationAction(ISD::SELECT, VT, Custom);
    429     setOperationAction(ISD::SETCC,  VT, Custom);
    430   }
    431 
    432   // Custom action for SELECT MMX and expand action for SELECT_CC MMX
    433   setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
    434   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
    435 
    436   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
    437   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
    438   // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
    439   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
    440   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
    441   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
    442   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    443     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
    444 
    445   // Darwin ABI issue.
    446   for (auto VT : { MVT::i32, MVT::i64 }) {
    447     if (VT == MVT::i64 && !Subtarget.is64Bit())
    448       continue;
    449     setOperationAction(ISD::ConstantPool    , VT, Custom);
    450     setOperationAction(ISD::JumpTable       , VT, Custom);
    451     setOperationAction(ISD::GlobalAddress   , VT, Custom);
    452     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    453     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
    454     setOperationAction(ISD::BlockAddress    , VT, Custom);
    455   }
    456 
    457   // 64-bit shl, sra, srl (iff 32-bit x86)
    458   for (auto VT : { MVT::i32, MVT::i64 }) {
    459     if (VT == MVT::i64 && !Subtarget.is64Bit())
    460       continue;
    461     setOperationAction(ISD::SHL_PARTS, VT, Custom);
    462     setOperationAction(ISD::SRA_PARTS, VT, Custom);
    463     setOperationAction(ISD::SRL_PARTS, VT, Custom);
    464   }
    465 
    466   if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    467     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
    468 
    469   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
    470 
    471   // Expand certain atomics
    472   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    473     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    474     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    475     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    476     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    477     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    478     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    479     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    480   }
    481 
    482   if (Subtarget.hasCmpxchg16b()) {
    483     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
    484   }
    485 
    486   // FIXME - use subtarget debug flags
    487   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
    488       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
    489       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    490     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
    491   }
    492 
    493   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
    494   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
    495 
    496   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
    497   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
    498 
    499   setOperationAction(ISD::TRAP, MVT::Other, Legal);
    500   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
    501 
    502   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    503   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
    504   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
    505   bool Is64Bit = Subtarget.is64Bit();
    506   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
    507   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
    508 
    509   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    510   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
    511 
    512   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
    513 
    514   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
    515   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
    516   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
    517 
    518   if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    519     // f32 and f64 use SSE.
    520     // Set up the FP register classes.
    521     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
    522                                                      : &X86::FR32RegClass);
    523     addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
    524                                                      : &X86::FR64RegClass);
    525 
    526     for (auto VT : { MVT::f32, MVT::f64 }) {
    527       // Use ANDPD to simulate FABS.
    528       setOperationAction(ISD::FABS, VT, Custom);
    529 
    530       // Use XORP to simulate FNEG.
    531       setOperationAction(ISD::FNEG, VT, Custom);
    532 
    533       // Use ANDPD and ORPD to simulate FCOPYSIGN.
    534       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    535 
    536       // We don't support sin/cos/fmod
    537       setOperationAction(ISD::FSIN   , VT, Expand);
    538       setOperationAction(ISD::FCOS   , VT, Expand);
    539       setOperationAction(ISD::FSINCOS, VT, Expand);
    540     }
    541 
    542     // Lower this to MOVMSK plus an AND.
    543     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    544     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    545 
    546     // Expand FP immediates into loads from the stack, except for the special
    547     // cases we handle.
    548     addLegalFPImmediate(APFloat(+0.0)); // xorpd
    549     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    550   } else if (UseX87 && X86ScalarSSEf32) {
    551     // Use SSE for f32, x87 for f64.
    552     // Set up the FP register classes.
    553     addRegisterClass(MVT::f32, &X86::FR32RegClass);
    554     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    555 
    556     // Use ANDPS to simulate FABS.
    557     setOperationAction(ISD::FABS , MVT::f32, Custom);
    558 
    559     // Use XORP to simulate FNEG.
    560     setOperationAction(ISD::FNEG , MVT::f32, Custom);
    561 
    562     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    563 
    564     // Use ANDPS and ORPS to simulate FCOPYSIGN.
    565     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    566     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
    567 
    568     // We don't support sin/cos/fmod
    569     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    570     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    571     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    572 
    573     // Special cases we handle for FP constants.
    574     addLegalFPImmediate(APFloat(+0.0f)); // xorps
    575     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    576     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    577     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    578     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    579 
    580     // Always expand sin/cos functions even though x87 has an instruction.
    581     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    582     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    583     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    584   } else if (UseX87) {
    585     // f32 and f64 in x87.
    586     // Set up the FP register classes.
    587     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    588     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
    589 
    590     for (auto VT : { MVT::f32, MVT::f64 }) {
    591       setOperationAction(ISD::UNDEF,     VT, Expand);
    592       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    593 
    594       // Always expand sin/cos functions even though x87 has an instruction.
    595       setOperationAction(ISD::FSIN   , VT, Expand);
    596       setOperationAction(ISD::FCOS   , VT, Expand);
    597       setOperationAction(ISD::FSINCOS, VT, Expand);
    598     }
    599     addLegalFPImmediate(APFloat(+0.0)); // FLD0
    600     addLegalFPImmediate(APFloat(+1.0)); // FLD1
    601     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    602     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    603     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    604     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    605     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    606     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    607   }
    608 
    609   // We don't support FMA.
    610   setOperationAction(ISD::FMA, MVT::f64, Expand);
    611   setOperationAction(ISD::FMA, MVT::f32, Expand);
    612 
    613   // Long double always uses X87, except f128 in MMX.
    614   if (UseX87) {
    615     if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
    616       addRegisterClass(MVT::f128, &X86::VR128RegClass);
    617       ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
    618       setOperationAction(ISD::FABS , MVT::f128, Custom);
    619       setOperationAction(ISD::FNEG , MVT::f128, Custom);
    620       setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    621     }
    622 
    623     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    624     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    625     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    626     {
    627       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
    628       addLegalFPImmediate(TmpFlt);  // FLD0
    629       TmpFlt.changeSign();
    630       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
    631 
    632       bool ignored;
    633       APFloat TmpFlt2(+1.0);
    634       TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
    635                       &ignored);
    636       addLegalFPImmediate(TmpFlt2);  // FLD1
    637       TmpFlt2.changeSign();
    638       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    639     }
    640 
    641     // Always expand sin/cos functions even though x87 has an instruction.
    642     setOperationAction(ISD::FSIN   , MVT::f80, Expand);
    643     setOperationAction(ISD::FCOS   , MVT::f80, Expand);
    644     setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    645 
    646     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    647     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    648     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    649     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    650     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    651     setOperationAction(ISD::FMA, MVT::f80, Expand);
    652   }
    653 
    654   // Always use a library call for pow.
    655   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
    656   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    657   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
    658 
    659   setOperationAction(ISD::FLOG, MVT::f80, Expand);
    660   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
    661   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
    662   setOperationAction(ISD::FEXP, MVT::f80, Expand);
    663   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
    664   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
    665   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
    666 
    667   // Some FP actions are always expanded for vector types.
    668   for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
    669                    MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    670     setOperationAction(ISD::FSIN,      VT, Expand);
    671     setOperationAction(ISD::FSINCOS,   VT, Expand);
    672     setOperationAction(ISD::FCOS,      VT, Expand);
    673     setOperationAction(ISD::FREM,      VT, Expand);
    674     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    675     setOperationAction(ISD::FPOW,      VT, Expand);
    676     setOperationAction(ISD::FLOG,      VT, Expand);
    677     setOperationAction(ISD::FLOG2,     VT, Expand);
    678     setOperationAction(ISD::FLOG10,    VT, Expand);
    679     setOperationAction(ISD::FEXP,      VT, Expand);
    680     setOperationAction(ISD::FEXP2,     VT, Expand);
    681   }
    682 
    683   // First set operation action for all vector types to either promote
    684   // (for widening) or expand (for scalarization). Then we will selectively
    685   // turn on ones that can be effectively codegen'd.
    686   for (MVT VT : MVT::vector_valuetypes()) {
    687     setOperationAction(ISD::SDIV, VT, Expand);
    688     setOperationAction(ISD::UDIV, VT, Expand);
    689     setOperationAction(ISD::SREM, VT, Expand);
    690     setOperationAction(ISD::UREM, VT, Expand);
    691     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
    692     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    693     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
    694     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
    695     setOperationAction(ISD::FMA,  VT, Expand);
    696     setOperationAction(ISD::FFLOOR, VT, Expand);
    697     setOperationAction(ISD::FCEIL, VT, Expand);
    698     setOperationAction(ISD::FTRUNC, VT, Expand);
    699     setOperationAction(ISD::FRINT, VT, Expand);
    700     setOperationAction(ISD::FNEARBYINT, VT, Expand);
    701     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    702     setOperationAction(ISD::MULHS, VT, Expand);
    703     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    704     setOperationAction(ISD::MULHU, VT, Expand);
    705     setOperationAction(ISD::SDIVREM, VT, Expand);
    706     setOperationAction(ISD::UDIVREM, VT, Expand);
    707     setOperationAction(ISD::CTPOP, VT, Expand);
    708     setOperationAction(ISD::CTTZ, VT, Expand);
    709     setOperationAction(ISD::CTLZ, VT, Expand);
    710     setOperationAction(ISD::ROTL, VT, Expand);
    711     setOperationAction(ISD::ROTR, VT, Expand);
    712     setOperationAction(ISD::BSWAP, VT, Expand);
    713     setOperationAction(ISD::SETCC, VT, Expand);
    714     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    715     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    716     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    717     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    718     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
    719     setOperationAction(ISD::TRUNCATE, VT, Expand);
    720     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    721     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    722     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    723     setOperationAction(ISD::SELECT_CC, VT, Expand);
    724     for (MVT InnerVT : MVT::vector_valuetypes()) {
    725       setTruncStoreAction(InnerVT, VT, Expand);
    726 
    727       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
    728       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
    729 
    730       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
    731       // types, we have to deal with them whether we ask for Expansion or not.
    732       // Setting Expand causes its own optimisation problems though, so leave
    733       // them legal.
    734       if (VT.getVectorElementType() == MVT::i1)
    735         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    736 
    737       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
    738       // split/scalarized right now.
    739       if (VT.getVectorElementType() == MVT::f16)
    740         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    741     }
    742   }
    743 
    744   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
    745   // with -msoft-float, disable use of MMX as well.
    746   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    747     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    748     // No operations on x86mmx supported, everything uses intrinsics.
    749   }
    750 
    751   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    752     addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
    753                                                     : &X86::VR128RegClass);
    754 
    755     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    756     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    757     setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
    758     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    759     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    760     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    761     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    762     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    763     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
    764   }
    765 
    766   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    767     addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
    768                                                     : &X86::VR128RegClass);
    769 
    770     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    771     // registers cannot be used even for integer operations.
    772     addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
    773                                                     : &X86::VR128RegClass);
    774     addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
    775                                                     : &X86::VR128RegClass);
    776     addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
    777                                                     : &X86::VR128RegClass);
    778     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
    779                                                     : &X86::VR128RegClass);
    780 
    781     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
    782     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    783     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    784     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
    785     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
    786     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
    787     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
    788     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
    789     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
    790     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    791     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    792     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
    793     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
    794 
    795     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    796       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
    797       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
    798       setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
    799       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    800     }
    801 
    802     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    803     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    804     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
    805 
    806     // Provide custom widening for v2f32 setcc. This is really for VLX when
    807     // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
    808     // type legalization changing the result type to v4i1 during widening.
    809     // It works fine for SSE2 and is probably faster so no need to qualify with
    810     // VLX support.
    811     setOperationAction(ISD::SETCC,               MVT::v2i32, Custom);
    812 
    813     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    814       setOperationAction(ISD::SETCC,              VT, Custom);
    815       setOperationAction(ISD::CTPOP,              VT, Custom);
    816       setOperationAction(ISD::CTTZ,               VT, Custom);
    817 
    818       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
    819       // setcc all the way to isel and prefer SETGT in some isel patterns.
    820       setCondCodeAction(ISD::SETLT, VT, Custom);
    821       setCondCodeAction(ISD::SETLE, VT, Custom);
    822     }
    823 
    824     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
    825       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
    826       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
    827       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
    828       setOperationAction(ISD::VSELECT,            VT, Custom);
    829       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    830     }
    831 
    832     // We support custom legalizing of sext and anyext loads for specific
    833     // memory vector types which we can load as a scalar (or sequence of
    834     // scalars) and extend in-register to a legal 128-bit vector type. For sext
    835     // loads these must work with a single scalar load.
    836     for (MVT VT : MVT::integer_vector_valuetypes()) {
    837       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
    838       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
    839       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
    840       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
    841       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
    842       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
    843       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
    844       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
    845       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    846     }
    847 
    848     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
    849       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
    850       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
    851       setOperationAction(ISD::VSELECT,            VT, Custom);
    852 
    853       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
    854         continue;
    855 
    856       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
    857       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    858     }
    859 
    860     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    861     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
    862       setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
    863       setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
    864       setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
    865       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
    866       setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    867     }
    868 
    869     // Custom lower v2i64 and v2f64 selects.
    870     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    871     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
    872 
    873     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    874     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
    875 
    876     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    877     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
    878 
    879     setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
    880 
    881     // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    882     setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
    883 
    884     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
    885     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
    886 
    887     for (MVT VT : MVT::fp_vector_valuetypes())
    888       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
    889 
    890     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
    891     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
    892     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
    893     if (!Subtarget.hasAVX512())
    894       setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
    895 
    896     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    897     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    898     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
    899 
    900     // In the customized shift lowering, the legal v4i32/v2i64 cases
    901     // in AVX2 will be recognized.
    902     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    903       setOperationAction(ISD::SRL,              VT, Custom);
    904       setOperationAction(ISD::SHL,              VT, Custom);
    905       setOperationAction(ISD::SRA,              VT, Custom);
    906     }
    907 
    908     setOperationAction(ISD::ROTL,               MVT::v4i32, Custom);
    909     setOperationAction(ISD::ROTL,               MVT::v8i16, Custom);
    910     setOperationAction(ISD::ROTL,               MVT::v16i8, Custom);
    911   }
    912 
    913   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    914     setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
    915     setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
    916     setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
    917     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
    918     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
    919     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
    920     setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
    921     setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
    922   }
    923 
    924   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    925     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
    926       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
    927       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
    928       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
    929       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
    930       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
    931     }
    932 
    933     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
    934     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
    935     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
    936     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
    937     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
    938     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
    939     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
    940     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
    941 
    942     // FIXME: Do we need to handle scalar-to-vector here?
    943     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
    944 
    945     // We directly match byte blends in the backend as they match the VSELECT
    946     // condition form.
    947     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
    948 
    949     // SSE41 brings specific instructions for doing vector sign extend even in
    950     // cases where we don't have SRA.
    951     for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    952       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
    953       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    954     }
    955 
    956     for (MVT VT : MVT::integer_vector_valuetypes()) {
    957       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
    958       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
    959       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    960     }
    961 
    962     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    963     for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
    964       setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
    965       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
    966       setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8,  Legal);
    967       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
    968       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
    969       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
    970       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    971     }
    972 
    973     // i8 vectors are custom because the source register and source
    974     // source memory operand types are not the same width.
    975     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    976   }
    977 
    978   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    979     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
    980                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
    981       setOperationAction(ISD::ROTL, VT, Custom);
    982 
    983     // XOP can efficiently perform BITREVERSE with VPPERM.
    984     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
    985       setOperationAction(ISD::BITREVERSE, VT, Custom);
    986 
    987     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
    988                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
    989       setOperationAction(ISD::BITREVERSE, VT, Custom);
    990   }
    991 
    992   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
    993     bool HasInt256 = Subtarget.hasInt256();
    994 
    995     addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
    996                                                      : &X86::VR256RegClass);
    997     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
    998                                                      : &X86::VR256RegClass);
    999     addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
   1000                                                      : &X86::VR256RegClass);
   1001     addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
   1002                                                      : &X86::VR256RegClass);
   1003     addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
   1004                                                      : &X86::VR256RegClass);
   1005     addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
   1006                                                      : &X86::VR256RegClass);
   1007 
   1008     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
   1009       setOperationAction(ISD::FFLOOR,     VT, Legal);
   1010       setOperationAction(ISD::FCEIL,      VT, Legal);
   1011       setOperationAction(ISD::FTRUNC,     VT, Legal);
   1012       setOperationAction(ISD::FRINT,      VT, Legal);
   1013       setOperationAction(ISD::FNEARBYINT, VT, Legal);
   1014       setOperationAction(ISD::FNEG,       VT, Custom);
   1015       setOperationAction(ISD::FABS,       VT, Custom);
   1016       setOperationAction(ISD::FCOPYSIGN,  VT, Custom);
   1017     }
   1018 
   1019     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
   1020     // even though v8i16 is a legal type.
   1021     setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
   1022     setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
   1023     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
   1024 
   1025     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
   1026     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
   1027 
   1028     if (!Subtarget.hasAVX512())
   1029       setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
   1030 
   1031     for (MVT VT : MVT::fp_vector_valuetypes())
   1032       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
   1033 
   1034     // In the customized shift lowering, the legal v8i32/v4i64 cases
   1035     // in AVX2 will be recognized.
   1036     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
   1037       setOperationAction(ISD::SRL, VT, Custom);
   1038       setOperationAction(ISD::SHL, VT, Custom);
   1039       setOperationAction(ISD::SRA, VT, Custom);
   1040     }
   1041 
   1042     setOperationAction(ISD::ROTL,              MVT::v8i32,  Custom);
   1043     setOperationAction(ISD::ROTL,              MVT::v16i16, Custom);
   1044     setOperationAction(ISD::ROTL,              MVT::v32i8,  Custom);
   1045 
   1046     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
   1047     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
   1048     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
   1049 
   1050     for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
   1051       setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
   1052       setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
   1053       setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
   1054     }
   1055 
   1056     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
   1057     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
   1058     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
   1059     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
   1060 
   1061     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
   1062       setOperationAction(ISD::SETCC,           VT, Custom);
   1063       setOperationAction(ISD::CTPOP,           VT, Custom);
   1064       setOperationAction(ISD::CTTZ,            VT, Custom);
   1065       setOperationAction(ISD::CTLZ,            VT, Custom);
   1066 
    1067       // These condition codes aren't legal in SSE/AVX, and under AVX512 we
    1068       // use setcc all the way to isel and prefer SETGT in some isel patterns.
   1069       setCondCodeAction(ISD::SETLT, VT, Custom);
   1070       setCondCodeAction(ISD::SETLE, VT, Custom);
   1071     }
   1072 
   1073     if (Subtarget.hasAnyFMA()) {
   1074       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
   1075                        MVT::v2f64, MVT::v4f64 })
   1076         setOperationAction(ISD::FMA, VT, Legal);
   1077     }
   1078 
   1079     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
   1080       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
   1081       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
   1082     }
   1083 
   1084     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
   1085     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
   1086     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
   1087     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
   1088 
   1089     setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
   1090     setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
   1091 
   1092     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
   1093     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
   1094     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
   1095     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
   1096 
   1097     setOperationAction(ISD::SMAX,      MVT::v4i64,  Custom);
   1098     setOperationAction(ISD::UMAX,      MVT::v4i64,  Custom);
   1099     setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
   1100     setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
   1101 
   1102     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
   1103       setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
   1104       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
   1105       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
   1106       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
   1107       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
   1108     }
   1109 
   1110     if (HasInt256) {
   1111       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
   1112       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
   1113       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
   1114 
    1115       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
    1116       // when we have a 256-bit-wide blend with immediate.
   1117       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
   1118 
   1119       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
   1120       for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
   1121         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
   1122         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
   1123         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
   1124         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
   1125         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
   1126         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
   1127       }
   1128     }
   1129 
   1130     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
   1131                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
   1132       setOperationAction(ISD::MLOAD,  VT, Legal);
   1133       setOperationAction(ISD::MSTORE, VT, Legal);
   1134     }
   1135 
   1136     // Extract subvector is special because the value type
   1137     // (result) is 128-bit but the source is 256-bit wide.
   1138     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
   1139                      MVT::v4f32, MVT::v2f64 }) {
   1140       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
   1141     }
   1142 
   1143     // Custom lower several nodes for 256-bit types.
   1144     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
   1145                     MVT::v8f32, MVT::v4f64 }) {
   1146       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
   1147       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
   1148       setOperationAction(ISD::VSELECT,            VT, Custom);
   1149       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
   1150       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1151       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
   1152       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
   1153       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
   1154     }
   1155 
   1156     if (HasInt256)
   1157       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
   1158 
    1159     // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
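             // A rough sketch of what this promotion means (operand names are
             // made up): (v8i32 (and %a, %b)) becomes
             //   (v8i32 (bitcast (and (v4i64 (bitcast %a)), (v4i64 (bitcast %b)))))
             // i.e. the bitwise op is done on v4i64 and bitcast back, which is
             // free since the bit pattern is unchanged.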
   1160     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
   1161       setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
   1162       setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
   1163       setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
   1164       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
   1165       setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
   1166     }
   1167 
   1168     if (HasInt256) {
   1169       // Custom legalize 2x32 to get a little better code.
   1170       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
   1171       setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
   1172 
   1173       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
   1174                        MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
   1175         setOperationAction(ISD::MGATHER,  VT, Custom);
   1176     }
   1177   }
   1178 
   1179   // This block controls legalization of the mask vector sizes that are
   1180   // available with AVX512. 512-bit vectors are in a separate block controlled
   1181   // by useAVX512Regs.
   1182   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
   1183     addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
   1184     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
   1185     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
   1186     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
   1187     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
   1188 
   1189     setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
   1190     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
   1191     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
   1192 
   1193     setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1,  MVT::v8i32);
   1194     setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1,  MVT::v8i32);
   1195     setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1,  MVT::v4i32);
   1196     setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1,  MVT::v4i32);
   1197     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i1,  Custom);
   1198     setOperationAction(ISD::FP_TO_UINT,         MVT::v2i1,  Custom);
   1199 
    1200     // There is no byte-sized k-register load or store without AVX512DQ.
   1201     if (!Subtarget.hasDQI()) {
   1202       setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
   1203       setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
   1204       setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
   1205       setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
   1206 
   1207       setOperationAction(ISD::STORE, MVT::v1i1, Custom);
   1208       setOperationAction(ISD::STORE, MVT::v2i1, Custom);
   1209       setOperationAction(ISD::STORE, MVT::v4i1, Custom);
   1210       setOperationAction(ISD::STORE, MVT::v8i1, Custom);
   1211     }
   1212 
   1213     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
   1214     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
   1215       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
   1216       setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
   1217       setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
   1218     }
   1219 
   1220     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
   1221       setOperationAction(ISD::ADD,              VT, Custom);
   1222       setOperationAction(ISD::SUB,              VT, Custom);
   1223       setOperationAction(ISD::MUL,              VT, Custom);
   1224       setOperationAction(ISD::SETCC,            VT, Custom);
   1225       setOperationAction(ISD::SELECT,           VT, Custom);
   1226       setOperationAction(ISD::TRUNCATE,         VT, Custom);
   1227 
   1228       setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
   1229       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1230       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
   1231       setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
   1232       setOperationAction(ISD::VSELECT,          VT,  Expand);
   1233     }
   1234 
   1235     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Custom);
   1236     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,  Custom);
   1237     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1,  Custom);
   1238     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v2i1,  Custom);
   1239     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1,  Custom);
   1240     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1,  Custom);
   1241     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
   1242     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
   1243       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1244   }
   1245 
   1246   // This block controls legalization for 512-bit operations with 32/64 bit
   1247   // elements. 512-bits can be disabled based on prefer-vector-width and
   1248   // required-vector-width function attributes.
   1249   if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
   1250     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
   1251     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
   1252     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
   1253     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
   1254 
   1255     for (MVT VT : MVT::fp_vector_valuetypes())
   1256       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
   1257 
   1258     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
   1259       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
   1260       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
   1261       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
   1262       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
   1263       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
   1264     }
   1265 
   1266     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
   1267       setOperationAction(ISD::FNEG,  VT, Custom);
   1268       setOperationAction(ISD::FABS,  VT, Custom);
   1269       setOperationAction(ISD::FMA,   VT, Legal);
   1270       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
   1271     }
   1272 
   1273     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
   1274     setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
   1275     setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
   1276     setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
   1277     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
   1278     setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
   1279     setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
   1280     setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
   1281     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
   1282     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
   1283 
   1284     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
   1285     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
   1286     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
   1287     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
   1288     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
   1289 
   1290     if (!Subtarget.hasVLX()) {
   1291       // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
   1292       // to 512-bit rather than use the AVX2 instructions so that we can use
   1293       // k-masks.
   1294       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
   1295            MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
   1296         setOperationAction(ISD::MLOAD,  VT, Custom);
   1297         setOperationAction(ISD::MSTORE, VT, Custom);
   1298       }
   1299     }
   1300 
   1301     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
   1302     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
   1303     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
   1304     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
   1305     setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
   1306     setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
   1307     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
   1308     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
   1309 
   1310     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
   1311       setOperationAction(ISD::FFLOOR,           VT, Legal);
   1312       setOperationAction(ISD::FCEIL,            VT, Legal);
   1313       setOperationAction(ISD::FTRUNC,           VT, Legal);
   1314       setOperationAction(ISD::FRINT,            VT, Legal);
   1315       setOperationAction(ISD::FNEARBYINT,       VT, Legal);
   1316     }
   1317 
   1318     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
   1319     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
   1320 
   1321     // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
   1322     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
   1323     setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
   1324 
   1325     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
   1326     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
   1327     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
   1328     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
   1329 
   1330     setOperationAction(ISD::MUL,                MVT::v8i64, Custom);
   1331     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
   1332 
   1333     setOperationAction(ISD::UMUL_LOHI,          MVT::v16i32,  Custom);
   1334     setOperationAction(ISD::SMUL_LOHI,          MVT::v16i32,  Custom);
   1335 
   1336     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
   1337     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
   1338     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
   1339 
   1340     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
   1341       setOperationAction(ISD::SMAX,             VT, Legal);
   1342       setOperationAction(ISD::UMAX,             VT, Legal);
   1343       setOperationAction(ISD::SMIN,             VT, Legal);
   1344       setOperationAction(ISD::UMIN,             VT, Legal);
   1345       setOperationAction(ISD::ABS,              VT, Legal);
   1346       setOperationAction(ISD::SRL,              VT, Custom);
   1347       setOperationAction(ISD::SHL,              VT, Custom);
   1348       setOperationAction(ISD::SRA,              VT, Custom);
   1349       setOperationAction(ISD::CTPOP,            VT, Custom);
   1350       setOperationAction(ISD::CTTZ,             VT, Custom);
   1351       setOperationAction(ISD::ROTL,             VT, Custom);
   1352       setOperationAction(ISD::ROTR,             VT, Custom);
   1353       setOperationAction(ISD::SETCC,            VT, Custom);
   1354 
    1355       // These condition codes aren't legal in SSE/AVX, and under AVX512 we
    1356       // use setcc all the way to isel and prefer SETGT in some isel patterns.
   1357       setCondCodeAction(ISD::SETLT, VT, Custom);
   1358       setCondCodeAction(ISD::SETLE, VT, Custom);
   1359     }
   1360 
   1361     // Need to promote to 64-bit even though we have 32-bit masked instructions
   1362     // because the IR optimizers rearrange bitcasts around logic ops leaving
   1363     // too many variations to handle if we don't promote them.
   1364     setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
   1365     setOperationPromotedToType(ISD::OR,  MVT::v16i32, MVT::v8i64);
   1366     setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
   1367 
   1368     if (Subtarget.hasDQI()) {
   1369       setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
   1370       setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
   1371       setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
   1372       setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
   1373 
   1374       setOperationAction(ISD::MUL,        MVT::v8i64, Legal);
   1375     }
   1376 
   1377     if (Subtarget.hasCDI()) {
    1378       // Non-VLX subtargets extend 128/256 vectors to use the 512-bit version.
   1379       for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
   1380         setOperationAction(ISD::CTLZ,            VT, Legal);
   1381         setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
   1382       }
   1383     } // Subtarget.hasCDI()
   1384 
   1385     if (Subtarget.hasVPOPCNTDQ()) {
   1386       for (auto VT : { MVT::v16i32, MVT::v8i64 })
   1387         setOperationAction(ISD::CTPOP, VT, Legal);
   1388     }
   1389 
   1390     // Extract subvector is special because the value type
   1391     // (result) is 256-bit but the source is 512-bit wide.
   1392     // 128-bit was made Legal under AVX1.
   1393     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
   1394                      MVT::v8f32, MVT::v4f64 })
   1395       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
   1396 
   1397     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
   1398       setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
   1399       setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
   1400       setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
   1401       setOperationAction(ISD::VSELECT,             VT, Custom);
   1402       setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
   1403       setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
   1404       setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Legal);
   1405       setOperationAction(ISD::MLOAD,               VT, Legal);
   1406       setOperationAction(ISD::MSTORE,              VT, Legal);
   1407       setOperationAction(ISD::MGATHER,             VT, Custom);
   1408       setOperationAction(ISD::MSCATTER,            VT, Custom);
   1409     }
   1410     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
   1411       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v8i64);
   1412       setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
   1413     }
   1414 
   1415     // Need to custom split v32i16/v64i8 bitcasts.
   1416     if (!Subtarget.hasBWI()) {
   1417       setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
   1418       setOperationAction(ISD::BITCAST, MVT::v64i8,  Custom);
   1419     }
    1420   } // has AVX-512
   1421 
   1422   // This block controls legalization for operations that don't have
   1423   // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
   1424   // narrower widths.
   1425   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
   1426     // These operations are handled on non-VLX by artificially widening in
   1427     // isel patterns.
   1428     // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
   1429 
   1430     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
   1431     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
   1432     setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
   1433     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
   1434     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
   1435 
   1436     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
   1437       setOperationAction(ISD::SMAX, VT, Legal);
   1438       setOperationAction(ISD::UMAX, VT, Legal);
   1439       setOperationAction(ISD::SMIN, VT, Legal);
   1440       setOperationAction(ISD::UMIN, VT, Legal);
   1441       setOperationAction(ISD::ABS,  VT, Legal);
   1442     }
   1443 
   1444     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
   1445       setOperationAction(ISD::ROTL,     VT, Custom);
   1446       setOperationAction(ISD::ROTR,     VT, Custom);
   1447     }
   1448 
   1449     // Custom legalize 2x32 to get a little better code.
   1450     setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
   1451     setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
   1452 
   1453     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
   1454                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
   1455       setOperationAction(ISD::MSCATTER, VT, Custom);
   1456 
   1457     if (Subtarget.hasDQI()) {
   1458       for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
   1459         setOperationAction(ISD::SINT_TO_FP,     VT, Legal);
   1460         setOperationAction(ISD::UINT_TO_FP,     VT, Legal);
   1461         setOperationAction(ISD::FP_TO_SINT,     VT, Legal);
   1462         setOperationAction(ISD::FP_TO_UINT,     VT, Legal);
   1463 
   1464         setOperationAction(ISD::MUL,            VT, Legal);
   1465       }
   1466     }
   1467 
   1468     if (Subtarget.hasCDI()) {
   1469       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
   1470         setOperationAction(ISD::CTLZ,            VT, Legal);
   1471         setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
   1472       }
   1473     } // Subtarget.hasCDI()
   1474 
   1475     if (Subtarget.hasVPOPCNTDQ()) {
   1476       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
   1477         setOperationAction(ISD::CTPOP, VT, Legal);
   1478     }
   1479   }
   1480 
    1481   // This block controls legalization of v32i1/v64i1, which are available with
    1482   // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled by
    1483   // useBWIRegs.
   1484   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
   1485     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
   1486     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
   1487 
   1488     for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
   1489       setOperationAction(ISD::ADD,                VT, Custom);
   1490       setOperationAction(ISD::SUB,                VT, Custom);
   1491       setOperationAction(ISD::MUL,                VT, Custom);
   1492       setOperationAction(ISD::VSELECT,            VT, Expand);
   1493 
   1494       setOperationAction(ISD::TRUNCATE,           VT, Custom);
   1495       setOperationAction(ISD::SETCC,              VT, Custom);
   1496       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   1497       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
   1498       setOperationAction(ISD::SELECT,             VT, Custom);
   1499       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
   1500       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
   1501     }
   1502 
   1503     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
   1504     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
   1505     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
   1506     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
   1507     for (auto VT : { MVT::v16i1, MVT::v32i1 })
   1508       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   1509 
   1510     // Extends from v32i1 masks to 256-bit vectors.
   1511     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
   1512     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
   1513     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
   1514   }
   1515 
   1516   // This block controls legalization for v32i16 and v64i8. 512-bits can be
   1517   // disabled based on prefer-vector-width and required-vector-width function
   1518   // attributes.
   1519   if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
   1520     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
   1521     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
   1522 
   1523     // Extends from v64i1 masks to 512-bit vectors.
   1524     setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
   1525     setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
   1526     setOperationAction(ISD::ANY_EXTEND,         MVT::v64i8, Custom);
   1527 
   1528     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
   1529     setOperationAction(ISD::MUL,                MVT::v64i8, Custom);
   1530     setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
   1531     setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
   1532     setOperationAction(ISD::MULHS,              MVT::v64i8, Custom);
   1533     setOperationAction(ISD::MULHU,              MVT::v64i8, Custom);
   1534     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
   1535     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
   1536     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Legal);
   1537     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Legal);
   1538     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
   1539     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
   1540     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i16, Custom);
   1541     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v64i8, Custom);
   1542     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
   1543     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
   1544     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i16, Custom);
   1545     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
   1546     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
   1547     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
   1548     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
   1549     setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
   1550     setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
   1551 
   1552     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
   1553 
   1554     setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
   1555 
   1556     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
   1557       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
   1558       setOperationAction(ISD::VSELECT,      VT, Custom);
   1559       setOperationAction(ISD::ABS,          VT, Legal);
   1560       setOperationAction(ISD::SRL,          VT, Custom);
   1561       setOperationAction(ISD::SHL,          VT, Custom);
   1562       setOperationAction(ISD::SRA,          VT, Custom);
   1563       setOperationAction(ISD::MLOAD,        VT, Legal);
   1564       setOperationAction(ISD::MSTORE,       VT, Legal);
   1565       setOperationAction(ISD::CTPOP,        VT, Custom);
   1566       setOperationAction(ISD::CTTZ,         VT, Custom);
   1567       setOperationAction(ISD::CTLZ,         VT, Custom);
   1568       setOperationAction(ISD::SMAX,         VT, Legal);
   1569       setOperationAction(ISD::UMAX,         VT, Legal);
   1570       setOperationAction(ISD::SMIN,         VT, Legal);
   1571       setOperationAction(ISD::UMIN,         VT, Legal);
   1572       setOperationAction(ISD::SETCC,        VT, Custom);
   1573 
   1574       setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
   1575       setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
   1576       setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
   1577     }
   1578 
   1579     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
   1580       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
   1581     }
   1582 
   1583     if (Subtarget.hasBITALG()) {
   1584       for (auto VT : { MVT::v64i8, MVT::v32i16 })
   1585         setOperationAction(ISD::CTPOP, VT, Legal);
   1586     }
   1587   }
   1588 
   1589   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
   1590     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
   1591       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
   1592       setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
   1593     }
   1594 
   1595     // These operations are handled on non-VLX by artificially widening in
   1596     // isel patterns.
   1597     // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
   1598 
   1599     if (Subtarget.hasBITALG()) {
   1600       for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
   1601         setOperationAction(ISD::CTPOP, VT, Legal);
   1602     }
   1603   }
   1604 
   1605   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
   1606     setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
   1607     setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
   1608     setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
   1609     setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
   1610     setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
   1611 
   1612     setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
   1613     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
   1614     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
   1615     setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
   1616     setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
   1617 
   1618     if (Subtarget.hasDQI()) {
    1619       // Fast v2f32 SINT_TO_FP(v2i64) custom conversion.
   1620       // v2f32 UINT_TO_FP is already custom under SSE2.
   1621       setOperationAction(ISD::SINT_TO_FP,    MVT::v2f32, Custom);
   1622       assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
   1623              "Unexpected operation action!");
   1624       // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
   1625       setOperationAction(ISD::FP_TO_SINT,    MVT::v2f32, Custom);
   1626       setOperationAction(ISD::FP_TO_UINT,    MVT::v2f32, Custom);
   1627     }
   1628 
   1629     if (Subtarget.hasBWI()) {
   1630       setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
   1631       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
   1632     }
   1633   }
   1634 
   1635   // We want to custom lower some of our intrinsics.
   1636   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   1637   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   1638   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   1639   if (!Subtarget.is64Bit()) {
   1640     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
   1641     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   1642   }
   1643 
   1644   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   1645   // handle type legalization for these operations here.
   1646   //
   1647   // FIXME: We really should do custom legalization for addition and
   1648   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   1649   // than generic legalization for 64-bit multiplication-with-overflow, though.
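           // Loosely sketched (the exact nodes live in the lowering code): an
           // saddo-style node becomes an X86 arithmetic node that also produces
           // EFLAGS, and the overflow bit is recovered with a setcc on the
           // overflow/carry condition, avoiding a separate compare.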
   1650   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
   1651     if (VT == MVT::i64 && !Subtarget.is64Bit())
   1652       continue;
   1653     // Add/Sub/Mul with overflow operations are custom lowered.
   1654     setOperationAction(ISD::SADDO, VT, Custom);
   1655     setOperationAction(ISD::UADDO, VT, Custom);
   1656     setOperationAction(ISD::SSUBO, VT, Custom);
   1657     setOperationAction(ISD::USUBO, VT, Custom);
   1658     setOperationAction(ISD::SMULO, VT, Custom);
   1659     setOperationAction(ISD::UMULO, VT, Custom);
   1660 
   1661     // Support carry in as value rather than glue.
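             // (ADDCARRY takes (lhs, rhs, carry-in) and yields (sum, carry-out),
             // which maps naturally onto ADC/SBB and avoids the older glue-based
             // ADDC/ADDE chains.)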
   1662     setOperationAction(ISD::ADDCARRY, VT, Custom);
   1663     setOperationAction(ISD::SUBCARRY, VT, Custom);
   1664     setOperationAction(ISD::SETCCCARRY, VT, Custom);
   1665   }
   1666 
   1667   if (!Subtarget.is64Bit()) {
    1668     // These libcalls are not available in 32-bit mode.
   1669     setLibcallName(RTLIB::SHL_I128, nullptr);
   1670     setLibcallName(RTLIB::SRL_I128, nullptr);
   1671     setLibcallName(RTLIB::SRA_I128, nullptr);
   1672     setLibcallName(RTLIB::MUL_I128, nullptr);
   1673   }
   1674 
   1675   // Combine sin / cos into _sincos_stret if it is available.
   1676   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
   1677       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
   1678     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
   1679     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   1680   }
   1681 
   1682   if (Subtarget.isTargetWin64()) {
   1683     setOperationAction(ISD::SDIV, MVT::i128, Custom);
   1684     setOperationAction(ISD::UDIV, MVT::i128, Custom);
   1685     setOperationAction(ISD::SREM, MVT::i128, Custom);
   1686     setOperationAction(ISD::UREM, MVT::i128, Custom);
   1687     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
   1688     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
   1689   }
   1690 
    1691   // On 32-bit MSVC, `fmodf(f32)` is not defined; only `fmod(f64)` is.
    1692   // We should promote the value to 64 bits to solve this, which is what
    1693   // the CRT headers do: `fmodf` is an inline header function that casts
    1694   // to f64 and calls `fmod`.
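           // Conceptually (a sketch, not the emitted code): %r = frem float %x, %y
           // becomes the f32 truncation of fmod((double)%x, (double)%y).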
   1695   if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
   1696                               Subtarget.isTargetWindowsItanium()))
   1697     for (ISD::NodeType Op :
   1698          {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
   1699           ISD::FLOG10, ISD::FPOW, ISD::FSIN})
   1700       if (isOperationExpand(Op, MVT::f32))
   1701         setOperationAction(Op, MVT::f32, Promote);
   1702 
   1703   // We have target-specific dag combine patterns for the following nodes:
   1704   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   1705   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   1706   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   1707   setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
   1708   setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
   1709   setTargetDAGCombine(ISD::BITCAST);
   1710   setTargetDAGCombine(ISD::VSELECT);
   1711   setTargetDAGCombine(ISD::SELECT);
   1712   setTargetDAGCombine(ISD::SHL);
   1713   setTargetDAGCombine(ISD::SRA);
   1714   setTargetDAGCombine(ISD::SRL);
   1715   setTargetDAGCombine(ISD::OR);
   1716   setTargetDAGCombine(ISD::AND);
   1717   setTargetDAGCombine(ISD::ADD);
   1718   setTargetDAGCombine(ISD::FADD);
   1719   setTargetDAGCombine(ISD::FSUB);
   1720   setTargetDAGCombine(ISD::FNEG);
   1721   setTargetDAGCombine(ISD::FMA);
   1722   setTargetDAGCombine(ISD::FMINNUM);
   1723   setTargetDAGCombine(ISD::FMAXNUM);
   1724   setTargetDAGCombine(ISD::SUB);
   1725   setTargetDAGCombine(ISD::LOAD);
   1726   setTargetDAGCombine(ISD::MLOAD);
   1727   setTargetDAGCombine(ISD::STORE);
   1728   setTargetDAGCombine(ISD::MSTORE);
   1729   setTargetDAGCombine(ISD::TRUNCATE);
   1730   setTargetDAGCombine(ISD::ZERO_EXTEND);
   1731   setTargetDAGCombine(ISD::ANY_EXTEND);
   1732   setTargetDAGCombine(ISD::SIGN_EXTEND);
   1733   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   1734   setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
   1735   setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
   1736   setTargetDAGCombine(ISD::SINT_TO_FP);
   1737   setTargetDAGCombine(ISD::UINT_TO_FP);
   1738   setTargetDAGCombine(ISD::SETCC);
   1739   setTargetDAGCombine(ISD::MUL);
   1740   setTargetDAGCombine(ISD::XOR);
   1741   setTargetDAGCombine(ISD::MSCATTER);
   1742   setTargetDAGCombine(ISD::MGATHER);
   1743 
   1744   computeRegisterProperties(Subtarget.getRegisterInfo());
   1745 
   1746   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
   1747   MaxStoresPerMemsetOptSize = 8;
   1748   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   1749   MaxStoresPerMemcpyOptSize = 4;
   1750   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   1751   MaxStoresPerMemmoveOptSize = 4;
   1752 
    1753   // TODO: These control memcmp expansion in CGP and could be raised higher, but
    1754   // that needs to be benchmarked and balanced with the potential use of vector
    1755   // load/store types (PR33329, PR33914).
   1756   MaxLoadsPerMemcmp = 2;
   1757   MaxLoadsPerMemcmpOptSize = 2;
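           // For example, with a limit of 2 an equality memcmp of 16 bytes can be
           // expanded into two 8-byte loads from each buffer plus compares instead
           // of a library call (illustrative; the expansion itself lives in CGP).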
   1758 
   1759   // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
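           // (With the default value of 4 this asks for 16-byte aligned loop
           // headers.)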
   1760   setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
   1761 
   1762   // An out-of-order CPU can speculatively execute past a predictable branch,
   1763   // but a conditional move could be stalled by an expensive earlier operation.
   1764   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
   1765   EnableExtLdPromotion = true;
   1766   setPrefFunctionAlignment(4); // 2^4 bytes.
   1767 
   1768   verifyIntrinsicTables();
   1769 }
   1770 
   1771 // This has so far only been implemented for 64-bit MachO.
   1772 bool X86TargetLowering::useLoadStackGuardNode() const {
   1773   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
   1774 }
   1775 
   1776 bool X86TargetLowering::useStackGuardXorFP() const {
   1777   // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
   1778   return Subtarget.getTargetTriple().isOSMSVCRT();
   1779 }
   1780 
   1781 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
   1782                                                const SDLoc &DL) const {
   1783   EVT PtrTy = getPointerTy(DAG.getDataLayout());
   1784   unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
   1785   MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
   1786   return SDValue(Node, 0);
   1787 }
   1788 
   1789 TargetLoweringBase::LegalizeTypeAction
   1790 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
   1791   if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
   1792     return TypeSplitVector;
   1793 
   1794   if (ExperimentalVectorWideningLegalization &&
   1795       VT.getVectorNumElements() != 1 &&
   1796       VT.getVectorElementType().getSimpleVT() != MVT::i1)
   1797     return TypeWidenVector;
   1798 
   1799   return TargetLoweringBase::getPreferredVectorAction(VT);
   1800 }
   1801 
   1802 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
   1803                                                      CallingConv::ID CC,
   1804                                                      EVT VT) const {
   1805   if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
   1806     return MVT::v32i8;
   1807   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
   1808 }
   1809 
   1810 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
   1811                                                           CallingConv::ID CC,
   1812                                                           EVT VT) const {
   1813   if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
   1814     return 1;
   1815   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
   1816 }
   1817 
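         // Informal example of the policy implemented below: with AVX512VL a setcc
         // on v8i32 yields v8i1, while without AVX512 it yields v8i32, i.e. the
         // element type is simply switched to an integer of the same width.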
   1818 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
   1819                                           LLVMContext& Context,
   1820                                           EVT VT) const {
   1821   if (!VT.isVector())
   1822     return MVT::i8;
   1823 
   1824   if (Subtarget.hasAVX512()) {
   1825     const unsigned NumElts = VT.getVectorNumElements();
   1826 
   1827     // Figure out what this type will be legalized to.
   1828     EVT LegalVT = VT;
   1829     while (getTypeAction(Context, LegalVT) != TypeLegal)
   1830       LegalVT = getTypeToTransformTo(Context, LegalVT);
   1831 
   1832     // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
   1833     if (LegalVT.getSimpleVT().is512BitVector())
   1834       return EVT::getVectorVT(Context, MVT::i1, NumElts);
   1835 
   1836     if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
   1837       // If we legalized to less than a 512-bit vector, then we will use a vXi1
   1838       // compare for vXi32/vXi64 for sure. If we have BWI we will also support
   1839       // vXi16/vXi8.
   1840       MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
   1841       if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
   1842         return EVT::getVectorVT(Context, MVT::i1, NumElts);
   1843     }
   1844   }
   1845 
   1846   return VT.changeVectorElementTypeToInteger();
   1847 }
   1848 
   1849 /// Helper for getByValTypeAlignment to determine
   1850 /// the desired ByVal argument alignment.
   1851 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
   1852   if (MaxAlign == 16)
   1853     return;
   1854   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
   1855     if (VTy->getBitWidth() == 128)
   1856       MaxAlign = 16;
   1857   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   1858     unsigned EltAlign = 0;
   1859     getMaxByValAlign(ATy->getElementType(), EltAlign);
   1860     if (EltAlign > MaxAlign)
   1861       MaxAlign = EltAlign;
   1862   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   1863     for (auto *EltTy : STy->elements()) {
   1864       unsigned EltAlign = 0;
   1865       getMaxByValAlign(EltTy, EltAlign);
   1866       if (EltAlign > MaxAlign)
   1867         MaxAlign = EltAlign;
   1868       if (MaxAlign == 16)
   1869         break;
   1870     }
   1871   }
   1872 }
   1873 
   1874 /// Return the desired alignment for ByVal aggregate
   1875 /// function arguments in the caller parameter area. For X86, aggregates
   1876 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
   1877 /// are at 4-byte boundaries.
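         /// For example, on 32-bit x86 with SSE a struct containing a <4 x float>
         /// field is placed at a 16-byte boundary, while a struct of plain i32
         /// fields keeps the default 4-byte boundary.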
   1878 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
   1879                                                   const DataLayout &DL) const {
   1880   if (Subtarget.is64Bit()) {
   1881     // Max of 8 and alignment of type.
   1882     unsigned TyAlign = DL.getABITypeAlignment(Ty);
   1883     if (TyAlign > 8)
   1884       return TyAlign;
   1885     return 8;
   1886   }
   1887 
   1888   unsigned Align = 4;
   1889   if (Subtarget.hasSSE1())
   1890     getMaxByValAlign(Ty, Align);
   1891   return Align;
   1892 }
   1893 
   1894 /// Returns the target specific optimal type for load
    1895 /// Returns the target-specific optimal type for load
    1896 /// and store operations as a result of memset, memcpy, and memmove
    1897 /// lowering. If DstAlign is zero, the destination can satisfy any alignment
    1898 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
    1899 /// against an alignment requirement,
   1900 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
   1901 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
   1902 /// source is constant so it does not need to be loaded.
   1903 /// It returns EVT::Other if the type should be determined using generic
   1904 /// target-independent logic.
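         /// For example (mirroring the logic below, not additional policy): a 32-byte
         /// memcpy with suitably aligned or fast-unaligned operands returns MVT::v32i8
         /// on an AVX target, MVT::v16i8 with SSE2, MVT::v4f32 with only SSE1, and
         /// otherwise falls back to MVT::i64 or MVT::i32.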
   1905 EVT
   1906 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   1907                                        unsigned DstAlign, unsigned SrcAlign,
   1908                                        bool IsMemset, bool ZeroMemset,
   1909                                        bool MemcpyStrSrc,
   1910                                        MachineFunction &MF) const {
   1911   const Function &F = MF.getFunction();
   1912   if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
   1913     if (Size >= 16 &&
   1914         (!Subtarget.isUnalignedMem16Slow() ||
   1915          ((DstAlign == 0 || DstAlign >= 16) &&
   1916           (SrcAlign == 0 || SrcAlign >= 16)))) {
   1917       // FIXME: Check if unaligned 32-byte accesses are slow.
   1918       if (Size >= 32 && Subtarget.hasAVX()) {
   1919         // Although this isn't a well-supported type for AVX1, we'll let
   1920         // legalization and shuffle lowering produce the optimal codegen. If we
   1921         // choose an optimal type with a vector element larger than a byte,
   1922         // getMemsetStores() may create an intermediate splat (using an integer
   1923         // multiply) before we splat as a vector.
   1924         return MVT::v32i8;
   1925       }
   1926       if (Subtarget.hasSSE2())
   1927         return MVT::v16i8;
   1928       // TODO: Can SSE1 handle a byte vector?
   1929       if (Subtarget.hasSSE1())
   1930         return MVT::v4f32;
   1931     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
   1932                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
   1933       // Do not use f64 to lower memcpy if source is string constant. It's
   1934       // better to use i32 to avoid the loads.
   1935       // Also, do not use f64 to lower memset unless this is a memset of zeros.
   1936       // The gymnastics of splatting a byte value into an XMM register and then
   1937       // only using 8-byte stores (because this is a CPU with slow unaligned
   1938       // 16-byte accesses) makes that a loser.
   1939       return MVT::f64;
   1940     }
   1941   }
   1942   // This is a compromise. If we reach here, unaligned accesses may be slow on
   1943   // this target. However, creating smaller, aligned accesses could be even
   1944   // slower and would certainly be a lot more code.
   1945   if (Subtarget.is64Bit() && Size >= 8)
   1946     return MVT::i64;
   1947   return MVT::i32;
   1948 }
   1949 
   1950 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   1951   if (VT == MVT::f32)
   1952     return X86ScalarSSEf32;
   1953   else if (VT == MVT::f64)
   1954     return X86ScalarSSEf64;
   1955   return true;
   1956 }
   1957 
   1958 bool
   1959 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   1960                                                   unsigned,
   1961                                                   unsigned,
   1962                                                   bool *Fast) const {
   1963   if (Fast) {
   1964     switch (VT.getSizeInBits()) {
   1965     default:
   1966       // 8-byte and under are always assumed to be fast.
   1967       *Fast = true;
   1968       break;
   1969     case 128:
   1970       *Fast = !Subtarget.isUnalignedMem16Slow();
   1971       break;
   1972     case 256:
   1973       *Fast = !Subtarget.isUnalignedMem32Slow();
   1974       break;
   1975     // TODO: What about AVX-512 (512-bit) accesses?
   1976     }
   1977   }
   1978   // Misaligned accesses of any size are always allowed.
   1979   return true;
   1980 }
   1981 
   1982 /// Return the entry encoding for a jump table in the
   1983 /// current function.  The returned value is a member of the
   1984 /// MachineJumpTableInfo::JTEntryKind enum.
   1985 unsigned X86TargetLowering::getJumpTableEncoding() const {
   1986   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
   1987   // symbol.
   1988   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
   1989     return MachineJumpTableInfo::EK_Custom32;
   1990 
   1991   // Otherwise, use the normal jump table encoding heuristics.
   1992   return TargetLowering::getJumpTableEncoding();
   1993 }
   1994 
   1995 bool X86TargetLowering::useSoftFloat() const {
   1996   return Subtarget.useSoftFloat();
   1997 }
   1998 
   1999 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
   2000                                               ArgListTy &Args) const {
   2001 
   2002   // Only relabel X86-32 for C / Stdcall CCs.
   2003   if (Subtarget.is64Bit())
   2004     return;
   2005   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
   2006     return;
   2007   unsigned ParamRegs = 0;
   2008   if (auto *M = MF->getFunction().getParent())
   2009     ParamRegs = M->getNumberRegisterParameters();
   2010 
    2011   // Mark the first N integer/pointer arguments as being passed in registers.
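           // Worked example (hypothetical numbers): with 3 register parameters and
           // libcall arguments (i32, i64, i32), the first i32 takes one register,
           // the i64 takes the remaining two, and the trailing i32 finds none left,
           // so only the first two arguments are marked IsInReg.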
   2012   for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
   2013     Type *T = Args[Idx].Ty;
   2014     if (T->isIntOrPtrTy())
   2015       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
   2016         unsigned numRegs = 1;
   2017         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
   2018           numRegs = 2;
   2019         if (ParamRegs < numRegs)
   2020           return;
   2021         ParamRegs -= numRegs;
   2022         Args[Idx].IsInReg = true;
   2023       }
   2024   }
   2025 }
   2026 
   2027 const MCExpr *
   2028 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
   2029                                              const MachineBasicBlock *MBB,
   2030                                              unsigned uid,MCContext &Ctx) const{
   2031   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
   2032   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   2033   // entries.
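           // Each entry then looks something like ".long .LBB0_3@GOTOFF" (label name
           // is illustrative), i.e. an offset relative to the GOT base rather than an
           // absolute address.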
   2034   return MCSymbolRefExpr::create(MBB->getSymbol(),
   2035                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
   2036 }
   2037 
   2038 /// Returns relocation base for the given PIC jumptable.
   2039 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
   2040                                                     SelectionDAG &DAG) const {
   2041   if (!Subtarget.is64Bit())
   2042     // This doesn't have SDLoc associated with it, but is not really the
   2043     // same as a Register.
   2044     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
   2045                        getPointerTy(DAG.getDataLayout()));
   2046   return Table;
   2047 }
   2048 
   2049 /// This returns the relocation base for the given PIC jumptable,
   2050 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
   2051 const MCExpr *X86TargetLowering::
   2052 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   2053                              MCContext &Ctx) const {
   2054   // X86-64 uses RIP relative addressing based on the jump table label.
   2055   if (Subtarget.isPICStyleRIPRel())
   2056     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
   2057 
   2058   // Otherwise, the reference is relative to the PIC base.
   2059   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
   2060 }
   2061 
   2062 std::pair<const TargetRegisterClass *, uint8_t>
   2063 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
   2064                                            MVT VT) const {
   2065   const TargetRegisterClass *RRC = nullptr;
   2066   uint8_t Cost = 1;
   2067   switch (VT.SimpleTy) {
   2068   default:
   2069     return TargetLowering::findRepresentativeClass(TRI, VT);
   2070   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
   2071     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
   2072     break;
   2073   case MVT::x86mmx:
   2074     RRC = &X86::VR64RegClass;
   2075     break;
   2076   case MVT::f32: case MVT::f64:
   2077   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   2078   case MVT::v4f32: case MVT::v2f64:
   2079   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
   2080   case MVT::v8f32: case MVT::v4f64:
   2081   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
   2082   case MVT::v16f32: case MVT::v8f64:
   2083     RRC = &X86::VR128XRegClass;
   2084     break;
   2085   }
   2086   return std::make_pair(RRC, Cost);
   2087 }
   2088 
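         // In X86 address-space numbering, 256 refers to %gs and 257 to %fs, so the
         // TLS-based stack guard and SafeStack slots below are addressed through %fs
         // on 64-bit non-kernel code and through %gs everywhere else.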
   2089 unsigned X86TargetLowering::getAddressSpace() const {
   2090   if (Subtarget.is64Bit())
   2091     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
   2092   return 256;
   2093 }
   2094 
   2095 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
   2096   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
   2097          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
   2098 }
   2099 
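         // Builds a constant pointer into one of the segment address spaces. For
         // instance (illustrative), SegmentOffset(IRB, 0x28, 257) produces
         //   inttoptr (i32 40 to i8* addrspace(257)*)
         // which later codegen turns into an access at %fs:0x28.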
   2100 static Constant* SegmentOffset(IRBuilder<> &IRB,
   2101                                unsigned Offset, unsigned AddressSpace) {
   2102   return ConstantExpr::getIntToPtr(
   2103       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
   2104       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
   2105 }
   2106 
   2107 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
   2108   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
   2109   // tcbhead_t; use it instead of the usual global variable (see
   2110   // sysdeps/{i386,x86_64}/nptl/tls.h)
   2111   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
   2112     if (Subtarget.isTargetFuchsia()) {
   2113       // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
   2114       return SegmentOffset(IRB, 0x10, getAddressSpace());
   2115     } else {
    2116       // %fs:0x28, unless we're using a Kernel code model, in which case
    2117       // it's %gs:0x28. %gs:0x14 on i386.
   2118       unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
   2119       return SegmentOffset(IRB, Offset, getAddressSpace());
   2120     }
   2121   }
   2122 
   2123   return TargetLowering::getIRStackGuard(IRB);
   2124 }
   2125 
   2126 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
    2127   // The MSVC CRT provides functionality for stack protection.
   2128   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
   2129       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
   2130     // MSVC CRT has a global variable holding security cookie.
   2131     M.getOrInsertGlobal("__security_cookie",
   2132                         Type::getInt8PtrTy(M.getContext()));
   2133 
   2134     // MSVC CRT has a function to validate security cookie.
   2135     auto *SecurityCheckCookie = cast<Function>(
   2136         M.getOrInsertFunction("__security_check_cookie",
   2137                               Type::getVoidTy(M.getContext()),
   2138                               Type::getInt8PtrTy(M.getContext())));
   2139     SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
   2140     SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
   2141     return;
   2142   }
   2143   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
   2144   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
   2145     return;
   2146   TargetLowering::insertSSPDeclarations(M);
   2147 }
   2148 
   2149 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
   2150   // MSVC CRT has a global variable holding security cookie.
   2151   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
   2152       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
   2153     return M.getGlobalVariable("__security_cookie");
   2154   }
   2155   return TargetLowering::getSDagStackGuard(M);
   2156 }
   2157 
   2158 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // The MSVC CRT has a function to validate the security cookie.
   2160   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
   2161       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
   2162     return M.getFunction("__security_check_cookie");
   2163   }
   2164   return TargetLowering::getSSPStackGuardCheck(M);
   2165 }
   2166 
   2167 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   2168   if (Subtarget.getTargetTriple().isOSContiki())
   2169     return getDefaultSafeStackPointerLocation(IRB, false);
   2170 
   2171   // Android provides a fixed TLS slot for the SafeStack pointer. See the
   2172   // definition of TLS_SLOT_SAFESTACK in
   2173   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
   2174   if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48. %gs:0x24 on i386.
   2177     unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
   2178     return SegmentOffset(IRB, Offset, getAddressSpace());
   2179   }
   2180 
   2181   // Fuchsia is similar.
   2182   if (Subtarget.isTargetFuchsia()) {
   2183     // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
   2184     return SegmentOffset(IRB, 0x18, getAddressSpace());
   2185   }
   2186 
   2187   return TargetLowering::getSafeStackPointerLocation(IRB);
   2188 }
   2189 
   2190 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
   2191                                             unsigned DestAS) const {
   2192   assert(SrcAS != DestAS && "Expected different address spaces!");
   2193 
   2194   return SrcAS < 256 && DestAS < 256;
   2195 }
   2196 
   2197 //===----------------------------------------------------------------------===//
   2198 //               Return Value Calling Convention Implementation
   2199 //===----------------------------------------------------------------------===//
   2200 
   2201 #include "X86GenCallingConv.inc"
   2202 
   2203 bool X86TargetLowering::CanLowerReturn(
   2204     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
   2205     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
   2206   SmallVector<CCValAssign, 16> RVLocs;
   2207   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   2208   return CCInfo.CheckReturn(Outs, RetCC_X86);
   2209 }
   2210 
   2211 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
   2212   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
   2213   return ScratchRegs;
   2214 }
   2215 
/// Lowers a mask value (v*i1) to the value used in its register location.
/// \returns the DAG node after lowering to the register type.
   2218 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
   2219                                const SDLoc &Dl, SelectionDAG &DAG) {
   2220   EVT ValVT = ValArg.getValueType();
   2221 
   2222   if (ValVT == MVT::v1i1)
   2223     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
   2224                        DAG.getIntPtrConstant(0, Dl));
   2225 
   2226   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
   2227       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two-stage lowering might be required:
   2229     // bitcast:   v8i1 -> i8 / v16i1 -> i16
   2230     // anyextend: i8   -> i32 / i16   -> i32
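    // e.g. a v16i1 mask returned in an i32 location becomes
    //   (i32 (any_extend (i16 (bitcast %mask)))).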
   2231     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
   2232     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
   2233     if (ValLoc == MVT::i32)
   2234       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
   2235     return ValToCopy;
   2236   }
   2237 
   2238   if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
   2239       (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One-stage lowering is required:
   2241     // bitcast:   v32i1 -> i32 / v64i1 -> i64
   2242     return DAG.getBitcast(ValLoc, ValArg);
   2243   }
   2244 
   2245   return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
   2246 }
   2247 
/// Splits a v64i1 value into two i32 halves, adds the new nodes to the DAG,
/// and records the half/register pairs in RegsToPass.
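/// For example, on a 32 bit regcall target a v64i1 value is bitcast to i64,
/// split into its low and high i32 halves with EXTRACT_ELEMENT, and each half
/// is paired with one of the two assigned 32 bit registers.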
   2249 static void Passv64i1ArgInRegs(
   2250     const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
   2251     SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
   2252     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
   2253   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
   2254   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
   2255   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
   2256   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
   2257          "The value should reside in two registers");
   2258 
   2259   // Before splitting the value we cast it to i64
   2260   Arg = DAG.getBitcast(MVT::i64, Arg);
   2261 
  // Split the value into two i32 halves.
   2263   SDValue Lo, Hi;
   2264   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
   2265                    DAG.getConstant(0, Dl, MVT::i32));
   2266   Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
   2267                    DAG.getConstant(1, Dl, MVT::i32));
   2268 
  // Attach the two i32 halves to their corresponding registers.
   2270   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
   2271   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
   2272 }
   2273 
   2274 SDValue
   2275 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   2276                                bool isVarArg,
   2277                                const SmallVectorImpl<ISD::OutputArg> &Outs,
   2278                                const SmallVectorImpl<SDValue> &OutVals,
   2279                                const SDLoc &dl, SelectionDAG &DAG) const {
   2280   MachineFunction &MF = DAG.getMachineFunction();
   2281   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   2282 
   2283   // In some cases we need to disable registers from the default CSR list.
   2284   // For example, when they are used for argument passing.
   2285   bool ShouldDisableCalleeSavedRegister =
   2286       CallConv == CallingConv::X86_RegCall ||
   2287       MF.getFunction().hasFnAttribute("no_caller_saved_registers");
   2288 
   2289   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
   2290     report_fatal_error("X86 interrupts may not return any value");
   2291 
   2292   SmallVector<CCValAssign, 16> RVLocs;
   2293   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
   2294   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
   2295 
   2296   SDValue Flag;
   2297   SmallVector<SDValue, 6> RetOps;
   2298   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   2299   // Operand #1 = Bytes To Pop
   2300   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
   2301                    MVT::i32));
   2302 
   2303   // Copy the result values into the output registers.
   2304   for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
   2305        ++I, ++OutsIndex) {
   2306     CCValAssign &VA = RVLocs[I];
   2307     assert(VA.isRegLoc() && "Can only return in registers!");
   2308 
   2309     // Add the register to the CalleeSaveDisableRegs list.
   2310     if (ShouldDisableCalleeSavedRegister)
   2311       MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
   2312 
   2313     SDValue ValToCopy = OutVals[OutsIndex];
   2314     EVT ValVT = ValToCopy.getValueType();
   2315 
   2316     // Promote values to the appropriate types.
   2317     if (VA.getLocInfo() == CCValAssign::SExt)
   2318       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2319     else if (VA.getLocInfo() == CCValAssign::ZExt)
   2320       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2321     else if (VA.getLocInfo() == CCValAssign::AExt) {
   2322       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
   2323         ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
   2324       else
   2325         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
   2326     }
   2327     else if (VA.getLocInfo() == CCValAssign::BCvt)
   2328       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
   2329 
   2330     assert(VA.getLocInfo() != CCValAssign::FPExt &&
   2331            "Unexpected FP-extend for return value.");
   2332 
   2333     // If this is x86-64, and we disabled SSE, we can't return FP values,
   2334     // or SSE or MMX vectors.
   2335     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
   2336          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
   2337         (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
   2338       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
   2339       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
   2340     } else if (ValVT == MVT::f64 &&
   2341                (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
   2342       // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
   2343       // llvm-gcc has never done it right and no one has noticed, so this
   2344       // should be OK for now.
   2345       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
   2346       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
   2347     }
   2348 
   2349     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
   2350     // the RET instruction and handled by the FP Stackifier.
   2351     if (VA.getLocReg() == X86::FP0 ||
   2352         VA.getLocReg() == X86::FP1) {
   2353       // If this is a copy from an xmm register to ST(0), use an FPExtend to
   2354       // change the value to the FP stack register class.
   2355       if (isScalarFPTypeInSSEReg(VA.getValVT()))
   2356         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
   2357       RetOps.push_back(ValToCopy);
   2358       // Don't emit a copytoreg.
   2359       continue;
   2360     }
   2361 
   2362     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
   2363     // which is returned in RAX / RDX.
   2364     if (Subtarget.is64Bit()) {
   2365       if (ValVT == MVT::x86mmx) {
   2366         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
   2367           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
   2368           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   2369                                   ValToCopy);
   2370           // If we don't have SSE2 available, convert to v4f32 so the generated
   2371           // register is legal.
   2372           if (!Subtarget.hasSSE2())
   2373             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
   2374         }
   2375       }
   2376     }
   2377 
   2378     SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   2379 
   2380     if (VA.needsCustom()) {
   2381       assert(VA.getValVT() == MVT::v64i1 &&
   2382              "Currently the only custom case is when we split v64i1 to 2 regs");
   2383 
   2384       Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
   2385                          Subtarget);
   2386 
   2387       assert(2 == RegsToPass.size() &&
             "Expecting two registers after Passv64i1ArgInRegs");
   2389 
   2390       // Add the second register to the CalleeSaveDisableRegs list.
   2391       if (ShouldDisableCalleeSavedRegister)
   2392         MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
   2393     } else {
   2394       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
   2395     }
   2396 
   2397     // Add nodes to the DAG and add the values into the RetOps list
   2398     for (auto &Reg : RegsToPass) {
   2399       Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
   2400       Flag = Chain.getValue(1);
   2401       RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
   2402     }
   2403   }
   2404 
  // The Swift calling convention does not require us to copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
   2407 
   2408   // All x86 ABIs require that for returning structs by value we copy
   2409   // the sret argument into %rax/%eax (depending on ABI) for the return.
   2410   // We saved the argument into a virtual register in the entry block,
   2411   // so now we copy the value out and into %rax/%eax.
   2412   //
   2413   // Checking Function.hasStructRetAttr() here is insufficient because the IR
   2414   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
   2415   // false, then an sret argument may be implicitly inserted in the SelDAG. In
   2416   // either case FuncInfo->setSRetReturnReg() will have been called.
   2417   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
   2418     // When we have both sret and another return value, we should use the
   2419     // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals Chain.
   2421 
   2422     // For the case of sret and another return value, we have
   2423     //   Chain_0 at the function entry
   2424     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
   2425     // If we use Chain_1 in getCopyFromReg, we will have
   2426     //   Val = getCopyFromReg(Chain_1)
   2427     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
   2428 
   2429     // getCopyToReg(Chain_0) will be glued together with
   2430     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
   2431     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
   2432     //   Data dependency from Unit B to Unit A due to usage of Val in
   2433     //     getCopyToReg(Chain_1, Val)
   2434     //   Chain dependency from Unit A to Unit B
   2435 
   2436     // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
   2437     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
   2438                                      getPointerTy(MF.getDataLayout()));
   2439 
   2440     unsigned RetValReg
   2441         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
   2442           X86::RAX : X86::EAX;
   2443     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
   2444     Flag = Chain.getValue(1);
   2445 
   2446     // RAX/EAX now acts like a return value.
   2447     RetOps.push_back(
   2448         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
   2449 
   2450     // Add the returned register to the CalleeSaveDisableRegs list.
   2451     if (ShouldDisableCalleeSavedRegister)
   2452       MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
   2453   }
   2454 
   2455   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
   2456   const MCPhysReg *I =
   2457       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
   2458   if (I) {
   2459     for (; *I; ++I) {
   2460       if (X86::GR64RegClass.contains(*I))
   2461         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
   2462       else
   2463         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
   2464     }
   2465   }
   2466 
   2467   RetOps[0] = Chain;  // Update chain.
   2468 
   2469   // Add the flag if we have it.
   2470   if (Flag.getNode())
   2471     RetOps.push_back(Flag);
   2472 
   2473   X86ISD::NodeType opcode = X86ISD::RET_FLAG;
   2474   if (CallConv == CallingConv::X86_INTR)
   2475     opcode = X86ISD::IRET;
   2476   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
   2477 }
   2478 
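/// Return true if the value produced by N is only consumed by a return
/// (possibly through a single CopyToReg or FP_EXTEND), which is what the
/// generic code checks before folding the call that produces N into a tail
/// call. On success, Chain is updated to the chain operand the return uses.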
   2479 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   2480   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
   2481     return false;
   2482 
   2483   SDValue TCChain = Chain;
   2484   SDNode *Copy = *N->use_begin();
   2485   if (Copy->getOpcode() == ISD::CopyToReg) {
   2486     // If the copy has a glue operand, we conservatively assume it isn't safe to
   2487     // perform a tail call.
   2488     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
   2489       return false;
   2490     TCChain = Copy->getOperand(0);
   2491   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
   2492     return false;
   2493 
   2494   bool HasRet = false;
   2495   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
   2496        UI != UE; ++UI) {
   2497     if (UI->getOpcode() != X86ISD::RET_FLAG)
   2498       return false;
    // If we are returning more than one value, we can definitely not make a
    // tail call; see PR19530.
   2501     if (UI->getNumOperands() > 4)
   2502       return false;
   2503     if (UI->getNumOperands() == 4 &&
   2504         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
   2505       return false;
   2506     HasRet = true;
   2507   }
   2508 
   2509   if (!HasRet)
   2510     return false;
   2511 
   2512   Chain = TCChain;
   2513   return true;
   2514 }
   2515 
   2516 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
   2517                                            ISD::NodeType ExtendKind) const {
   2518   MVT ReturnMVT = MVT::i32;
   2519 
   2520   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
   2521   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
   2522     // The ABI does not require i1, i8 or i16 to be extended.
   2523     //
   2524     // On Darwin, there is code in the wild relying on Clang's old behaviour of
   2525     // always extending i8/i16 return values, so keep doing that for now.
   2526     // (PR26665).
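    // For example, an i8 return value is reported as i8 on non-Darwin targets
    // but widened to i32 on Darwin.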
   2527     ReturnMVT = MVT::i8;
   2528   }
   2529 
   2530   EVT MinVT = getRegisterType(Context, ReturnMVT);
   2531   return VT.bitsLT(MinVT) ? MinVT : VT;
   2532 }
   2533 
   2534 /// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit location that needs to be assigned.
/// \param NextVA The next 32 bit location that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag The glue SDValue of the parent DAG node. If the DAG
///                        is already reading from physical registers rather
///                        than virtual ones, the new copies are glued to
///                        InFlag, and InFlag is updated on return.
/// \returns a new 64 bit wide SDValue.
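/// For example, a v64i1 regcall argument arriving in two 32 bit GPRs is read
/// as two i32 copies, each bitcast to v32i1, and then concatenated into a
/// single v64i1 value.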
   2543 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
   2544                                 SDValue &Root, SelectionDAG &DAG,
   2545                                 const SDLoc &Dl, const X86Subtarget &Subtarget,
   2546                                 SDValue *InFlag = nullptr) {
   2547   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
   2548   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
   2549   assert(VA.getValVT() == MVT::v64i1 &&
   2550          "Expecting first location of 64 bit width type");
   2551   assert(NextVA.getValVT() == VA.getValVT() &&
   2552          "The locations should have the same type");
   2553   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
   2554          "The values should reside in two registers");
   2555 
   2556   SDValue Lo, Hi;
   2557   unsigned Reg;
   2558   SDValue ArgValueLo, ArgValueHi;
   2559 
   2560   MachineFunction &MF = DAG.getMachineFunction();
   2561   const TargetRegisterClass *RC = &X86::GR32RegClass;
   2562 
   2563   // Read a 32 bit value from the registers.
   2564   if (nullptr == InFlag) {
   2565     // When no physical register is present,
   2566     // create an intermediate virtual register.
   2567     Reg = MF.addLiveIn(VA.getLocReg(), RC);
   2568     ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
   2569     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
   2570     ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
   2571   } else {
    // When a physical register is available, read the value from it and glue
    // the reads together.
   2574     ArgValueLo =
   2575       DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
   2576     *InFlag = ArgValueLo.getValue(2);
   2577     ArgValueHi =
   2578       DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
   2579     *InFlag = ArgValueHi.getValue(2);
   2580   }
   2581 
  // Convert the low i32 into a v32i1 vector.
   2583   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
   2584 
  // Convert the high i32 into a v32i1 vector.
   2586   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
   2587 
   2588   // Concatenate the two values together.
   2589   return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
   2590 }
   2591 
/// Lowers a register value of size 8/16/32/64 bits to a mask value of the
/// expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to the mask type.
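/// For example, a v16i1 mask that was returned in an i32 location is first
/// truncated to i16 and then bitcast back to v16i1.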
   2595 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
   2596                                const EVT &ValLoc, const SDLoc &Dl,
   2597                                SelectionDAG &DAG) {
   2598   SDValue ValReturned = ValArg;
   2599 
   2600   if (ValVT == MVT::v1i1)
   2601     return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
   2602 
   2603   if (ValVT == MVT::v64i1) {
    // On 32 bit targets this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On 64 bit targets there is no need to truncate the value, only bitcast it.
   2607   } else {
   2608     MVT maskLen;
   2609     switch (ValVT.getSimpleVT().SimpleTy) {
   2610     case MVT::v8i1:
   2611       maskLen = MVT::i8;
   2612       break;
   2613     case MVT::v16i1:
   2614       maskLen = MVT::i16;
   2615       break;
   2616     case MVT::v32i1:
   2617       maskLen = MVT::i32;
   2618       break;
   2619     default:
   2620       llvm_unreachable("Expecting a vector of i1 types");
   2621     }
   2622 
   2623     ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
   2624   }
   2625   return DAG.getBitcast(ValVT, ValReturned);
   2626 }
   2627 
/// Lower the result values of a call into the appropriate copies out of the
/// physical registers they were returned in.
///
   2631 SDValue X86TargetLowering::LowerCallResult(
   2632     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
   2633     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
   2634     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
   2635     uint32_t *RegMask) const {
   2636 
   2637   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   2638   // Assign locations to each value returned by this call.
   2639   SmallVector<CCValAssign, 16> RVLocs;
   2640   bool Is64Bit = Subtarget.is64Bit();
   2641   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
   2642                  *DAG.getContext());
   2643   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   2644 
   2645   // Copy all of the result registers out of their specified physreg.
   2646   for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
   2647        ++I, ++InsIndex) {
   2648     CCValAssign &VA = RVLocs[I];
   2649     EVT CopyVT = VA.getLocVT();
   2650 
   2651     // In some calling conventions we need to remove the used registers
   2652     // from the register mask.
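    // For example, if a result comes back in EAX, the bits for EAX and its
    // sub-registers (AX, AH, AL) are cleared so they are no longer treated as
    // preserved across the call.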
   2653     if (RegMask) {
   2654       for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
   2655            SubRegs.isValid(); ++SubRegs)
   2656         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
   2657     }
   2658 
   2659     // If this is x86-64, and we disabled SSE, we can't return FP values
   2660     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
   2661         ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
   2662       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
   2663       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
   2664     }
   2665 
   2666     // If we prefer to use the value in xmm registers, copy it out as f80 and
   2667     // use a truncate to move it from fp stack reg to xmm reg.
   2668     bool RoundAfterCopy = false;
   2669     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
   2670         isScalarFPTypeInSSEReg(VA.getValVT())) {
   2671       if (!Subtarget.hasX87())
   2672         report_fatal_error("X87 register return with X87 disabled");
   2673       CopyVT = MVT::f80;
   2674       RoundAfterCopy = (CopyVT != VA.getLocVT());
   2675     }
   2676 
   2677     SDValue Val;
   2678     if (VA.needsCustom()) {
   2679       assert(VA.getValVT() == MVT::v64i1 &&
   2680              "Currently the only custom case is when we split v64i1 to 2 regs");
   2681       Val =
   2682           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
   2683     } else {
   2684       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
   2685                   .getValue(1);
   2686       Val = Chain.getValue(0);
   2687       InFlag = Chain.getValue(2);
   2688     }
   2689 
   2690     if (RoundAfterCopy)
   2691       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
   2692                         // This truncation won't change the value.
   2693                         DAG.getIntPtrConstant(1, dl));
   2694 
   2695     if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
   2696       if (VA.getValVT().isVector() &&
   2697           ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
   2698            (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8.
   2700         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
   2701       } else
   2702         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
   2703     }
   2704 
   2705     InVals.push_back(Val);
   2706   }
   2707 
   2708   return Chain;
   2709 }
   2710 
   2711 //===----------------------------------------------------------------------===//
   2712 //                C & StdCall & Fast Calling Convention implementation
   2713 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee cleans up the stack instead of the caller, and symbols are
//  decorated differently. It does not support vector arguments.
//  For info on the fast calling convention, see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
   2720 
   2721 /// CallIsStructReturn - Determines whether a call uses struct return
   2722 /// semantics.
   2723 enum StructReturnType {
   2724   NotStructReturn,
   2725   RegStructReturn,
   2726   StackStructReturn
   2727 };
   2728 static StructReturnType
   2729 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
   2730   if (Outs.empty())
   2731     return NotStructReturn;
   2732 
   2733   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
   2734   if (!Flags.isSRet())
   2735     return NotStructReturn;
   2736   if (Flags.isInReg() || IsMCU)
   2737     return RegStructReturn;
   2738   return StackStructReturn;
   2739 }
   2740 
   2741 /// Determines whether a function uses struct return semantics.
   2742 static StructReturnType
   2743 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
   2744   if (Ins.empty())
   2745     return NotStructReturn;
   2746 
   2747   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
   2748   if (!Flags.isSRet())
   2749     return NotStructReturn;
   2750   if (Flags.isInReg() || IsMCU)
   2751     return RegStructReturn;
   2752   return StackStructReturn;
   2753 }
   2754 
/// Make a copy of an aggregate at the address specified by "Src" to the
/// address "Dst", with size and alignment information specified by the byval
/// parameter attribute. The copy will be passed as a byval function parameter.
   2758 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
   2759                                          SDValue Chain, ISD::ArgFlagsTy Flags,
   2760                                          SelectionDAG &DAG, const SDLoc &dl) {
   2761   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
   2762 
   2763   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
   2764                        /*isVolatile*/false, /*AlwaysInline=*/true,
   2765                        /*isTailCall*/false,
   2766                        MachinePointerInfo(), MachinePointerInfo());
   2767 }
   2768 
   2769 /// Return true if the calling convention is one that we can guarantee TCO for.
   2770 static bool canGuaranteeTCO(CallingConv::ID CC) {
   2771   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
   2772           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
   2773           CC == CallingConv::HHVM);
   2774 }
   2775 
   2776 /// Return true if we might ever do TCO for calls with this calling convention.
   2777 static bool mayTailCallThisCC(CallingConv::ID CC) {
   2778   switch (CC) {
   2779   // C calling conventions:
   2780   case CallingConv::C:
   2781   case CallingConv::Win64:
   2782   case CallingConv::X86_64_SysV:
   2783   // Callee pop conventions:
   2784   case CallingConv::X86_ThisCall:
   2785   case CallingConv::X86_StdCall:
   2786   case CallingConv::X86_VectorCall:
   2787   case CallingConv::X86_FastCall:
   2788     return true;
   2789   default:
   2790     return canGuaranteeTCO(CC);
   2791   }
   2792 }
   2793 
   2794 /// Return true if the function is being made into a tailcall target by
   2795 /// changing its ABI.
   2796 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
   2797   return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
   2798 }
   2799 
   2800 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   2801   auto Attr =
   2802       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
   2803   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
   2804     return false;
   2805 
   2806   ImmutableCallSite CS(CI);
   2807   CallingConv::ID CalleeCC = CS.getCallingConv();
   2808   if (!mayTailCallThisCC(CalleeCC))
   2809     return false;
   2810 
   2811   return true;
   2812 }
   2813 
   2814 SDValue
   2815 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
   2816                                     const SmallVectorImpl<ISD::InputArg> &Ins,
   2817                                     const SDLoc &dl, SelectionDAG &DAG,
   2818                                     const CCValAssign &VA,
   2819                                     MachineFrameInfo &MFI, unsigned i) const {
   2820   // Create the nodes corresponding to a load from this parameter slot.
   2821   ISD::ArgFlagsTy Flags = Ins[i].Flags;
   2822   bool AlwaysUseMutable = shouldGuaranteeTCO(
   2823       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
   2824   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   2825   EVT ValVT;
   2826   MVT PtrVT = getPointerTy(DAG.getDataLayout());
   2827 
  // If the value is passed by pointer, the address is passed instead of the
  // value itself. A mask value does not need to be extended in memory if it
  // and its location already have the same size in bits.
   2831   bool ExtendedInMem =
   2832       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
   2833       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
   2834 
   2835   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
   2836     ValVT = VA.getLocVT();
   2837   else
   2838     ValVT = VA.getValVT();
   2839 
  // Calculate the SP offset of an interrupt parameter, re-arranging the slot
  // normally taken by a return address.
   2842   int Offset = 0;
   2843   if (CallConv == CallingConv::X86_INTR) {
    // X86 interrupts may take one or two arguments.
    // Unlike a regular call, there is no return address on the stack.
    // The offset of the last argument must be set to -4/-8 bytes, and the
    // offset of the first argument (when there are two) to 0 bytes.
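    // For example, a 64 bit handler that also receives an error code first
    // computes offsets 0 (for the pointer argument) and -8 (for the error
    // code); the extra 8 byte realignment adjustment below then shifts them
    // to +8 and 0.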
   2848     Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
   2849     if (Subtarget.is64Bit() && Ins.size() == 2) {
   2850       // The stack pointer needs to be realigned for 64 bit handlers with error
   2851       // code, so the argument offset changes by 8 bytes.
   2852       Offset += 8;
   2853     }
   2854   }
   2855 
   2856   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   2857   // changed with more analysis.
  // In the case of tail call optimization, mark all arguments mutable, since
  // they could be overwritten by the lowering of the arguments of a tail call.
   2860   if (Flags.isByVal()) {
   2861     unsigned Bytes = Flags.getByValSize();
   2862     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
   2863 
   2864     // FIXME: For now, all byval parameter objects are marked as aliasing. This
   2865     // can be improved with deeper analysis.
   2866     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
   2867                                    /*isAliased=*/true);
   2868     // Adjust SP offset of interrupt parameter.
   2869     if (CallConv == CallingConv::X86_INTR) {
   2870       MFI.setObjectOffset(FI, Offset);
   2871     }
   2872     return DAG.getFrameIndex(FI, PtrVT);
   2873   }
   2874 
   2875   // This is an argument in memory. We might be able to perform copy elision.
   2876   if (Flags.isCopyElisionCandidate()) {
   2877     EVT ArgVT = Ins[i].ArgVT;
   2878     SDValue PartAddr;
   2879     if (Ins[i].PartOffset == 0) {
   2880       // If this is a one-part value or the first part of a multi-part value,
   2881       // create a stack object for the entire argument value type and return a
   2882       // load from our portion of it. This assumes that if the first part of an
   2883       // argument is in memory, the rest will also be in memory.
   2884       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
   2885                                      /*Immutable=*/false);
   2886       PartAddr = DAG.getFrameIndex(FI, PtrVT);
   2887       return DAG.getLoad(
   2888           ValVT, dl, Chain, PartAddr,
   2889           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
   2890     } else {
   2891       // This is not the first piece of an argument in memory. See if there is
   2892       // already a fixed stack object including this offset. If so, assume it
   2893       // was created by the PartOffset == 0 branch above and create a load from
   2894       // the appropriate offset into it.
   2895       int64_t PartBegin = VA.getLocMemOffset();
   2896       int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
   2897       int FI = MFI.getObjectIndexBegin();
   2898       for (; MFI.isFixedObjectIndex(FI); ++FI) {
   2899         int64_t ObjBegin = MFI.getObjectOffset(FI);
   2900         int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
   2901         if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
   2902           break;
   2903       }
   2904       if (MFI.isFixedObjectIndex(FI)) {
   2905         SDValue Addr =
   2906             DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
   2907                         DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
   2908         return DAG.getLoad(
   2909             ValVT, dl, Chain, Addr,
   2910             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
   2911                                               Ins[i].PartOffset));
   2912       }
   2913     }
   2914   }
   2915 
   2916   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
   2917                                  VA.getLocMemOffset(), isImmutable);
   2918 
   2919   // Set SExt or ZExt flag.
   2920   if (VA.getLocInfo() == CCValAssign::ZExt) {
   2921     MFI.setObjectZExt(FI, true);
   2922   } else if (VA.getLocInfo() == CCValAssign::SExt) {
   2923     MFI.setObjectSExt(FI, true);
   2924   }
   2925 
   2926   // Adjust SP offset of interrupt parameter.
   2927   if (CallConv == CallingConv::X86_INTR) {
   2928     MFI.setObjectOffset(FI, Offset);
   2929   }
   2930 
   2931   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
   2932   SDValue Val = DAG.getLoad(
   2933       ValVT, dl, Chain, FIN,
   2934       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
   2935   return ExtendedInMem
   2936              ? (VA.getValVT().isVector()
   2937                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
   2938                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
   2939              : Val;
   2940 }
   2941 
   2942 // FIXME: Get this from tablegen.
   2943 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
   2944                                                 const X86Subtarget &Subtarget) {
   2945   assert(Subtarget.is64Bit());
   2946 
   2947   if (Subtarget.isCallingConvWin64(CallConv)) {
   2948     static const MCPhysReg GPR64ArgRegsWin64[] = {
   2949       X86::RCX, X86::RDX, X86::R8,  X86::R9
   2950     };
   2951     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
   2952   }
   2953 
   2954   static const MCPhysReg GPR64ArgRegs64Bit[] = {
   2955     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
   2956   };
   2957   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
   2958 }
   2959 
   2960 // FIXME: Get this from tablegen.
   2961 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
   2962                                                 CallingConv::ID CallConv,
   2963                                                 const X86Subtarget &Subtarget) {
   2964   assert(Subtarget.is64Bit());
   2965   if (Subtarget.isCallingConvWin64(CallConv)) {
   2966     // The XMM registers which might contain var arg parameters are shadowed
    // in their paired GPRs. So we only need to save the GPRs to their home
    // slots.
   2969     // TODO: __vectorcall will change this.
   2970     return None;
   2971   }
   2972 
   2973   const Function &F = MF.getFunction();
   2974   bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
   2975   bool isSoftFloat = Subtarget.useSoftFloat();
   2976   assert(!(isSoftFloat && NoImplicitFloatOps) &&
   2977          "SSE register cannot be used when SSE is disabled!");
   2978   if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
   2979     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
   2980     // registers.
   2981     return None;
   2982 
   2983   static const MCPhysReg XMMArgRegs64Bit[] = {
   2984     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   2985     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   2986   };
   2987   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
   2988 }
   2989 
   2990 #ifndef NDEBUG
   2991 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
   2992   return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
   2993                         [](const CCValAssign &A, const CCValAssign &B) -> bool {
   2994                           return A.getValNo() < B.getValNo();
   2995                         });
   2996 }
   2997 #endif
   2998 
   2999 SDValue X86TargetLowering::LowerFormalArguments(
   3000     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
   3001     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
   3002     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   3003   MachineFunction &MF = DAG.getMachineFunction();
   3004   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   3005   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   3006 
   3007   const Function &F = MF.getFunction();
   3008   if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
   3009       F.getName() == "main")
   3010     FuncInfo->setForceFramePointer(true);
   3011 
   3012   MachineFrameInfo &MFI = MF.getFrameInfo();
   3013   bool Is64Bit = Subtarget.is64Bit();
   3014   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
   3015 
   3016   assert(
   3017       !(isVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conventions regcall, fastcc, ghc or hipe");
   3019 
   3020   if (CallConv == CallingConv::X86_INTR) {
   3021     bool isLegal = Ins.size() == 1 ||
   3022                    (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
   3023                                         (!Is64Bit && Ins[1].VT == MVT::i32)));
   3024     if (!isLegal)
   3025       report_fatal_error("X86 interrupts may take one or two arguments");
   3026   }
   3027 
   3028   // Assign locations to all of the incoming arguments.
   3029   SmallVector<CCValAssign, 16> ArgLocs;
   3030   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
   3031 
   3032   // Allocate shadow area for Win64.
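  // (The Win64 ABI reserves 32 bytes of home space for the four register
  // parameters, which is what the 32 byte allocation below accounts for.)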
   3033   if (IsWin64)
   3034     CCInfo.AllocateStack(32, 8);
   3035 
   3036   CCInfo.AnalyzeArguments(Ins, CC_X86);
   3037 
  // In the vectorcall calling convention, a second pass is required for HVA
  // types.
   3040   if (CallingConv::X86_VectorCall == CallConv) {
   3041     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
   3042   }
   3043 
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
   3046   assert(isSortedByValueNo(ArgLocs) &&
   3047          "Argument Location list must be sorted before lowering");
   3048 
   3049   SDValue ArgValue;
   3050   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
   3051        ++I, ++InsIndex) {
   3052     assert(InsIndex < Ins.size() && "Invalid Ins index");
   3053     CCValAssign &VA = ArgLocs[I];
   3054 
   3055     if (VA.isRegLoc()) {
   3056       EVT RegVT = VA.getLocVT();
   3057       if (VA.needsCustom()) {
   3058         assert(
   3059             VA.getValVT() == MVT::v64i1 &&
   3060             "Currently the only custom case is when we split v64i1 to 2 regs");
   3061 
        // In the regcall calling convention on 32 bit targets, v64i1 values
        // are split up into two registers.
   3064         ArgValue =
   3065             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
   3066       } else {
   3067         const TargetRegisterClass *RC;
   3068         if (RegVT == MVT::i8)
   3069           RC = &X86::GR8RegClass;
   3070         else if (RegVT == MVT::i16)
   3071           RC = &X86::GR16RegClass;
   3072         else if (RegVT == MVT::i32)
   3073           RC = &X86::GR32RegClass;
   3074         else if (Is64Bit && RegVT == MVT::i64)
   3075           RC = &X86::GR64RegClass;
   3076         else if (RegVT == MVT::f32)
   3077           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
   3078         else if (RegVT == MVT::f64)
   3079           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
   3080         else if (RegVT == MVT::f80)
   3081           RC = &X86::RFP80RegClass;
   3082         else if (RegVT == MVT::f128)
   3083           RC = &X86::VR128RegClass;
   3084         else if (RegVT.is512BitVector())
   3085           RC = &X86::VR512RegClass;
   3086         else if (RegVT.is256BitVector())
   3087           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
   3088         else if (RegVT.is128BitVector())
   3089           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
   3090         else if (RegVT == MVT::x86mmx)
   3091           RC = &X86::VR64RegClass;
   3092         else if (RegVT == MVT::v1i1)
   3093           RC = &X86::VK1RegClass;
   3094         else if (RegVT == MVT::v8i1)
   3095           RC = &X86::VK8RegClass;
   3096         else if (RegVT == MVT::v16i1)
   3097           RC = &X86::VK16RegClass;
   3098         else if (RegVT == MVT::v32i1)
   3099           RC = &X86::VK32RegClass;
   3100         else if (RegVT == MVT::v64i1)
   3101           RC = &X86::VK64RegClass;
   3102         else
   3103           llvm_unreachable("Unknown argument type!");
   3104 
   3105         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
   3106         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
   3107       }
   3108 
   3109       // If this is an 8 or 16-bit value, it is really passed promoted to 32
   3110       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
   3111       // right size.
   3112       if (VA.getLocInfo() == CCValAssign::SExt)
   3113         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
   3114                                DAG.getValueType(VA.getValVT()));
   3115       else if (VA.getLocInfo() == CCValAssign::ZExt)
   3116         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
   3117                                DAG.getValueType(VA.getValVT()));
   3118       else if (VA.getLocInfo() == CCValAssign::BCvt)
   3119         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
   3120 
   3121       if (VA.isExtInLoc()) {
   3122         // Handle MMX values passed in XMM regs.
   3123         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
   3124           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
   3125         else if (VA.getValVT().isVector() &&
   3126                  VA.getValVT().getScalarType() == MVT::i1 &&
   3127                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
   3128                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
   3129           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
   3130           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
   3131         } else
   3132           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
   3133       }
   3134     } else {
   3135       assert(VA.isMemLoc());
   3136       ArgValue =
   3137           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
   3138     }
   3139 
    // If the value is passed via a pointer, do a load.
   3141     if (VA.getLocInfo() == CCValAssign::Indirect)
   3142       ArgValue =
   3143           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
   3144 
   3145     InVals.push_back(ArgValue);
   3146   }
   3147 
   3148   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    // The Swift calling convention does not require us to copy the sret argument
   3150     // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
   3151     if (CallConv == CallingConv::Swift)
   3152       continue;
   3153 
   3154     // All x86 ABIs require that for returning structs by value we copy the
   3155     // sret argument into %rax/%eax (depending on ABI) for the return. Save
   3156     // the argument into a virtual register so that we can access it from the
   3157     // return points.
   3158     if (Ins[I].Flags.isSRet()) {
   3159       unsigned Reg = FuncInfo->getSRetReturnReg();
   3160       if (!Reg) {
   3161         MVT PtrTy = getPointerTy(DAG.getDataLayout());
   3162         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
   3163         FuncInfo->setSRetReturnReg(Reg);
   3164       }
   3165       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
   3166       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   3167       break;
   3168     }
   3169   }
   3170 
   3171   unsigned StackSize = CCInfo.getNextStackOffset();
   3172   // Align stack specially for tail calls.
   3173   if (shouldGuaranteeTCO(CallConv,
   3174                          MF.getTarget().Options.GuaranteedTailCallOpt))
   3175     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
   3176 
  // If the function takes a variable number of arguments, make a frame index for
   3178   // the start of the first vararg value... for expansion of llvm.va_start. We
   3179   // can skip this if there are no va_start calls.
   3180   if (MFI.hasVAStart() &&
   3181       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
   3182                    CallConv != CallingConv::X86_ThisCall))) {
   3183     FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
   3184   }
   3185 
   3186   // Figure out if XMM registers are in use.
   3187   assert(!(Subtarget.useSoftFloat() &&
   3188            F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
   3189          "SSE register cannot be used when SSE is disabled!");
   3190 
   3191   // 64-bit calling conventions support varargs and register parameters, so we
   3192   // have to do extra work to spill them in the prologue.
   3193   if (Is64Bit && isVarArg && MFI.hasVAStart()) {
   3194     // Find the first unallocated argument registers.
   3195     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
   3196     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
   3197     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
   3198     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
   3199     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
   3200            "SSE register cannot be used when SSE is disabled!");
   3201 
   3202     // Gather all the live in physical registers.
   3203     SmallVector<SDValue, 6> LiveGPRs;
   3204     SmallVector<SDValue, 8> LiveXMMRegs;
   3205     SDValue ALVal;
   3206     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
   3207       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
   3208       LiveGPRs.push_back(
   3209           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
   3210     }
   3211     if (!ArgXMMs.empty()) {
   3212       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   3213       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
   3214       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
   3215         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
   3216         LiveXMMRegs.push_back(
   3217             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
   3218       }
   3219     }
   3220 
   3221     if (IsWin64) {
   3222       // Get to the caller-allocated home save location.  Add 8 to account
   3223       // for the return address.
   3224       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
   3225       FuncInfo->setRegSaveFrameIndex(
   3226           MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
   3227       // Fixup to set vararg frame on shadow area (4 x i64).
   3228       if (NumIntRegs < 4)
   3229         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
   3230     } else {
   3231       // For X86-64, if there are vararg parameters that are passed via
   3232       // registers, then we must store them to their spots on the stack so
   3233       // they may be loaded by dereferencing the result of va_next.
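      // With the full SysV register set this save area is 6 GPRs * 8 bytes
      // plus 8 XMM registers * 16 bytes, i.e. 48 + 128 = 176 bytes.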
   3234       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
   3235       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
   3236       FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
   3237           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
   3238     }
   3239 
   3240     // Store the integer parameter registers.
   3241     SmallVector<SDValue, 8> MemOps;
   3242     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
   3243                                       getPointerTy(DAG.getDataLayout()));
   3244     unsigned Offset = FuncInfo->getVarArgsGPOffset();
   3245     for (SDValue Val : LiveGPRs) {
   3246       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
   3247                                 RSFIN, DAG.getIntPtrConstant(Offset, dl));
   3248       SDValue Store =
   3249           DAG.getStore(Val.getValue(1), dl, Val, FIN,
   3250                        MachinePointerInfo::getFixedStack(
   3251                            DAG.getMachineFunction(),
   3252                            FuncInfo->getRegSaveFrameIndex(), Offset));
   3253       MemOps.push_back(Store);
   3254       Offset += 8;
   3255     }
   3256 
   3257     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
   3258       // Now store the XMM (fp + vector) parameter registers.
   3259       SmallVector<SDValue, 12> SaveXMMOps;
   3260       SaveXMMOps.push_back(Chain);
   3261       SaveXMMOps.push_back(ALVal);
   3262       SaveXMMOps.push_back(DAG.getIntPtrConstant(
   3263                              FuncInfo->getRegSaveFrameIndex(), dl));
   3264       SaveXMMOps.push_back(DAG.getIntPtrConstant(
   3265                              FuncInfo->getVarArgsFPOffset(), dl));
   3266       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
   3267                         LiveXMMRegs.end());
   3268       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
   3269                                    MVT::Other, SaveXMMOps));
   3270     }
   3271 
   3272     if (!MemOps.empty())
   3273       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
   3274   }
   3275 
   3276   if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
   3277     // Find the largest legal vector type.
   3278     MVT VecVT = MVT::Other;
   3279     // FIXME: Only some x86_32 calling conventions support AVX512.
   3280     if (Subtarget.hasAVX512() &&
   3281         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
   3282                      CallConv == CallingConv::Intel_OCL_BI)))
   3283       VecVT = MVT::v16f32;
   3284     else if (Subtarget.hasAVX())
   3285       VecVT = MVT::v8f32;
   3286     else if (Subtarget.hasSSE2())
   3287       VecVT = MVT::v4f32;
   3288 
   3289     // We forward some GPRs and some vector types.
   3290     SmallVector<MVT, 2> RegParmTypes;
   3291     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
   3292     RegParmTypes.push_back(IntVT);
   3293     if (VecVT != MVT::Other)
   3294       RegParmTypes.push_back(VecVT);
   3295 
   3296     // Compute the set of forwarded registers. The rest are scratch.
   3297     SmallVectorImpl<ForwardedRegister> &Forwards =
   3298         FuncInfo->getForwardedMustTailRegParms();
   3299     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
   3300 
   3301     // Conservatively forward AL on x86_64, since it might be used for varargs.
   3302     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
   3303       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
   3304       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
   3305     }
   3306 
   3307     // Copy all forwards from physical to virtual registers.
   3308     for (ForwardedRegister &F : Forwards) {
   3309       // FIXME: Can we use a less constrained schedule?
   3310       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
   3311       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
   3312       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
   3313     }
   3314   }
   3315 
   3316   // Some CCs need callee pop.
   3317   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   3318                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
   3319     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   3320   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
   3321     // X86 interrupts must pop the error code (and the alignment padding) if
   3322     // present.
   3323     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
   3324   } else {
   3325     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
   3326     // If this is an sret function, the return should pop the hidden pointer.
   3327     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
   3328         !Subtarget.getTargetTriple().isOSMSVCRT() &&
   3329         argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
   3330       FuncInfo->setBytesToPopOnReturn(4);
   3331   }
   3332 
   3333   if (!Is64Bit) {
   3334     // RegSaveFrameIndex is X86-64 only.
   3335     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
   3336     if (CallConv == CallingConv::X86_FastCall ||
   3337         CallConv == CallingConv::X86_ThisCall)
    3338       // fastcall and thiscall functions can't have varargs.
   3339       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
   3340   }
   3341 
   3342   FuncInfo->setArgumentStackSize(StackSize);
   3343 
   3344   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
   3345     EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
   3346     if (Personality == EHPersonality::CoreCLR) {
   3347       assert(Is64Bit);
   3348       // TODO: Add a mechanism to frame lowering that will allow us to indicate
   3349       // that we'd prefer this slot be allocated towards the bottom of the frame
   3350       // (i.e. near the stack pointer after allocating the frame).  Every
   3351       // funclet needs a copy of this slot in its (mostly empty) frame, and the
   3352       // offset from the bottom of this and each funclet's frame must be the
   3353       // same, so the size of funclets' (mostly empty) frames is dictated by
   3354       // how far this slot is from the bottom (since they allocate just enough
   3355       // space to accommodate holding this slot at the correct offset).
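               // The PSPSym slot is pointer-sized (8 bytes) and 8-byte aligned; this
               // lowering only supports CoreCLR funclet EH in 64-bit mode (see the
               // assert above).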
   3356       int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
   3357       EHInfo->PSPSymFrameIdx = PSPSymFI;
   3358     }
   3359   }
   3360 
   3361   if (CallConv == CallingConv::X86_RegCall ||
   3362       F.hasFnAttribute("no_caller_saved_registers")) {
   3363     MachineRegisterInfo &MRI = MF.getRegInfo();
   3364     for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
   3365       MRI.disableCalleeSavedRegister(Pair.first);
   3366   }
   3367 
   3368   return Chain;
   3369 }
   3370 
   3371 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
   3372                                             SDValue Arg, const SDLoc &dl,
   3373                                             SelectionDAG &DAG,
   3374                                             const CCValAssign &VA,
   3375                                             ISD::ArgFlagsTy Flags) const {
   3376   unsigned LocMemOffset = VA.getLocMemOffset();
   3377   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
   3378   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
   3379                        StackPtr, PtrOff);
   3380   if (Flags.isByVal())
   3381     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
   3382 
   3383   return DAG.getStore(
   3384       Chain, dl, Arg, PtrOff,
   3385       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
   3386 }
   3387 
    3388 /// Emit a load of the return address if tail call
   3389 /// optimization is performed and it is required.
   3390 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
   3391     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
   3392     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
   3393   // Adjust the Return address stack slot.
   3394   EVT VT = getPointerTy(DAG.getDataLayout());
   3395   OutRetAddr = getReturnAddressFrameIndex(DAG);
   3396 
   3397   // Load the "old" Return address.
   3398   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
   3399   return SDValue(OutRetAddr.getNode(), 1);
   3400 }
   3401 
   3402 /// Emit a store of the return address if tail call
   3403 /// optimization is performed and it is required (FPDiff!=0).
   3404 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
   3405                                         SDValue Chain, SDValue RetAddrFrIdx,
   3406                                         EVT PtrVT, unsigned SlotSize,
   3407                                         int FPDiff, const SDLoc &dl) {
   3408   // Store the return address to the appropriate stack slot.
   3409   if (!FPDiff) return Chain;
   3410   // Calculate the new stack slot for the return address.
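           // The new slot sits FPDiff bytes away from the old return-address slot
           // (which getReturnAddressFrameIndex places at offset -SlotSize).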
   3411   int NewReturnAddrFI =
   3412     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
   3413                                          false);
   3414   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   3415   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
   3416                        MachinePointerInfo::getFixedStack(
   3417                            DAG.getMachineFunction(), NewReturnAddrFI));
   3418   return Chain;
   3419 }
   3420 
    3421 /// Returns a vector_shuffle mask for a movs{s|d} or movd
    3422 /// operation of the specified width.
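         /// For example, for a 4-element type the mask is <4, 1, 2, 3>: element 0 is
         /// taken from V2 and the remaining elements come from V1.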
   3423 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
   3424                        SDValue V2) {
   3425   unsigned NumElems = VT.getVectorNumElements();
   3426   SmallVector<int, 8> Mask;
   3427   Mask.push_back(NumElems);
   3428   for (unsigned i = 1; i != NumElems; ++i)
   3429     Mask.push_back(i);
   3430   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
   3431 }
   3432 
   3433 SDValue
   3434 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   3435                              SmallVectorImpl<SDValue> &InVals) const {
   3436   SelectionDAG &DAG                     = CLI.DAG;
   3437   SDLoc &dl                             = CLI.DL;
   3438   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   3439   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
   3440   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   3441   SDValue Chain                         = CLI.Chain;
   3442   SDValue Callee                        = CLI.Callee;
   3443   CallingConv::ID CallConv              = CLI.CallConv;
   3444   bool &isTailCall                      = CLI.IsTailCall;
   3445   bool isVarArg                         = CLI.IsVarArg;
   3446 
   3447   MachineFunction &MF = DAG.getMachineFunction();
   3448   bool Is64Bit        = Subtarget.is64Bit();
   3449   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
   3450   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
   3451   bool IsSibcall      = false;
   3452   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   3453   auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
   3454   const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
   3455   const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
   3456   bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
   3457                  (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
   3458   const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
   3459   bool HasNoCfCheck =
   3460       (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
   3461   const Module *M = MF.getMMI().getModule();
   3462   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
   3463 
   3464   if (CallConv == CallingConv::X86_INTR)
   3465     report_fatal_error("X86 interrupts may not be called directly");
   3466 
   3467   if (Attr.getValueAsString() == "true")
   3468     isTailCall = false;
   3469 
   3470   if (Subtarget.isPICStyleGOT() &&
   3471       !MF.getTarget().Options.GuaranteedTailCallOpt) {
   3472     // If we are using a GOT, disable tail calls to external symbols with
   3473     // default visibility. Tail calling such a symbol requires using a GOT
   3474     // relocation, which forces early binding of the symbol. This breaks code
   3475     // that require lazy function symbol resolution. Using musttail or
   3476     // GuaranteedTailCallOpt will override this.
   3477     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   3478     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
   3479                G->getGlobal()->hasDefaultVisibility()))
   3480       isTailCall = false;
   3481   }
   3482 
   3483   bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
   3484   if (IsMustTail) {
   3485     // Force this to be a tail call.  The verifier rules are enough to ensure
   3486     // that we can lower this successfully without moving the return address
   3487     // around.
   3488     isTailCall = true;
   3489   } else if (isTailCall) {
   3490     // Check if it's really possible to do a tail call.
   3491     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
   3492                     isVarArg, SR != NotStructReturn,
   3493                     MF.getFunction().hasStructRetAttr(), CLI.RetTy,
   3494                     Outs, OutVals, Ins, DAG);
   3495 
   3496     // Sibcalls are automatically detected tailcalls which do not require
   3497     // ABI changes.
   3498     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
   3499       IsSibcall = true;
   3500 
   3501     if (isTailCall)
   3502       ++NumTailCalls;
   3503   }
   3504 
   3505   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
   3506          "Var args not supported with calling convention fastcc, ghc or hipe");
   3507 
   3508   // Analyze operands of the call, assigning locations to each operand.
   3509   SmallVector<CCValAssign, 16> ArgLocs;
   3510   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
   3511 
   3512   // Allocate shadow area for Win64.
   3513   if (IsWin64)
   3514     CCInfo.AllocateStack(32, 8);
   3515 
   3516   CCInfo.AnalyzeArguments(Outs, CC_X86);
   3517 
   3518   // In vectorcall calling convention a second pass is required for the HVA
   3519   // types.
   3520   if (CallingConv::X86_VectorCall == CallConv) {
   3521     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
   3522   }
   3523 
   3524   // Get a count of how many bytes are to be pushed on the stack.
   3525   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
   3526   if (IsSibcall)
    3527     // This is a sibcall. The memory operands are already available in the
    3528     // caller's own incoming argument stack area.
   3529     NumBytes = 0;
   3530   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
   3531            canGuaranteeTCO(CallConv))
   3532     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
   3533 
   3534   int FPDiff = 0;
   3535   if (isTailCall && !IsSibcall && !IsMustTail) {
   3536     // Lower arguments at fp - stackoffset + fpdiff.
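             // FPDiff is the (possibly negative) difference between the bytes the
             // caller will pop on return and the bytes this call pushes; a negative
             // value means the callee needs more argument space than the caller's
             // frame provides.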
   3537     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
   3538 
   3539     FPDiff = NumBytesCallerPushed - NumBytes;
   3540 
    3541     // Set the delta of movement of the returnaddr stackslot, but only if the
    3542     // new delta is more negative (a larger movement) than the previous one.
   3543     if (FPDiff < X86Info->getTCReturnAddrDelta())
   3544       X86Info->setTCReturnAddrDelta(FPDiff);
   3545   }
   3546 
   3547   unsigned NumBytesToPush = NumBytes;
   3548   unsigned NumBytesToPop = NumBytes;
   3549 
   3550   // If we have an inalloca argument, all stack space has already been allocated
    3551   // for us and is right at the top of the stack.  We don't support multiple
   3552   // arguments passed in memory when using inalloca.
   3553   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
   3554     NumBytesToPush = 0;
   3555     if (!ArgLocs.back().isMemLoc())
   3556       report_fatal_error("cannot use inalloca attribute on a register "
   3557                          "parameter");
   3558     if (ArgLocs.back().getLocMemOffset() != 0)
   3559       report_fatal_error("any parameter with the inalloca attribute must be "
   3560                          "the only memory argument");
   3561   }
   3562 
   3563   if (!IsSibcall)
   3564     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
   3565                                  NumBytes - NumBytesToPush, dl);
   3566 
   3567   SDValue RetAddrFrIdx;
   3568   // Load return address for tail calls.
   3569   if (isTailCall && FPDiff)
   3570     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
   3571                                     Is64Bit, FPDiff, dl);
   3572 
   3573   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   3574   SmallVector<SDValue, 8> MemOpChains;
   3575   SDValue StackPtr;
   3576 
    3577   // The next loop assumes that the locations are in the same order as the
   3578   // input arguments.
   3579   assert(isSortedByValueNo(ArgLocs) &&
   3580          "Argument Location list must be sorted before lowering");
   3581 
   3582   // Walk the register/memloc assignments, inserting copies/loads.  In the case
    3583   // of tail call optimization, arguments are handled later.
   3584   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   3585   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
   3586        ++I, ++OutIndex) {
   3587     assert(OutIndex < Outs.size() && "Invalid Out index");
   3588     // Skip inalloca arguments, they have already been written.
   3589     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
   3590     if (Flags.isInAlloca())
   3591       continue;
   3592 
   3593     CCValAssign &VA = ArgLocs[I];
   3594     EVT RegVT = VA.getLocVT();
   3595     SDValue Arg = OutVals[OutIndex];
   3596     bool isByVal = Flags.isByVal();
   3597 
   3598     // Promote the value if needed.
   3599     switch (VA.getLocInfo()) {
   3600     default: llvm_unreachable("Unknown loc info!");
   3601     case CCValAssign::Full: break;
   3602     case CCValAssign::SExt:
   3603       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
   3604       break;
   3605     case CCValAssign::ZExt:
   3606       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
   3607       break;
   3608     case CCValAssign::AExt:
   3609       if (Arg.getValueType().isVector() &&
   3610           Arg.getValueType().getVectorElementType() == MVT::i1)
   3611         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
   3612       else if (RegVT.is128BitVector()) {
   3613         // Special case: passing MMX values in XMM registers.
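                 // The i64 value is placed in the low quadword of a v2i64 via
                 // SCALAR_TO_VECTOR and a MOVL-style shuffle; the high quadword is undef.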
   3614         Arg = DAG.getBitcast(MVT::i64, Arg);
   3615         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
   3616         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
   3617       } else
   3618         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
   3619       break;
   3620     case CCValAssign::BCvt:
   3621       Arg = DAG.getBitcast(RegVT, Arg);
   3622       break;
   3623     case CCValAssign::Indirect: {
   3624       // Store the argument.
   3625       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
   3626       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
   3627       Chain = DAG.getStore(
   3628           Chain, dl, Arg, SpillSlot,
   3629           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
   3630       Arg = SpillSlot;
   3631       break;
   3632     }
   3633     }
   3634 
   3635     if (VA.needsCustom()) {
   3636       assert(VA.getValVT() == MVT::v64i1 &&
   3637              "Currently the only custom case is when we split v64i1 to 2 regs");
   3638       // Split v64i1 value into two registers
   3639       Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
   3640                          Subtarget);
   3641     } else if (VA.isRegLoc()) {
   3642       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
   3643       if (isVarArg && IsWin64) {
    3644         // The Win64 ABI requires an XMM argument register to also be copied to
    3645         // its corresponding integer shadow register if the callee is varargs.
   3646         unsigned ShadowReg = 0;
   3647         switch (VA.getLocReg()) {
   3648         case X86::XMM0: ShadowReg = X86::RCX; break;
   3649         case X86::XMM1: ShadowReg = X86::RDX; break;
   3650         case X86::XMM2: ShadowReg = X86::R8; break;
   3651         case X86::XMM3: ShadowReg = X86::R9; break;
   3652         }
   3653         if (ShadowReg)
   3654           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
   3655       }
   3656     } else if (!IsSibcall && (!isTailCall || isByVal)) {
   3657       assert(VA.isMemLoc());
   3658       if (!StackPtr.getNode())
   3659         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   3660                                       getPointerTy(DAG.getDataLayout()));
   3661       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
   3662                                              dl, DAG, VA, Flags));
   3663     }
   3664   }
   3665 
   3666   if (!MemOpChains.empty())
   3667     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
   3668 
   3669   if (Subtarget.isPICStyleGOT()) {
    3670     // ELF / PIC requires the GOT pointer to be in the EBX register before
    3671     // making function calls via the PLT.
   3672     if (!isTailCall) {
   3673       RegsToPass.push_back(std::make_pair(
   3674           unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
   3675                                           getPointerTy(DAG.getDataLayout()))));
   3676     } else {
    3677       // If we are tail calling and generating PIC/GOT style code, load the
    3678       // address of the callee into ECX. The value in ECX is used as the target of
   3679       // the tail jump. This is done to circumvent the ebx/callee-saved problem
   3680       // for tail calls on PIC/GOT architectures. Normally we would just put the
   3681       // address of GOT into ebx and then call target@PLT. But for tail calls
   3682       // ebx would be restored (since ebx is callee saved) before jumping to the
   3683       // target@PLT.
   3684 
   3685       // Note: The actual moving to ECX is done further down.
   3686       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
   3687       if (G && !G->getGlobal()->hasLocalLinkage() &&
   3688           G->getGlobal()->hasDefaultVisibility())
   3689         Callee = LowerGlobalAddress(Callee, DAG);
   3690       else if (isa<ExternalSymbolSDNode>(Callee))
   3691         Callee = LowerExternalSymbol(Callee, DAG);
   3692     }
   3693   }
   3694 
   3695   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
   3696     // From AMD64 ABI document:
   3697     // For calls that may call functions that use varargs or stdargs
   3698     // (prototype-less calls or calls to functions containing ellipsis (...) in
    3699     // the declaration) %al is used as a hidden argument to specify the number
    3700     // of SSE registers used. The contents of %al do not need to match exactly
    3701     // the number of registers, but must be an upper bound on the number of SSE
    3702     // registers used and must be in the range 0 - 8 inclusive.
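             // For example, for a call such as printf("%f\n", X) with a single double
             // passed in XMM0, NumXMMRegs below is 1 and %al is set to 1.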
   3703 
   3704     // Count the number of XMM registers allocated.
   3705     static const MCPhysReg XMMArgRegs[] = {
   3706       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
   3707       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   3708     };
   3709     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
   3710     assert((Subtarget.hasSSE1() || !NumXMMRegs)
   3711            && "SSE registers cannot be used when SSE is disabled");
   3712 
   3713     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
   3714                                         DAG.getConstant(NumXMMRegs, dl,
   3715                                                         MVT::i8)));
   3716   }
   3717 
   3718   if (isVarArg && IsMustTail) {
   3719     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
   3720     for (const auto &F : Forwards) {
   3721       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
   3722       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
   3723     }
   3724   }
   3725 
   3726   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
   3727   // don't need this because the eligibility check rejects calls that require
   3728   // shuffling arguments passed in memory.
   3729   if (!IsSibcall && isTailCall) {
   3730     // Force all the incoming stack arguments to be loaded from the stack
   3731     // before any new outgoing arguments are stored to the stack, because the
   3732     // outgoing stack slots may alias the incoming argument stack slots, and
   3733     // the alias isn't otherwise explicit. This is slightly more conservative
   3734     // than necessary, because it means that each store effectively depends
   3735     // on every argument instead of just those arguments it would clobber.
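             // getStackArgumentTokenFactor returns a token that also depends on the
             // loads of the incoming stack arguments, so the stores below are ordered
             // after those loads.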
   3736     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
   3737 
   3738     SmallVector<SDValue, 8> MemOpChains2;
   3739     SDValue FIN;
   3740     int FI = 0;
   3741     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
   3742          ++I, ++OutsIndex) {
   3743       CCValAssign &VA = ArgLocs[I];
   3744 
   3745       if (VA.isRegLoc()) {
   3746         if (VA.needsCustom()) {
   3747           assert((CallConv == CallingConv::X86_RegCall) &&
   3748                  "Expecting custom case only in regcall calling convention");
    3749           // This means that we are in the special case where one argument was
    3750           // passed through two register locations, so skip the next location.
   3751           ++I;
   3752         }
   3753 
   3754         continue;
   3755       }
   3756 
   3757       assert(VA.isMemLoc());
   3758       SDValue Arg = OutVals[OutsIndex];
   3759       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
   3760       // Skip inalloca arguments.  They don't require any work.
   3761       if (Flags.isInAlloca())
   3762         continue;
   3763       // Create frame index.
   3764       int32_t Offset = VA.getLocMemOffset()+FPDiff;
   3765       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
   3766       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
   3767       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
   3768 
   3769       if (Flags.isByVal()) {
   3770         // Copy relative to framepointer.
   3771         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
   3772         if (!StackPtr.getNode())
   3773           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
   3774                                         getPointerTy(DAG.getDataLayout()));
   3775         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
   3776                              StackPtr, Source);
   3777 
   3778         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
   3779                                                          ArgChain,
   3780                                                          Flags, DAG, dl));
   3781       } else {
   3782         // Store relative to framepointer.
   3783         MemOpChains2.push_back(DAG.getStore(
   3784             ArgChain, dl, Arg, FIN,
   3785             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
   3786       }
   3787     }
   3788 
   3789     if (!MemOpChains2.empty())
   3790       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
   3791 
   3792     // Store the return address to the appropriate stack slot.
   3793     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
   3794                                      getPointerTy(DAG.getDataLayout()),
   3795                                      RegInfo->getSlotSize(), FPDiff, dl);
   3796   }
   3797 
   3798   // Build a sequence of copy-to-reg nodes chained together with token chain
   3799   // and flag operands which copy the outgoing args into registers.
   3800   SDValue InFlag;
   3801   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
   3802     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
   3803                              RegsToPass[i].second, InFlag);
   3804     InFlag = Chain.getValue(1);
   3805   }
   3806 
   3807   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
   3808     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
   3809     // In the 64-bit large code model, we have to make all calls
   3810     // through a register, since the call instruction's 32-bit
   3811     // pc-relative offset may not be large enough to hold the whole
   3812     // address.
   3813   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
   3814     // If the callee is a GlobalAddress node (quite common, every direct call
   3815     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
   3816     // it.
   3817     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
   3818 
   3819     // We should use extra load for direct calls to dllimported functions in
   3820     // non-JIT mode.
   3821     const GlobalValue *GV = G->getGlobal();
   3822     if (!GV->hasDLLImportStorageClass()) {
   3823       unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
   3824 
   3825       Callee = DAG.getTargetGlobalAddress(
   3826           GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
   3827 
   3828       if (OpFlags == X86II::MO_GOTPCREL) {
   3829         // Add a wrapper.
   3830         Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
   3831           getPointerTy(DAG.getDataLayout()), Callee);
   3832         // Add extra indirection
   3833         Callee = DAG.getLoad(
   3834             getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
   3835             MachinePointerInfo::getGOT(DAG.getMachineFunction()));
   3836       }
   3837     }
   3838   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
   3839     const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
   3840     unsigned char OpFlags =
   3841         Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
   3842 
   3843     Callee = DAG.getTargetExternalSymbol(
   3844         S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
   3845 
   3846     if (OpFlags == X86II::MO_GOTPCREL) {
   3847       Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
   3848           getPointerTy(DAG.getDataLayout()), Callee);
   3849       Callee = DAG.getLoad(
   3850           getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
   3851           MachinePointerInfo::getGOT(DAG.getMachineFunction()));
   3852     }
   3853   } else if (Subtarget.isTarget64BitILP32() &&
   3854              Callee->getValueType(0) == MVT::i32) {
    3855     // Zero-extend the 32-bit Callee address to 64 bits, as required by the x32 ABI.
   3856     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
   3857   }
   3858 
   3859   // Returns a chain & a flag for retval copy to use.
   3860   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   3861   SmallVector<SDValue, 8> Ops;
   3862 
   3863   if (!IsSibcall && isTailCall) {
   3864     Chain = DAG.getCALLSEQ_END(Chain,
   3865                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
   3866                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
   3867     InFlag = Chain.getValue(1);
   3868   }
   3869 
   3870   Ops.push_back(Chain);
   3871   Ops.push_back(Callee);
   3872 
   3873   if (isTailCall)
   3874     Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
   3875 
   3876   // Add argument registers to the end of the list so that they are known live
   3877   // into the call.
   3878   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
   3879     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
   3880                                   RegsToPass[i].second.getValueType()));
   3881 
   3882   // Add a register mask operand representing the call-preserved registers.
    3883   // If HasNCSR is asserted (the NoCallerSavedRegisters attribute is present),
    3884   // then we use the X86_INTR calling convention because it has the same CSR
    3885   // mask (same preserved registers).
   3886   const uint32_t *Mask = RegInfo->getCallPreservedMask(
   3887       MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
   3888   assert(Mask && "Missing call preserved mask for calling convention");
   3889 
   3890   // If this is an invoke in a 32-bit function using a funclet-based
   3891   // personality, assume the function clobbers all registers. If an exception
   3892   // is thrown, the runtime will not restore CSRs.
   3893   // FIXME: Model this more precisely so that we can register allocate across
   3894   // the normal edge and spill and fill across the exceptional edge.
   3895   if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
   3896     const Function &CallerFn = MF.getFunction();
   3897     EHPersonality Pers =
   3898         CallerFn.hasPersonalityFn()
   3899             ? classifyEHPersonality(CallerFn.getPersonalityFn())
   3900             : EHPersonality::Unknown;
   3901     if (isFuncletEHPersonality(Pers))
   3902       Mask = RegInfo->getNoPreservedMask();
   3903   }
   3904 
   3905   // Define a new register mask from the existing mask.
   3906   uint32_t *RegMask = nullptr;
   3907 
   3908   // In some calling conventions we need to remove the used physical registers
   3909   // from the reg mask.
   3910   if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
   3911     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   3912 
   3913     // Allocate a new Reg Mask and copy Mask.
   3914     RegMask = MF.allocateRegMask();
   3915     unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
   3916     memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
   3917 
   3918     // Make sure all sub registers of the argument registers are reset
   3919     // in the RegMask.
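             // Each register R corresponds to bit (R % 32) of the 32-bit word
             // RegMask[R / 32]; clearing the bit marks R as clobbered by this call.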
   3920     for (auto const &RegPair : RegsToPass)
   3921       for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
   3922            SubRegs.isValid(); ++SubRegs)
   3923         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
   3924 
   3925     // Create the RegMask Operand according to our updated mask.
   3926     Ops.push_back(DAG.getRegisterMask(RegMask));
   3927   } else {
   3928     // Create the RegMask Operand according to the static mask.
   3929     Ops.push_back(DAG.getRegisterMask(Mask));
   3930   }
   3931 
   3932   if (InFlag.getNode())
   3933     Ops.push_back(InFlag);
   3934 
   3935   if (isTailCall) {
   3936     // We used to do:
   3937     //// If this is the first return lowered for this function, add the regs
   3938     //// to the liveout set for the function.
   3939     // This isn't right, although it's probably harmless on x86; liveouts
   3940     // should be computed from returns not tail calls.  Consider a void
   3941     // function making a tail call to a function returning int.
   3942     MF.getFrameInfo().setHasTailCall();
   3943     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
   3944   }
   3945 
   3946   if (HasNoCfCheck && IsCFProtectionSupported) {
   3947     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
   3948   } else {
   3949     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   3950   }
   3951   InFlag = Chain.getValue(1);
   3952 
   3953   // Create the CALLSEQ_END node.
   3954   unsigned NumBytesForCalleeToPop;
   3955   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
   3956                        DAG.getTarget().Options.GuaranteedTailCallOpt))
   3957     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
   3958   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
   3959            !Subtarget.getTargetTriple().isOSMSVCRT() &&
   3960            SR == StackStructReturn)
   3961     // If this is a call to a struct-return function, the callee
   3962     // pops the hidden struct pointer, so we have to push it back.
   3963     // This is common for Darwin/X86, Linux & Mingw32 targets.
   3964     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
   3965     NumBytesForCalleeToPop = 4;
   3966   else
   3967     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
   3968 
   3969   if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
   3970     // No need to reset the stack after the call if the call doesn't return. To
    3971     // make the MI verifier happy, we'll pretend the callee does it for us.
   3972     NumBytesForCalleeToPop = NumBytes;
   3973   }
   3974 
   3975   // Returns a flag for retval copy to use.
   3976   if (!IsSibcall) {
   3977     Chain = DAG.getCALLSEQ_END(Chain,
   3978                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
   3979                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
   3980                                                      true),
   3981                                InFlag, dl);
   3982     InFlag = Chain.getValue(1);
   3983   }
   3984 
   3985   // Handle result values, copying them out of physregs into vregs that we
   3986   // return.
   3987   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
   3988                          InVals, RegMask);
   3989 }
   3990 
   3991 //===----------------------------------------------------------------------===//
   3992 //                Fast Calling Convention (tail call) implementation
   3993 //===----------------------------------------------------------------------===//
   3994 
    3995 //  Like stdcall, the callee cleans up the arguments, except that ECX is
    3996 //  reserved for storing the tail-called function's address. Only 2 registers are
   3997 //  free for argument passing (inreg). Tail call optimization is performed
   3998 //  provided:
   3999 //                * tailcallopt is enabled
   4000 //                * caller/callee are fastcc
   4001 //  On X86_64 architecture with GOT-style position independent code only local
   4002 //  (within module) calls are supported at the moment.
    4003 //  To keep the stack aligned according to the platform ABI, the function
    4004 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
    4005 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
    4006 //  for example.) If a tail-called callee has more arguments than the caller,
    4007 //  the caller needs to make sure that there is room to move the RETADDR to. This is
   4008 //  achieved by reserving an area the size of the argument delta right after the
   4009 //  original RETADDR, but before the saved framepointer or the spilled registers
   4010 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
   4011 //  stack layout:
   4012 //    arg1
   4013 //    arg2
   4014 //    RETADDR
   4015 //    [ new RETADDR
   4016 //      move area ]
   4017 //    (possible EBP)
   4018 //    ESI
   4019 //    EDI
   4020 //    local1 ..
   4021 
    4022 /// Align the stack size so that, together with the return-address slot, it
    4023 /// satisfies the stack alignment; e.g. 16n + 12 for a 16-byte alignment with 4-byte slots.
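         /// For example, with 16-byte alignment and 4-byte slots, a StackSize of 20 has
         /// low bits 4 <= 12, so the result is 20 + (12 - 4) = 28 = 16*1 + 12.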
   4024 unsigned
   4025 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
   4026                                                SelectionDAG& DAG) const {
   4027   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   4028   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   4029   unsigned StackAlignment = TFI.getStackAlignment();
   4030   uint64_t AlignMask = StackAlignment - 1;
   4031   int64_t Offset = StackSize;
   4032   unsigned SlotSize = RegInfo->getSlotSize();
   4033   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    4034     // The misalignment is at most StackAlignment - SlotSize; just add the difference.
   4035     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
   4036   } else {
    4037     // Mask out the low bits, then add StackAlignment plus (StackAlignment - SlotSize).
   4038     Offset = ((~AlignMask) & Offset) + StackAlignment +
   4039       (StackAlignment-SlotSize);
   4040   }
   4041   return Offset;
   4042 }
   4043 
    4044 /// Return true if the given stack call argument is already available in the
    4045 /// same (relative) position of the caller's incoming argument stack.
   4046 static
   4047 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   4048                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
   4049                          const X86InstrInfo *TII, const CCValAssign &VA) {
   4050   unsigned Bytes = Arg.getValueSizeInBits() / 8;
   4051 
   4052   for (;;) {
   4053     // Look through nodes that don't alter the bits of the incoming value.
   4054     unsigned Op = Arg.getOpcode();
   4055     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
   4056       Arg = Arg.getOperand(0);
   4057       continue;
   4058     }
   4059     if (Op == ISD::TRUNCATE) {
   4060       const SDValue &TruncInput = Arg.getOperand(0);
   4061       if (TruncInput.getOpcode() == ISD::AssertZext &&
   4062           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
   4063               Arg.getValueType()) {
   4064         Arg = TruncInput.getOperand(0);
   4065         continue;
   4066       }
   4067     }
   4068     break;
   4069   }
   4070 
   4071   int FI = INT_MAX;
   4072   if (Arg.getOpcode() == ISD::CopyFromReg) {
   4073     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
   4074     if (!TargetRegisterInfo::isVirtualRegister(VR))
   4075       return false;
   4076     MachineInstr *Def = MRI->getVRegDef(VR);
   4077     if (!Def)
   4078       return false;
   4079     if (!Flags.isByVal()) {
   4080       if (!TII->isLoadFromStackSlot(*Def, FI))
   4081         return false;
   4082     } else {
   4083       unsigned Opcode = Def->getOpcode();
   4084       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
   4085            Opcode == X86::LEA64_32r) &&
   4086           Def->getOperand(1).isFI()) {
   4087         FI = Def->getOperand(1).getIndex();
   4088         Bytes = Flags.getByValSize();
   4089       } else
   4090         return false;
   4091     }
   4092   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
   4093     if (Flags.isByVal())
   4094       // ByVal argument is passed in as a pointer but it's now being
   4095       // dereferenced. e.g.
   4096       // define @foo(%struct.X* %A) {
   4097       //   tail call @bar(%struct.X* byval %A)
   4098       // }
   4099       return false;
   4100     SDValue Ptr = Ld->getBasePtr();
   4101     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
   4102     if (!FINode)
   4103       return false;
   4104     FI = FINode->getIndex();
   4105   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
   4106     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
   4107     FI = FINode->getIndex();
   4108     Bytes = Flags.getByValSize();
   4109   } else
   4110     return false;
   4111 
   4112   assert(FI != INT_MAX);
   4113   if (!MFI.isFixedObjectIndex(FI))
   4114     return false;
   4115 
   4116   if (Offset != MFI.getObjectOffset(FI))
   4117     return false;
   4118 
   4119   // If this is not byval, check that the argument stack object is immutable.
   4120   // inalloca and argument copy elision can create mutable argument stack
   4121   // objects. Byval objects can be mutated, but a byval call intends to pass the
   4122   // mutated memory.
   4123   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
   4124     return false;
   4125 
   4126   if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
   4127     // If the argument location is wider than the argument type, check that any
   4128     // extension flags match.
   4129     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
   4130         Flags.isSExt() != MFI.isObjectSExt(FI)) {
   4131       return false;
   4132     }
   4133   }
   4134 
   4135   return Bytes == MFI.getObjectSize(FI);
   4136 }
   4137 
   4138 /// Check whether the call is eligible for tail call optimization. Targets
   4139 /// that want to do tail call optimization should implement this function.
   4140 bool X86TargetLowering::IsEligibleForTailCallOptimization(
   4141     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
   4142     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
   4143     const SmallVectorImpl<ISD::OutputArg> &Outs,
   4144     const SmallVectorImpl<SDValue> &OutVals,
   4145     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
   4146   if (!mayTailCallThisCC(CalleeCC))
   4147     return false;
   4148 
   4149   // If -tailcallopt is specified, make fastcc functions tail-callable.
   4150   MachineFunction &MF = DAG.getMachineFunction();
   4151   const Function &CallerF = MF.getFunction();
   4152 
   4153   // If the function return type is x86_fp80 and the callee return type is not,
   4154   // then the FP_EXTEND of the call result is not a nop. It's not safe to
   4155   // perform a tailcall optimization here.
   4156   if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
   4157     return false;
   4158 
   4159   CallingConv::ID CallerCC = CallerF.getCallingConv();
   4160   bool CCMatch = CallerCC == CalleeCC;
   4161   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
   4162   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
   4163 
   4164   // Win64 functions have extra shadow space for argument homing. Don't do the
   4165   // sibcall if the caller and callee have mismatched expectations for this
   4166   // space.
   4167   if (IsCalleeWin64 != IsCallerWin64)
   4168     return false;
   4169 
   4170   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
   4171     if (canGuaranteeTCO(CalleeCC) && CCMatch)
   4172       return true;
   4173     return false;
   4174   }
   4175 
   4176   // Look for obvious safe cases to perform tail call optimization that do not
   4177   // require ABI changes. This is what gcc calls sibcall.
   4178 
   4179   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   4180   // emit a special epilogue.
   4181   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   4182   if (RegInfo->needsStackRealignment(MF))
   4183     return false;
   4184 
   4185   // Also avoid sibcall optimization if either caller or callee uses struct
   4186   // return semantics.
   4187   if (isCalleeStructRet || isCallerStructRet)
   4188     return false;
   4189 
   4190   // Do not sibcall optimize vararg calls unless all arguments are passed via
   4191   // registers.
   4192   LLVMContext &C = *DAG.getContext();
   4193   if (isVarArg && !Outs.empty()) {
   4194     // Optimizing for varargs on Win64 is unlikely to be safe without
   4195     // additional testing.
   4196     if (IsCalleeWin64 || IsCallerWin64)
   4197       return false;
   4198 
   4199     SmallVector<CCValAssign, 16> ArgLocs;
   4200     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
   4201 
   4202     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   4203     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
   4204       if (!ArgLocs[i].isRegLoc())
   4205         return false;
   4206   }
   4207 
   4208   // If the call result is in ST0 / ST1, it needs to be popped off the x87
    4209   // stack.  Therefore, if the result is not used, it is not safe to optimize
   4210   // this into a sibcall.
   4211   bool Unused = false;
   4212   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   4213     if (!Ins[i].Used) {
   4214       Unused = true;
   4215       break;
   4216     }
   4217   }
   4218   if (Unused) {
   4219     SmallVector<CCValAssign, 16> RVLocs;
   4220     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
   4221     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
   4222     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
   4223       CCValAssign &VA = RVLocs[i];
   4224       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
   4225         return false;
   4226     }
   4227   }
   4228 
   4229   // Check that the call results are passed in the same way.
   4230   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
   4231                                   RetCC_X86, RetCC_X86))
   4232     return false;
   4233   // The callee has to preserve all registers the caller needs to preserve.
   4234   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
   4235   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
   4236   if (!CCMatch) {
   4237     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
   4238     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
   4239       return false;
   4240   }
   4241 
   4242   unsigned StackArgsSize = 0;
   4243 
   4244   // If the callee takes no arguments then go on to check the results of the
   4245   // call.
   4246   if (!Outs.empty()) {
   4247     // Check if stack adjustment is needed. For now, do not do this if any
   4248     // argument is passed on the stack.
   4249     SmallVector<CCValAssign, 16> ArgLocs;
   4250     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
   4251 
   4252     // Allocate shadow area for Win64
   4253     if (IsCalleeWin64)
   4254       CCInfo.AllocateStack(32, 8);
   4255 
   4256     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
   4257     StackArgsSize = CCInfo.getNextStackOffset();
   4258 
   4259     if (CCInfo.getNextStackOffset()) {
   4260       // Check if the arguments are already laid out in the right way as
   4261       // the caller's fixed stack objects.
   4262       MachineFrameInfo &MFI = MF.getFrameInfo();
   4263       const MachineRegisterInfo *MRI = &MF.getRegInfo();
   4264       const X86InstrInfo *TII = Subtarget.getInstrInfo();
   4265       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   4266         CCValAssign &VA = ArgLocs[i];
   4267         SDValue Arg = OutVals[i];
   4268         ISD::ArgFlagsTy Flags = Outs[i].Flags;
   4269         if (VA.getLocInfo() == CCValAssign::Indirect)
   4270           return false;
   4271         if (!VA.isRegLoc()) {
   4272           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
   4273                                    MFI, MRI, TII, VA))
   4274             return false;
   4275         }
   4276       }
   4277     }
   4278 
   4279     bool PositionIndependent = isPositionIndependent();
   4280     // If the tailcall address may be in a register, then make sure it's
   4281     // possible to register allocate for it. In 32-bit, the call address can
   4282     // only target EAX, EDX, or ECX since the tail call must be scheduled after
   4283     // callee-saved registers are restored. These happen to be the same
   4284     // registers used to pass 'inreg' arguments so watch out for those.
   4285     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
   4286                                   !isa<ExternalSymbolSDNode>(Callee)) ||
   4287                                  PositionIndependent)) {
   4288       unsigned NumInRegs = 0;
   4289       // In PIC we need an extra register to formulate the address computation
   4290       // for the callee.
   4291       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
   4292 
   4293       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
   4294         CCValAssign &VA = ArgLocs[i];
   4295         if (!VA.isRegLoc())
   4296           continue;
   4297         unsigned Reg = VA.getLocReg();
   4298         switch (Reg) {
   4299         default: break;
   4300         case X86::EAX: case X86::EDX: case X86::ECX:
   4301           if (++NumInRegs == MaxInRegs)
   4302             return false;
   4303           break;
   4304         }
   4305       }
   4306     }
   4307 
   4308     const MachineRegisterInfo &MRI = MF.getRegInfo();
   4309     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
   4310       return false;
   4311   }
   4312 
   4313   bool CalleeWillPop =
   4314       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
   4315                        MF.getTarget().Options.GuaranteedTailCallOpt);
   4316 
   4317   if (unsigned BytesToPop =
   4318           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
   4319     // If we have bytes to pop, the callee must pop them.
   4320     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
   4321     if (!CalleePopMatches)
   4322       return false;
   4323   } else if (CalleeWillPop && StackArgsSize > 0) {
   4324     // If we don't have bytes to pop, make sure the callee doesn't pop any.
   4325     return false;
   4326   }
   4327 
   4328   return true;
   4329 }
   4330 
   4331 FastISel *
   4332 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
   4333                                   const TargetLibraryInfo *libInfo) const {
   4334   return X86::createFastISel(funcInfo, libInfo);
   4335 }
   4336 
   4337 //===----------------------------------------------------------------------===//
   4338 //                           Other Lowering Hooks
   4339 //===----------------------------------------------------------------------===//
   4340 
   4341 static bool MayFoldLoad(SDValue Op) {
   4342   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
   4343 }
   4344 
   4345 static bool MayFoldIntoStore(SDValue Op) {
   4346   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
   4347 }
   4348 
   4349 static bool MayFoldIntoZeroExtend(SDValue Op) {
   4350   if (Op.hasOneUse()) {
   4351     unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
   4352     return (ISD::ZERO_EXTEND == Opcode);
   4353   }
   4354   return false;
   4355 }
   4356 
   4357 static bool isTargetShuffle(unsigned Opcode) {
   4358   switch(Opcode) {
   4359   default: return false;
   4360   case X86ISD::BLENDI:
   4361   case X86ISD::PSHUFB:
   4362   case X86ISD::PSHUFD:
   4363   case X86ISD::PSHUFHW:
   4364   case X86ISD::PSHUFLW:
   4365   case X86ISD::SHUFP:
   4366   case X86ISD::INSERTPS:
   4367   case X86ISD::EXTRQI:
   4368   case X86ISD::INSERTQI:
   4369   case X86ISD::PALIGNR:
   4370   case X86ISD::VSHLDQ:
   4371   case X86ISD::VSRLDQ:
   4372   case X86ISD::MOVLHPS:
   4373   case X86ISD::MOVHLPS:
   4374   case X86ISD::MOVSHDUP:
   4375   case X86ISD::MOVSLDUP:
   4376   case X86ISD::MOVDDUP:
   4377   case X86ISD::MOVSS:
   4378   case X86ISD::MOVSD:
   4379   case X86ISD::UNPCKL:
   4380   case X86ISD::UNPCKH:
   4381   case X86ISD::VBROADCAST:
   4382   case X86ISD::VPERMILPI:
   4383   case X86ISD::VPERMILPV:
   4384   case X86ISD::VPERM2X128:
   4385   case X86ISD::SHUF128:
   4386   case X86ISD::VPERMIL2:
   4387   case X86ISD::VPERMI:
   4388   case X86ISD::VPPERM:
   4389   case X86ISD::VPERMV:
   4390   case X86ISD::VPERMV3:
   4391   case X86ISD::VZEXT_MOVL:
   4392     return true;
   4393   }
   4394 }
   4395 
   4396 static bool isTargetShuffleVariableMask(unsigned Opcode) {
   4397   switch (Opcode) {
   4398   default: return false;
   4399   // Target Shuffles.
   4400   case X86ISD::PSHUFB:
   4401   case X86ISD::VPERMILPV:
   4402   case X86ISD::VPERMIL2:
   4403   case X86ISD::VPPERM:
   4404   case X86ISD::VPERMV:
   4405   case X86ISD::VPERMV3:
   4406     return true;
   4407   // 'Faux' Target Shuffles.
   4408   case ISD::AND:
   4409   case X86ISD::ANDNP:
   4410     return true;
   4411   }
   4412 }
   4413 
   4414 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   4415   MachineFunction &MF = DAG.getMachineFunction();
   4416   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   4417   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   4418   int ReturnAddrIndex = FuncInfo->getRAIndex();
   4419 
   4420   if (ReturnAddrIndex == 0) {
   4421     // Set up a frame object for the return address.
   4422     unsigned SlotSize = RegInfo->getSlotSize();
   4423     ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
   4424                                                           -(int64_t)SlotSize,
   4425                                                           false);
   4426     FuncInfo->setRAIndex(ReturnAddrIndex);
   4427   }
   4428 
   4429   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
   4430 }
   4431 
   4432 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
   4433                                        bool hasSymbolicDisplacement) {
   4434   // Offset should fit into 32 bit immediate field.
   4435   if (!isInt<32>(Offset))
   4436     return false;
   4437 
   4438   // If we don't have a symbolic displacement - we don't have any extra
   4439   // restrictions.
   4440   if (!hasSymbolicDisplacement)
   4441     return true;
   4442 
   4443   // FIXME: Some tweaks might be needed for medium code model.
   4444   if (M != CodeModel::Small && M != CodeModel::Kernel)
   4445     return false;
   4446 
    4447   // For the small code model we assume that the last object ends at least 16MB
    4448   // before the 31-bit boundary. We may also accept pretty large negative
    4449   // constants, knowing that all objects are in the positive half of the address space.
   4450   if (M == CodeModel::Small && Offset < 16*1024*1024)
   4451     return true;
   4452 
    4453   // For the kernel code model we know that all objects reside in the negative
    4454   // half of the 32-bit address space. We must not accept negative offsets, since
    4455   // they may take the address out of range, but we may accept pretty large positive ones.
   4456   if (M == CodeModel::Kernel && Offset >= 0)
   4457     return true;
   4458 
   4459   return false;
   4460 }
   4461 
   4462 /// Determines whether the callee is required to pop its own arguments.
   4463 /// Callee pop is necessary to support tail calls.
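         /// For example, a 32-bit stdcall function taking two i32 arguments returns
         /// with "ret 8", popping its own 8 bytes of stack arguments.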
   4464 bool X86::isCalleePop(CallingConv::ID CallingConv,
   4465                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
   4466   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
   4467   // can guarantee TCO.
   4468   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
   4469     return true;
   4470 
   4471   switch (CallingConv) {
   4472   default:
   4473     return false;
   4474   case CallingConv::X86_StdCall:
   4475   case CallingConv::X86_FastCall:
   4476   case CallingConv::X86_ThisCall:
   4477   case CallingConv::X86_VectorCall:
   4478     return !is64Bit;
   4479   }
   4480 }
   4481 
   4482 /// Return true if the condition is an unsigned comparison operation.
   4483 static bool isX86CCUnsigned(unsigned X86CC) {
   4484   switch (X86CC) {
   4485   default:
   4486     llvm_unreachable("Invalid integer condition!");
   4487   case X86::COND_E:
   4488   case X86::COND_NE:
   4489   case X86::COND_B:
   4490   case X86::COND_A:
   4491   case X86::COND_BE:
   4492   case X86::COND_AE:
   4493     return true;
   4494   case X86::COND_G:
   4495   case X86::COND_GE:
   4496   case X86::COND_L:
   4497   case X86::COND_LE:
   4498     return false;
   4499   }
   4500 }
   4501 
   4502 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
   4503   switch (SetCCOpcode) {
   4504   default: llvm_unreachable("Invalid integer condition!");
   4505   case ISD::SETEQ:  return X86::COND_E;
   4506   case ISD::SETGT:  return X86::COND_G;
   4507   case ISD::SETGE:  return X86::COND_GE;
   4508   case ISD::SETLT:  return X86::COND_L;
   4509   case ISD::SETLE:  return X86::COND_LE;
   4510   case ISD::SETNE:  return X86::COND_NE;
   4511   case ISD::SETULT: return X86::COND_B;
   4512   case ISD::SETUGT: return X86::COND_A;
   4513   case ISD::SETULE: return X86::COND_BE;
   4514   case ISD::SETUGE: return X86::COND_AE;
   4515   }
   4516 }
   4517 
    4518 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
   4519 /// condition code, returning the condition code and the LHS/RHS of the
   4520 /// comparison to make.
   4521 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
   4522                                bool isFP, SDValue &LHS, SDValue &RHS,
   4523                                SelectionDAG &DAG) {
   4524   if (!isFP) {
   4525     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
   4526       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
    4527         // X > -1  -> compare X with 0, jump if !sign.
   4528         RHS = DAG.getConstant(0, DL, RHS.getValueType());
   4529         return X86::COND_NS;
   4530       }
   4531       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
    4532         // X < 0   -> compare X with 0, jump on sign.
   4533         return X86::COND_S;
   4534       }
   4535       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
   4536         // X < 1   -> X <= 0
   4537         RHS = DAG.getConstant(0, DL, RHS.getValueType());
   4538         return X86::COND_LE;
   4539       }
   4540     }
   4541 
   4542     return TranslateIntegerX86CC(SetCCOpcode);
   4543   }
   4544 
    4545   // First determine whether it is required or profitable to flip the operands.
   4546 
   4547   // If LHS is a foldable load, but RHS is not, flip the condition.
   4548   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
   4549       !ISD::isNON_EXTLoad(RHS.getNode())) {
   4550     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
   4551     std::swap(LHS, RHS);
   4552   }
   4553 
   4554   switch (SetCCOpcode) {
   4555   default: break;
   4556   case ISD::SETOLT:
   4557   case ISD::SETOLE:
   4558   case ISD::SETUGT:
   4559   case ISD::SETUGE:
   4560     std::swap(LHS, RHS);
   4561     break;
   4562   }
   4563 
   4564   // On a floating point condition, the flags are set as follows:
   4565   // ZF  PF  CF   op
   4566   //  0 | 0 | 0 | X > Y
   4567   //  0 | 0 | 1 | X < Y
   4568   //  1 | 0 | 0 | X == Y
   4569   //  1 | 1 | 1 | unordered
   4570   switch (SetCCOpcode) {
   4571   default: llvm_unreachable("Condcode should be pre-legalized away");
   4572   case ISD::SETUEQ:
   4573   case ISD::SETEQ:   return X86::COND_E;
   4574   case ISD::SETOLT:              // flipped
   4575   case ISD::SETOGT:
   4576   case ISD::SETGT:   return X86::COND_A;
   4577   case ISD::SETOLE:              // flipped
   4578   case ISD::SETOGE:
   4579   case ISD::SETGE:   return X86::COND_AE;
   4580   case ISD::SETUGT:              // flipped
   4581   case ISD::SETULT:
   4582   case ISD::SETLT:   return X86::COND_B;
   4583   case ISD::SETUGE:              // flipped
   4584   case ISD::SETULE:
   4585   case ISD::SETLE:   return X86::COND_BE;
   4586   case ISD::SETONE:
   4587   case ISD::SETNE:   return X86::COND_NE;
   4588   case ISD::SETUO:   return X86::COND_P;
   4589   case ISD::SETO:    return X86::COND_NP;
   4590   case ISD::SETOEQ:
   4591   case ISD::SETUNE:  return X86::COND_INVALID;
   4592   }
   4593 }
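         // For illustration of TranslateX86CC: a floating-point ISD::SETOLT of
         // (a, b) first has its operands swapped and then yields X86::COND_A, i.e.
         // "b above a" with CF==0 and ZF==0, which is false on unordered inputs
         // exactly as an ordered less-than requires. An integer ISD::SETGT against
         // the constant -1 is rewritten as a compare with 0 plus X86::COND_NS.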
   4594 
   4595 /// Is there a floating point cmov for the specific X86 condition code?
    4596 /// The current x86 ISA includes the following FP cmov instructions:
    4597 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
   4598 static bool hasFPCMov(unsigned X86CC) {
   4599   switch (X86CC) {
   4600   default:
   4601     return false;
   4602   case X86::COND_B:
   4603   case X86::COND_BE:
   4604   case X86::COND_E:
   4605   case X86::COND_P:
   4606   case X86::COND_A:
   4607   case X86::COND_AE:
   4608   case X86::COND_NE:
   4609   case X86::COND_NP:
   4610     return true;
   4611   }
   4612 }
   4613 
   4614 
   4615 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   4616                                            const CallInst &I,
   4617                                            MachineFunction &MF,
   4618                                            unsigned Intrinsic) const {
   4619 
   4620   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
   4621   if (!IntrData)
   4622     return false;
   4623 
   4624   Info.opc = ISD::INTRINSIC_W_CHAIN;
   4625   Info.flags = MachineMemOperand::MONone;
   4626   Info.offset = 0;
   4627 
   4628   switch (IntrData->Type) {
   4629   case TRUNCATE_TO_MEM_VI8:
   4630   case TRUNCATE_TO_MEM_VI16:
   4631   case TRUNCATE_TO_MEM_VI32: {
   4632     Info.ptrVal = I.getArgOperand(0);
   4633     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
   4634     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
   4635     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
   4636       ScalarVT = MVT::i8;
   4637     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
   4638       ScalarVT = MVT::i16;
   4639     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
   4640       ScalarVT = MVT::i32;
   4641 
   4642     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
   4643     Info.align = 1;
   4644     Info.flags |= MachineMemOperand::MOStore;
   4645     break;
   4646   }
   4647   default:
   4648     return false;
   4649   }
   4650 
   4651   return true;
   4652 }
   4653 
   4654 /// Returns true if the target can instruction select the
   4655 /// specified FP immediate natively. If false, the legalizer will
   4656 /// materialize the FP immediate as a load from a constant pool.
   4657 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   4658   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
   4659     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
   4660       return true;
   4661   }
   4662   return false;
   4663 }
   4664 
   4665 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
   4666                                               ISD::LoadExtType ExtTy,
   4667                                               EVT NewVT) const {
    4668   // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
    4669   // relocation must target a movq or addq instruction: don't let the load shrink.
   4670   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
   4671   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
   4672     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
   4673       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
   4674   return true;
   4675 }
   4676 
   4677 /// Returns true if it is beneficial to convert a load of a constant
   4678 /// to just the constant itself.
   4679 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   4680                                                           Type *Ty) const {
   4681   assert(Ty->isIntegerTy());
   4682 
   4683   unsigned BitSize = Ty->getPrimitiveSizeInBits();
   4684   if (BitSize == 0 || BitSize > 64)
   4685     return false;
   4686   return true;
   4687 }
   4688 
   4689 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
   4690   // TODO: It might be a win to ease or lift this restriction, but the generic
   4691   // folds in DAGCombiner conflict with vector folds for an AVX512 target.
   4692   if (VT.isVector() && Subtarget.hasAVX512())
   4693     return false;
   4694 
   4695   return true;
   4696 }
   4697 
   4698 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
   4699                                                 unsigned Index) const {
   4700   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
   4701     return false;
   4702 
   4703   // Mask vectors support all subregister combinations and operations that
    4704   // extract half of the vector.
   4705   if (ResVT.getVectorElementType() == MVT::i1)
   4706     return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
   4707                           (Index == ResVT.getVectorNumElements()));
   4708 
   4709   return (Index % ResVT.getVectorNumElements()) == 0;
   4710 }
   4711 
   4712 bool X86TargetLowering::isCheapToSpeculateCttz() const {
   4713   // Speculate cttz only if we can directly use TZCNT.
   4714   return Subtarget.hasBMI();
   4715 }
   4716 
   4717 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
   4718   // Speculate ctlz only if we can directly use LZCNT.
   4719   return Subtarget.hasLZCNT();
   4720 }
   4721 
   4722 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
   4723                                                 EVT BitcastVT) const {
   4724   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
   4725     return false;
   4726 
   4727   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
   4728 }
   4729 
   4730 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
   4731                                          const SelectionDAG &DAG) const {
    4732   // Do not merge to a float/vector value size (128 bits or more) if the
    4733   // no-implicit-float attribute is set.
   4734   bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
   4735       Attribute::NoImplicitFloat);
   4736 
   4737   if (NoFloat) {
   4738     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
   4739     return (MemVT.getSizeInBits() <= MaxIntSize);
   4740   }
   4741   return true;
   4742 }
   4743 
   4744 bool X86TargetLowering::isCtlzFast() const {
   4745   return Subtarget.hasFastLZCNT();
   4746 }
   4747 
   4748 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
   4749     const Instruction &AndI) const {
   4750   return true;
   4751 }
   4752 
   4753 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   4754   EVT VT = Y.getValueType();
   4755 
   4756   if (VT.isVector())
   4757     return false;
   4758 
   4759   if (!Subtarget.hasBMI())
   4760     return false;
   4761 
   4762   // There are only 32-bit and 64-bit forms for 'andn'.
   4763   if (VT != MVT::i32 && VT != MVT::i64)
   4764     return false;
   4765 
    4766   // A mask-and-compare against a constant is OK for an 'andn' too,
    4767   // even though the BMI instruction doesn't have an immediate form.
   4768 
   4769   return true;
   4770 }
   4771 
   4772 bool X86TargetLowering::hasAndNot(SDValue Y) const {
   4773   EVT VT = Y.getValueType();
   4774 
   4775   if (!VT.isVector()) // x86 can't form 'andn' with an immediate.
   4776     return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y);
   4777 
   4778   // Vector.
   4779 
   4780   if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
   4781     return false;
   4782 
   4783   if (VT == MVT::v4i32)
   4784     return true;
   4785 
   4786   return Subtarget.hasSSE2();
   4787 }
   4788 
   4789 bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
   4790   EVT VT = Y.getValueType();
   4791 
   4792   // For vectors, we don't have a preference, but we probably want a mask.
   4793   if (VT.isVector())
   4794     return false;
   4795 
   4796   // 64-bit shifts on 32-bit targets produce really bad bloated code.
   4797   if (VT == MVT::i64 && !Subtarget.is64Bit())
   4798     return false;
   4799 
   4800   return true;
   4801 }
   4802 
   4803 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
   4804   MVT VT = MVT::getIntegerVT(NumBits);
   4805   if (isTypeLegal(VT))
   4806     return VT;
   4807 
   4808   // PMOVMSKB can handle this.
   4809   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
   4810     return MVT::v16i8;
   4811 
   4812   // VPMOVMSKB can handle this.
   4813   if (NumBits == 256 && isTypeLegal(MVT::v32i8))
   4814     return MVT::v32i8;
   4815 
   4816   // TODO: Allow 64-bit type for 32-bit target.
   4817   // TODO: 512-bit types should be allowed, but make sure that those
   4818   // cases are handled in combineVectorSizedSetCCEquality().
   4819 
   4820   return MVT::INVALID_SIMPLE_VALUE_TYPE;
   4821 }
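         // For illustration (a sketch of how a caller might use the v16i8 result,
         // not code from this function): a 128-bit equality test can then be
         // lowered as two vector loads, a PCMPEQB, and a PMOVMSKB whose 16-bit
         // result is compared against 0xFFFF, instead of a chain of scalar
         // compares and branches.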
   4822 
   4823 /// Val is the undef sentinel value or equal to the specified value.
   4824 static bool isUndefOrEqual(int Val, int CmpVal) {
   4825   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
   4826 }
   4827 
   4828 /// Val is either the undef or zero sentinel value.
   4829 static bool isUndefOrZero(int Val) {
   4830   return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
   4831 }
   4832 
   4833 /// Return true if every element in Mask, beginning
   4834 /// from position Pos and ending in Pos+Size is the undef sentinel value.
   4835 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
   4836   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
   4837     if (Mask[i] != SM_SentinelUndef)
   4838       return false;
   4839   return true;
   4840 }
   4841 
    4842 /// Return true if Val falls within the specified half-open range [Low, Hi).
   4843 static bool isInRange(int Val, int Low, int Hi) {
   4844   return (Val >= Low && Val < Hi);
   4845 }
   4846 
   4847 /// Return true if the value of any element in Mask falls within the specified
    4848 /// half-open range [Low, Hi).
   4849 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
   4850   for (int M : Mask)
   4851     if (isInRange(M, Low, Hi))
   4852       return true;
   4853   return false;
   4854 }
   4855 
   4856 /// Return true if Val is undef or if its value falls within the
    4857 /// specified half-open range [Low, Hi).
   4858 static bool isUndefOrInRange(int Val, int Low, int Hi) {
   4859   return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
   4860 }
   4861 
   4862 /// Return true if every element in Mask is undef or if its value
    4863 /// falls within the specified half-open range [Low, Hi).
   4864 static bool isUndefOrInRange(ArrayRef<int> Mask,
   4865                              int Low, int Hi) {
   4866   for (int M : Mask)
   4867     if (!isUndefOrInRange(M, Low, Hi))
   4868       return false;
   4869   return true;
   4870 }
   4871 
   4872 /// Return true if Val is undef, zero or if its value falls within the
    4873 /// specified half-open range [Low, Hi).
   4874 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
   4875   return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
   4876 }
   4877 
   4878 /// Return true if every element in Mask is undef, zero or if its value
    4879 /// falls within the specified half-open range [Low, Hi).
   4880 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
   4881   for (int M : Mask)
   4882     if (!isUndefOrZeroOrInRange(M, Low, Hi))
   4883       return false;
   4884   return true;
   4885 }
   4886 
   4887 /// Return true if every element in Mask, beginning
   4888 /// from position Pos and ending in Pos + Size, falls within the specified
   4889 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
   4890 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
   4891                                        unsigned Size, int Low, int Step = 1) {
   4892   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
   4893     if (!isUndefOrEqual(Mask[i], Low))
   4894       return false;
   4895   return true;
   4896 }
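         // For illustration: isSequentialOrUndefInRange({4, -1, 6, 7}, 0, 4, 4)
         // returns true, since the undef (-1) entry matches any expected value and
         // the remaining entries follow the 4, 5, 6, 7 sequence.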
   4897 
   4898 /// Return true if every element in Mask, beginning
   4899 /// from position Pos and ending in Pos+Size, falls within the specified
    4900 /// sequential range [Low, Low+Size), or is undef or zero.
   4901 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
   4902                                              unsigned Size, int Low) {
   4903   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
   4904     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
   4905       return false;
   4906   return true;
   4907 }
   4908 
   4909 /// Return true if every element in Mask, beginning
   4910 /// from position Pos and ending in Pos+Size is undef or is zero.
   4911 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
   4912                                  unsigned Size) {
   4913   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
   4914     if (!isUndefOrZero(Mask[i]))
   4915       return false;
   4916   return true;
   4917 }
   4918 
   4919 /// Helper function to test whether a shuffle mask could be
   4920 /// simplified by widening the elements being shuffled.
   4921 ///
   4922 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
   4923 /// leaves it in an unspecified state.
   4924 ///
   4925 /// NOTE: This must handle normal vector shuffle masks and *target* vector
   4926 /// shuffle masks. The latter have the special property of a '-2' representing
    4927 /// a zeroed lane of a vector.
   4928 static bool canWidenShuffleElements(ArrayRef<int> Mask,
   4929                                     SmallVectorImpl<int> &WidenedMask) {
   4930   WidenedMask.assign(Mask.size() / 2, 0);
   4931   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
   4932     int M0 = Mask[i];
   4933     int M1 = Mask[i + 1];
   4934 
    4935     // If both elements are undef, it's trivial.
   4936     if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
   4937       WidenedMask[i / 2] = SM_SentinelUndef;
   4938       continue;
   4939     }
   4940 
   4941     // Check for an undef mask and a mask value properly aligned to fit with
   4942     // a pair of values. If we find such a case, use the non-undef mask's value.
   4943     if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
   4944       WidenedMask[i / 2] = M1 / 2;
   4945       continue;
   4946     }
   4947     if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
   4948       WidenedMask[i / 2] = M0 / 2;
   4949       continue;
   4950     }
   4951 
   4952     // When zeroing, we need to spread the zeroing across both lanes to widen.
   4953     if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
   4954       if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
   4955           (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
   4956         WidenedMask[i / 2] = SM_SentinelZero;
   4957         continue;
   4958       }
   4959       return false;
   4960     }
   4961 
   4962     // Finally check if the two mask values are adjacent and aligned with
   4963     // a pair.
   4964     if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
   4965       WidenedMask[i / 2] = M0 / 2;
   4966       continue;
   4967     }
   4968 
   4969     // Otherwise we can't safely widen the elements used in this shuffle.
   4970     return false;
   4971   }
   4972   assert(WidenedMask.size() == Mask.size() / 2 &&
   4973          "Incorrect size of mask after widening the elements!");
   4974 
   4975   return true;
   4976 }
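         // For illustration: canWidenShuffleElements widens the mask {0, 1, -1, 7}
         // to {0, 3}: the first pair is the aligned pair (0, 1), and in the second
         // pair the defined value 7 is the odd half of wide element 3. A mask such
         // as {0, 2, 4, 6} cannot be widened because no pair covers two adjacent
         // narrow elements.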
   4977 
   4978 static bool canWidenShuffleElements(ArrayRef<int> Mask,
   4979                                     const APInt &Zeroable,
   4980                                     SmallVectorImpl<int> &WidenedMask) {
   4981   SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
   4982   for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
   4983     if (TargetMask[i] == SM_SentinelUndef)
   4984       continue;
   4985     if (Zeroable[i])
   4986       TargetMask[i] = SM_SentinelZero;
   4987   }
   4988   return canWidenShuffleElements(TargetMask, WidenedMask);
   4989 }
   4990 
   4991 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
   4992   SmallVector<int, 32> WidenedMask;
   4993   return canWidenShuffleElements(Mask, WidenedMask);
   4994 }
   4995 
   4996 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
   4997 bool X86::isZeroNode(SDValue Elt) {
   4998   return isNullConstant(Elt) || isNullFPConstant(Elt);
   4999 }
   5000 
   5001 // Build a vector of constants.
    5002 // Use an UNDEF node if a value is negative and IsMask is set.
    5003 // Split 64-bit constants into 32-bit halves in 32-bit mode.
   5004 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
   5005                               const SDLoc &dl, bool IsMask = false) {
   5006 
   5007   SmallVector<SDValue, 32>  Ops;
   5008   bool Split = false;
   5009 
   5010   MVT ConstVecVT = VT;
   5011   unsigned NumElts = VT.getVectorNumElements();
   5012   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
   5013   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
   5014     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
   5015     Split = true;
   5016   }
   5017 
   5018   MVT EltVT = ConstVecVT.getVectorElementType();
   5019   for (unsigned i = 0; i < NumElts; ++i) {
   5020     bool IsUndef = Values[i] < 0 && IsMask;
   5021     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
   5022       DAG.getConstant(Values[i], dl, EltVT);
   5023     Ops.push_back(OpNode);
   5024     if (Split)
   5025       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
   5026                     DAG.getConstant(0, dl, EltVT));
   5027   }
   5028   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
   5029   if (Split)
   5030     ConstsNode = DAG.getBitcast(VT, ConstsNode);
   5031   return ConstsNode;
   5032 }
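         // For illustration: on a 32-bit target (where i64 is not legal) a call to
         // getConstVector for a v2i64 mask constant {3, -1} with IsMask = true
         // builds the v4i32 vector {3, 0, undef, undef} and bitcasts it back to
         // v2i64; each 64-bit value becomes a low 32-bit constant plus a zero
         // upper half, and the -1 entry becomes an UNDEF element.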
   5033 
   5034 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
   5035                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
   5036   assert(Bits.size() == Undefs.getBitWidth() &&
   5037          "Unequal constant and undef arrays");
   5038   SmallVector<SDValue, 32> Ops;
   5039   bool Split = false;
   5040 
   5041   MVT ConstVecVT = VT;
   5042   unsigned NumElts = VT.getVectorNumElements();
   5043   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
   5044   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
   5045     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
   5046     Split = true;
   5047   }
   5048 
   5049   MVT EltVT = ConstVecVT.getVectorElementType();
   5050   for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
   5051     if (Undefs[i]) {
   5052       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
   5053       continue;
   5054     }
   5055     const APInt &V = Bits[i];
   5056     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
   5057     if (Split) {
   5058       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
   5059       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
   5060     } else if (EltVT == MVT::f32) {
   5061       APFloat FV(APFloat::IEEEsingle(), V);
   5062       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
   5063     } else if (EltVT == MVT::f64) {
   5064       APFloat FV(APFloat::IEEEdouble(), V);
   5065       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
   5066     } else {
   5067       Ops.push_back(DAG.getConstant(V, dl, EltVT));
   5068     }
   5069   }
   5070 
   5071   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
   5072   return DAG.getBitcast(VT, ConstsNode);
   5073 }
   5074 
   5075 /// Returns a vector of specified type with all zero elements.
   5076 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
   5077                              SelectionDAG &DAG, const SDLoc &dl) {
   5078   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
   5079           VT.getVectorElementType() == MVT::i1) &&
   5080          "Unexpected vector type");
   5081 
   5082   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
   5083   // type. This ensures they get CSE'd. But if the integer type is not
   5084   // available, use a floating-point +0.0 instead.
   5085   SDValue Vec;
   5086   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
   5087     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
   5088   } else if (VT.getVectorElementType() == MVT::i1) {
   5089     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
   5090            "Unexpected vector type");
   5091     Vec = DAG.getConstant(0, dl, VT);
   5092   } else {
   5093     unsigned Num32BitElts = VT.getSizeInBits() / 32;
   5094     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
   5095   }
   5096   return DAG.getBitcast(VT, Vec);
   5097 }
   5098 
   5099 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
   5100                                 const SDLoc &dl, unsigned vectorWidth) {
   5101   EVT VT = Vec.getValueType();
   5102   EVT ElVT = VT.getVectorElementType();
   5103   unsigned Factor = VT.getSizeInBits()/vectorWidth;
   5104   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
   5105                                   VT.getVectorNumElements()/Factor);
   5106 
    5107   // Extract the relevant vectorWidth bits by generating an EXTRACT_SUBVECTOR.
   5108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
   5109   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
   5110 
   5111   // This is the index of the first element of the vectorWidth-bit chunk
    5112   // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
   5113   IdxVal &= ~(ElemsPerChunk - 1);
   5114 
   5115   // If the input is a buildvector just emit a smaller one.
   5116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
   5117     return DAG.getBuildVector(ResultVT, dl,
   5118                               Vec->ops().slice(IdxVal, ElemsPerChunk));
   5119 
   5120   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
   5121   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
   5122 }
   5123 
    5124 /// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
   5125 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
   5126 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
   5127 /// instructions or a simple subregister reference. Idx is an index in the
   5128 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
   5129 /// lowering EXTRACT_VECTOR_ELT operations easier.
   5130 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
   5131                                    SelectionDAG &DAG, const SDLoc &dl) {
   5132   assert((Vec.getValueType().is256BitVector() ||
   5133           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
   5134   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
   5135 }
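         // For illustration: extract128BitVector(Vec, 5, DAG, dl) on a v8i32 input
         // rounds the index down to the chunk boundary (IdxVal becomes 4) and
         // returns the upper v4i32 half holding elements 4..7, a subvector that
         // the VEXTRACTF128/VEXTRACTI128 forms can address directly.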
   5136 
   5137 /// Generate a DAG to grab 256-bits from a 512-bit vector.
   5138 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
   5139                                    SelectionDAG &DAG, const SDLoc &dl) {
   5140   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
   5141   return extractSubVector(Vec, IdxVal, DAG, dl, 256);
   5142 }
   5143 
   5144 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   5145                                SelectionDAG &DAG, const SDLoc &dl,
   5146                                unsigned vectorWidth) {
   5147   assert((vectorWidth == 128 || vectorWidth == 256) &&
   5148          "Unsupported vector width");
    5149   // Inserting an UNDEF subvector is a no-op: just return Result.
   5150   if (Vec.isUndef())
   5151     return Result;
   5152   EVT VT = Vec.getValueType();
   5153   EVT ElVT = VT.getVectorElementType();
   5154   EVT ResultVT = Result.getValueType();
   5155 
   5156   // Insert the relevant vectorWidth bits.
   5157   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
   5158   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
   5159 
   5160   // This is the index of the first element of the vectorWidth-bit chunk
    5161   // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
   5162   IdxVal &= ~(ElemsPerChunk - 1);
   5163 
   5164   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
   5165   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
   5166 }
   5167 
    5168 /// Generate a DAG to put 128 bits into a vector > 128 bits.  This
   5169 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
   5170 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
   5171 /// simple superregister reference.  Idx is an index in the 128 bits
   5172 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
   5173 /// lowering INSERT_VECTOR_ELT operations easier.
   5174 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   5175                                   SelectionDAG &DAG, const SDLoc &dl) {
   5176   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
   5177   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
   5178 }
   5179 
   5180 /// Widen a vector to a larger size with the same scalar type, with the new
   5181 /// elements either zero or undef.
   5182 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
   5183                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
   5184                               const SDLoc &dl) {
   5185   assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
   5186          Vec.getValueType().getScalarType() == VT.getScalarType() &&
   5187          "Unsupported vector widening type");
   5188   SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
   5189                                 : DAG.getUNDEF(VT);
   5190   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
   5191                      DAG.getIntPtrConstant(0, dl));
   5192 }
   5193 
    5194 // Helper for splitting the operands of an operation into legal target-sized
    5195 // pieces and applying a function to each part.
   5196 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
   5197 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
   5198 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
   5199 // The argument Builder is a function that will be applied on each split part:
   5200 // SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
   5201 template <typename F>
   5202 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
   5203                          const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
   5204                          F Builder, bool CheckBWI = true) {
   5205   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
   5206   unsigned NumSubs = 1;
   5207   if ((CheckBWI && Subtarget.useBWIRegs()) ||
   5208       (!CheckBWI && Subtarget.useAVX512Regs())) {
   5209     if (VT.getSizeInBits() > 512) {
   5210       NumSubs = VT.getSizeInBits() / 512;
   5211       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
   5212     }
   5213   } else if (Subtarget.hasAVX2()) {
   5214     if (VT.getSizeInBits() > 256) {
   5215       NumSubs = VT.getSizeInBits() / 256;
   5216       assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
   5217     }
   5218   } else {
   5219     if (VT.getSizeInBits() > 128) {
   5220       NumSubs = VT.getSizeInBits() / 128;
   5221       assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
   5222     }
   5223   }
   5224 
   5225   if (NumSubs == 1)
   5226     return Builder(DAG, DL, Ops);
   5227 
   5228   SmallVector<SDValue, 4> Subs;
   5229   for (unsigned i = 0; i != NumSubs; ++i) {
   5230     SmallVector<SDValue, 2> SubOps;
   5231     for (SDValue Op : Ops) {
   5232       EVT OpVT = Op.getValueType();
   5233       unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
   5234       unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
   5235       SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
   5236     }
   5237     Subs.push_back(Builder(DAG, DL, SubOps));
   5238   }
   5239   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
   5240 }
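         // A minimal usage sketch for SplitOpsAndApply (hypothetical call site,
         // not code from this file): split a wide multiply-add style node into
         // whatever width the subtarget supports, letting the helper pick
         // 128/256/512-bit pieces.
         //
         //   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
         //                            ArrayRef<SDValue> Ops) {
         //     MVT OpVT = Ops[0].getSimpleValueType();
         //     MVT ResVT = MVT::getVectorVT(MVT::i32, OpVT.getVectorNumElements() / 2);
         //     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops);
         //   };
         //   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, ResultVT,
         //                                  {LHS, RHS}, PMADDWDBuilder);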
   5241 
   5242 // Return true if the instruction zeroes the unused upper part of the
    5243 // destination and accepts a mask.
   5244 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
   5245   switch (Opcode) {
   5246   default:
   5247     return false;
   5248   case X86ISD::CMPM:
   5249   case X86ISD::CMPM_RND:
   5250   case ISD::SETCC:
   5251     return true;
   5252   }
   5253 }
   5254 
   5255 /// Insert i1-subvector to i1-vector.
   5256 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   5257                                 const X86Subtarget &Subtarget) {
   5258 
   5259   SDLoc dl(Op);
   5260   SDValue Vec = Op.getOperand(0);
   5261   SDValue SubVec = Op.getOperand(1);
   5262   SDValue Idx = Op.getOperand(2);
   5263 
   5264   if (!isa<ConstantSDNode>(Idx))
   5265     return SDValue();
   5266 
   5267   // Inserting undef is a nop. We can just return the original vector.
   5268   if (SubVec.isUndef())
   5269     return Vec;
   5270 
   5271   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   5272   if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
   5273     return Op;
   5274 
   5275   MVT OpVT = Op.getSimpleValueType();
   5276   unsigned NumElems = OpVT.getVectorNumElements();
   5277 
   5278   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
   5279 
   5280   // Extend to natively supported kshift.
   5281   MVT WideOpVT = OpVT;
   5282   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
   5283     WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
   5284 
   5285   // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
   5286   // if necessary.
   5287   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
   5288     // May need to promote to a legal type.
   5289     Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
   5290                      getZeroVector(WideOpVT, Subtarget, DAG, dl),
   5291                      SubVec, Idx);
   5292     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
   5293   }
   5294 
   5295   MVT SubVecVT = SubVec.getSimpleValueType();
   5296   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
   5297 
   5298   assert(IdxVal + SubVecNumElems <= NumElems &&
   5299          IdxVal % SubVecVT.getSizeInBits() == 0 &&
   5300          "Unexpected index value in INSERT_SUBVECTOR");
   5301 
   5302   SDValue Undef = DAG.getUNDEF(WideOpVT);
   5303 
   5304   if (IdxVal == 0) {
   5305     // Zero lower bits of the Vec
   5306     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
   5307     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
   5308                       ZeroIdx);
   5309     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
   5310     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
   5311     // Merge them together, SubVec should be zero extended.
   5312     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
   5313                          getZeroVector(WideOpVT, Subtarget, DAG, dl),
   5314                          SubVec, ZeroIdx);
   5315     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
   5316     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
   5317   }
   5318 
   5319   SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
   5320                        Undef, SubVec, ZeroIdx);
   5321 
   5322   if (Vec.isUndef()) {
   5323     assert(IdxVal != 0 && "Unexpected index");
   5324     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
   5325                          DAG.getConstant(IdxVal, dl, MVT::i8));
   5326     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
   5327   }
   5328 
   5329   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
   5330     assert(IdxVal != 0 && "Unexpected index");
   5331     NumElems = WideOpVT.getVectorNumElements();
   5332     unsigned ShiftLeft = NumElems - SubVecNumElems;
   5333     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
   5334     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
   5335                          DAG.getConstant(ShiftLeft, dl, MVT::i8));
   5336     if (ShiftRight != 0)
   5337       SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
   5338                            DAG.getConstant(ShiftRight, dl, MVT::i8));
   5339     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
   5340   }
   5341 
    5342   // Simple case: the subvector goes into the upper part.
   5343   if (IdxVal + SubVecNumElems == NumElems) {
   5344     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
   5345                          DAG.getConstant(IdxVal, dl, MVT::i8));
   5346     if (SubVecNumElems * 2 == NumElems) {
    5347       // Special case: use a legal zero-extending insert_subvector. This allows
    5348       // isel to optimize when bits are known zero.
   5349       Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
   5350       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
   5351                         getZeroVector(WideOpVT, Subtarget, DAG, dl),
   5352                         Vec, ZeroIdx);
   5353     } else {
   5354       // Otherwise use explicit shifts to zero the bits.
   5355       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
   5356                         Undef, Vec, ZeroIdx);
   5357       NumElems = WideOpVT.getVectorNumElements();
   5358       SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
   5359       Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
   5360       Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
   5361     }
   5362     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
   5363     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
   5364   }
   5365 
   5366   // Inserting into the middle is more complicated.
   5367 
   5368   NumElems = WideOpVT.getVectorNumElements();
   5369 
   5370   // Widen the vector if needed.
   5371   Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    5372   // Move the current value of the bits being replaced down to the lsbs.
   5373   Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
   5374                    DAG.getConstant(IdxVal, dl, MVT::i8));
   5375   // Xor with the new bit.
   5376   Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
   5377   // Shift to MSB, filling bottom bits with 0.
   5378   unsigned ShiftLeft = NumElems - SubVecNumElems;
   5379   Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
   5380                    DAG.getConstant(ShiftLeft, dl, MVT::i8));
   5381   // Shift to the final position, filling upper bits with 0.
   5382   unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
   5383   Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
   5384                        DAG.getConstant(ShiftRight, dl, MVT::i8));
   5385   // Xor with original vector leaving the new value.
   5386   Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
   5387   // Reduce to original width if needed.
   5388   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
   5389 }
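         // For illustration of insert1BitVector's final path: inserting a v2i1
         // subvector at index 2 of a v8i1 vector shifts the original mask right by
         // 2, XORs in the new bits, shifts that 2-bit difference up to the MSBs
         // and back down to bit 2 (zeroing everything else), and XORs with the
         // original vector, so only bits 2 and 3 end up changed.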
   5390 
   5391 static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
   5392                                 unsigned NumElems, SelectionDAG &DAG,
   5393                                 const SDLoc &dl, unsigned VectorWidth) {
   5394   SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
   5395   return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
   5396 }
   5397 
   5398 /// Returns a vector of specified type with all bits set.
   5399 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
   5400 /// Then bitcast to their original type, ensuring they get CSE'd.
   5401 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
   5402   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
   5403          "Expected a 128/256/512-bit vector type");
   5404 
   5405   APInt Ones = APInt::getAllOnesValue(32);
   5406   unsigned NumElts = VT.getSizeInBits() / 32;
   5407   SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
   5408   return DAG.getBitcast(VT, Vec);
   5409 }
   5410 
   5411 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
   5412                               SelectionDAG &DAG) {
   5413   EVT InVT = In.getValueType();
   5414   assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
   5415 
   5416   if (VT.is128BitVector() && InVT.is128BitVector())
   5417     return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
   5418                                 : DAG.getZeroExtendVectorInReg(In, DL, VT);
   5419 
   5420   // For 256-bit vectors, we only need the lower (128-bit) input half.
   5421   // For 512-bit vectors, we only need the lower input half or quarter.
   5422   if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
   5423     int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
   5424     In = extractSubVector(In, 0, DAG, DL,
   5425                           std::max(128, (int)VT.getSizeInBits() / Scale));
   5426   }
   5427 
   5428   return DAG.getNode(Opc, DL, VT, In);
   5429 }
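         // For illustration: a getExtendInVec zero-extension to v16i32 from a
         // 512-bit v64i8 source only needs the low quarter of the input, so the
         // code above first extracts the low 128 bits (a v16i8) and then emits the
         // VZEXT node on that narrower value.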
   5430 
   5431 /// Returns a vector_shuffle node for an unpackl operation.
   5432 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
   5433                           SDValue V1, SDValue V2) {
   5434   SmallVector<int, 8> Mask;
   5435   createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
   5436   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
   5437 }
   5438 
   5439 /// Returns a vector_shuffle node for an unpackh operation.
   5440 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
   5441                           SDValue V1, SDValue V2) {
   5442   SmallVector<int, 8> Mask;
   5443   createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
   5444   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
   5445 }
   5446 
    5447 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
   5448 /// This produces a shuffle where the low element of V2 is swizzled into the
   5449 /// zero/undef vector, landing at element Idx.
   5450 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
   5451 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
   5452                                            bool IsZero,
   5453                                            const X86Subtarget &Subtarget,
   5454                                            SelectionDAG &DAG) {
   5455   MVT VT = V2.getSimpleValueType();
   5456   SDValue V1 = IsZero
   5457     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   5458   int NumElems = VT.getVectorNumElements();
   5459   SmallVector<int, 16> MaskVec(NumElems);
   5460   for (int i = 0; i != NumElems; ++i)
   5461     // If this is the insertion idx, put the low elt of V2 here.
   5462     MaskVec[i] = (i == Idx) ? NumElems : i;
   5463   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
   5464 }
   5465 
   5466 static SDValue peekThroughBitcasts(SDValue V) {
   5467   while (V.getNode() && V.getOpcode() == ISD::BITCAST)
   5468     V = V.getOperand(0);
   5469   return V;
   5470 }
   5471 
   5472 static SDValue peekThroughOneUseBitcasts(SDValue V) {
   5473   while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
   5474          V.getOperand(0).hasOneUse())
   5475     V = V.getOperand(0);
   5476   return V;
   5477 }
   5478 
   5479 // Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
   5480 static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
   5481   while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
   5482     V = V.getOperand(0);
   5483   return V;
   5484 }
   5485 
   5486 static const Constant *getTargetConstantFromNode(SDValue Op) {
   5487   Op = peekThroughBitcasts(Op);
   5488 
   5489   auto *Load = dyn_cast<LoadSDNode>(Op);
   5490   if (!Load)
   5491     return nullptr;
   5492 
   5493   SDValue Ptr = Load->getBasePtr();
   5494   if (Ptr->getOpcode() == X86ISD::Wrapper ||
   5495       Ptr->getOpcode() == X86ISD::WrapperRIP)
   5496     Ptr = Ptr->getOperand(0);
   5497 
   5498   auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
   5499   if (!CNode || CNode->isMachineConstantPoolEntry())
   5500     return nullptr;
   5501 
   5502   return dyn_cast<Constant>(CNode->getConstVal());
   5503 }
   5504 
   5505 // Extract raw constant bits from constant pools.
   5506 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
   5507                                           APInt &UndefElts,
   5508                                           SmallVectorImpl<APInt> &EltBits,
   5509                                           bool AllowWholeUndefs = true,
   5510                                           bool AllowPartialUndefs = true) {
   5511   assert(EltBits.empty() && "Expected an empty EltBits vector");
   5512 
   5513   Op = peekThroughBitcasts(Op);
   5514 
   5515   EVT VT = Op.getValueType();
   5516   unsigned SizeInBits = VT.getSizeInBits();
   5517   assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
   5518   unsigned NumElts = SizeInBits / EltSizeInBits;
   5519 
   5520   // Bitcast a source array of element bits to the target size.
   5521   auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
   5522     unsigned NumSrcElts = UndefSrcElts.getBitWidth();
   5523     unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
   5524     assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
   5525            "Constant bit sizes don't match");
   5526 
   5527     // Don't split if we don't allow undef bits.
   5528     bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
   5529     if (UndefSrcElts.getBoolValue() && !AllowUndefs)
   5530       return false;
   5531 
   5532     // If we're already the right size, don't bother bitcasting.
   5533     if (NumSrcElts == NumElts) {
   5534       UndefElts = UndefSrcElts;
   5535       EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
   5536       return true;
   5537     }
   5538 
   5539     // Extract all the undef/constant element data and pack into single bitsets.
   5540     APInt UndefBits(SizeInBits, 0);
   5541     APInt MaskBits(SizeInBits, 0);
   5542 
   5543     for (unsigned i = 0; i != NumSrcElts; ++i) {
   5544       unsigned BitOffset = i * SrcEltSizeInBits;
   5545       if (UndefSrcElts[i])
   5546         UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
   5547       MaskBits.insertBits(SrcEltBits[i], BitOffset);
   5548     }
   5549 
   5550     // Split the undef/constant single bitset data into the target elements.
   5551     UndefElts = APInt(NumElts, 0);
   5552     EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
   5553 
   5554     for (unsigned i = 0; i != NumElts; ++i) {
   5555       unsigned BitOffset = i * EltSizeInBits;
   5556       APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
   5557 
   5558       // Only treat an element as UNDEF if all bits are UNDEF.
   5559       if (UndefEltBits.isAllOnesValue()) {
   5560         if (!AllowWholeUndefs)
   5561           return false;
   5562         UndefElts.setBit(i);
   5563         continue;
   5564       }
   5565 
   5566       // If only some bits are UNDEF then treat them as zero (or bail if not
   5567       // supported).
   5568       if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
   5569         return false;
   5570 
   5571       APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
   5572       EltBits[i] = Bits.getZExtValue();
   5573     }
   5574     return true;
   5575   };
   5576 
   5577   // Collect constant bits and insert into mask/undef bit masks.
   5578   auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
   5579                                 unsigned UndefBitIndex) {
   5580     if (!Cst)
   5581       return false;
   5582     if (isa<UndefValue>(Cst)) {
   5583       Undefs.setBit(UndefBitIndex);
   5584       return true;
   5585     }
   5586     if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
   5587       Mask = CInt->getValue();
   5588       return true;
   5589     }
   5590     if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
   5591       Mask = CFP->getValueAPF().bitcastToAPInt();
   5592       return true;
   5593     }
   5594     return false;
   5595   };
   5596 
   5597   // Handle UNDEFs.
   5598   if (Op.isUndef()) {
   5599     APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
   5600     SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
   5601     return CastBitData(UndefSrcElts, SrcEltBits);
   5602   }
   5603 
   5604   // Extract scalar constant bits.
   5605   if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
   5606     APInt UndefSrcElts = APInt::getNullValue(1);
   5607     SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
   5608     return CastBitData(UndefSrcElts, SrcEltBits);
   5609   }
   5610   if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
   5611     APInt UndefSrcElts = APInt::getNullValue(1);
   5612     APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
   5613     SmallVector<APInt, 64> SrcEltBits(1, RawBits);
   5614     return CastBitData(UndefSrcElts, SrcEltBits);
   5615   }
   5616 
   5617   // Extract constant bits from build vector.
   5618   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
   5619     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
   5620     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
   5621 
   5622     APInt UndefSrcElts(NumSrcElts, 0);
   5623     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
   5624     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
   5625       const SDValue &Src = Op.getOperand(i);
   5626       if (Src.isUndef()) {
   5627         UndefSrcElts.setBit(i);
   5628         continue;
   5629       }
   5630       auto *Cst = cast<ConstantSDNode>(Src);
   5631       SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
   5632     }
   5633     return CastBitData(UndefSrcElts, SrcEltBits);
   5634   }
   5635 
   5636   // Extract constant bits from constant pool vector.
   5637   if (auto *Cst = getTargetConstantFromNode(Op)) {
   5638     Type *CstTy = Cst->getType();
   5639     if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
   5640       return false;
   5641 
   5642     unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
   5643     unsigned NumSrcElts = CstTy->getVectorNumElements();
   5644 
   5645     APInt UndefSrcElts(NumSrcElts, 0);
   5646     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
   5647     for (unsigned i = 0; i != NumSrcElts; ++i)
   5648       if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
   5649                                UndefSrcElts, i))
   5650         return false;
   5651 
   5652     return CastBitData(UndefSrcElts, SrcEltBits);
   5653   }
   5654 
   5655   // Extract constant bits from a broadcasted constant pool scalar.
   5656   if (Op.getOpcode() == X86ISD::VBROADCAST &&
   5657       EltSizeInBits <= VT.getScalarSizeInBits()) {
   5658     if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
   5659       unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
   5660       unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
   5661 
   5662       APInt UndefSrcElts(NumSrcElts, 0);
   5663       SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
   5664       if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
   5665         if (UndefSrcElts[0])
   5666           UndefSrcElts.setBits(0, NumSrcElts);
   5667         SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
   5668         return CastBitData(UndefSrcElts, SrcEltBits);
   5669       }
   5670     }
   5671   }
   5672 
   5673   // Extract a rematerialized scalar constant insertion.
   5674   if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
   5675       Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
   5676       isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
   5677     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
   5678     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
   5679 
   5680     APInt UndefSrcElts(NumSrcElts, 0);
   5681     SmallVector<APInt, 64> SrcEltBits;
   5682     auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
   5683     SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
   5684     SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
   5685     return CastBitData(UndefSrcElts, SrcEltBits);
   5686   }
   5687 
   5688   return false;
   5689 }
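         // For illustration: asking getTargetConstantBitsFromNode for 64-bit
         // elements of a v4i32 build vector {1, 2, 3, 4} repacks the bits
         // little-endian and yields the two APInts 0x0000000200000001 and
         // 0x0000000400000003; an element is only reported in UndefElts when every
         // one of its source bits is undef.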
   5690 
   5691 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
   5692                                         unsigned MaskEltSizeInBits,
   5693                                         SmallVectorImpl<uint64_t> &RawMask) {
   5694   APInt UndefElts;
   5695   SmallVector<APInt, 64> EltBits;
   5696 
   5697   // Extract the raw target constant bits.
   5698   // FIXME: We currently don't support UNDEF bits or mask entries.
   5699   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
   5700                                      EltBits, /* AllowWholeUndefs */ false,
   5701                                      /* AllowPartialUndefs */ false))
   5702     return false;
   5703 
   5704   // Insert the extracted elements into the mask.
   5705   for (APInt Elt : EltBits)
   5706     RawMask.push_back(Elt.getZExtValue());
   5707 
   5708   return true;
   5709 }
   5710 
   5711 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
   5712 /// Note: This ignores saturation, so inputs must be checked first.
   5713 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
   5714                                   bool Unary) {
   5715   assert(Mask.empty() && "Expected an empty shuffle mask vector");
   5716   unsigned NumElts = VT.getVectorNumElements();
   5717   unsigned NumLanes = VT.getSizeInBits() / 128;
   5718   unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
   5719   unsigned Offset = Unary ? 0 : NumElts;
   5720 
   5721   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
   5722     for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
   5723       Mask.push_back(Elt + (Lane * NumEltsPerLane));
   5724     for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
   5725       Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
   5726   }
   5727 }
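         // For illustration: for VT = v16i8 with Unary = false, createPackShuffleMask
         // produces the mask {0,2,4,...,14, 16,18,...,30}, i.e. the even bytes of
         // the first v8i16 operand followed by the even bytes of the second,
         // matching the bytes a PACKUSWB of the two inputs leaves in each 128-bit
         // lane.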
   5728 
   5729 /// Calculates the shuffle mask corresponding to the target-specific opcode.
   5730 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
   5731 /// operands in \p Ops, and returns true.
   5732 /// Sets \p IsUnary to true if only one source is used. Note that this will set
   5733 /// IsUnary for shuffles which use a single input multiple times, and in those
   5734 /// cases it will adjust the mask to only have indices within that single input.
   5735 /// It is an error to call this with non-empty Mask/Ops vectors.
   5736 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   5737                                  SmallVectorImpl<SDValue> &Ops,
   5738                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   5739   unsigned NumElems = VT.getVectorNumElements();
   5740   SDValue ImmN;
   5741 
   5742   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
   5743   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
   5744 
   5745   IsUnary = false;
   5746   bool IsFakeUnary = false;
   5747   switch(N->getOpcode()) {
   5748   case X86ISD::BLENDI:
   5749     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5750     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5751     ImmN = N->getOperand(N->getNumOperands()-1);
   5752     DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5753     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5754     break;
   5755   case X86ISD::SHUFP:
   5756     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5757     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5758     ImmN = N->getOperand(N->getNumOperands()-1);
   5759     DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
   5760                     cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5761     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5762     break;
   5763   case X86ISD::INSERTPS:
   5764     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5765     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5766     ImmN = N->getOperand(N->getNumOperands()-1);
   5767     DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5768     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5769     break;
   5770   case X86ISD::EXTRQI:
   5771     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5772     if (isa<ConstantSDNode>(N->getOperand(1)) &&
   5773         isa<ConstantSDNode>(N->getOperand(2))) {
   5774       int BitLen = N->getConstantOperandVal(1);
   5775       int BitIdx = N->getConstantOperandVal(2);
   5776       DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
   5777                        Mask);
   5778       IsUnary = true;
   5779     }
   5780     break;
   5781   case X86ISD::INSERTQI:
   5782     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5783     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5784     if (isa<ConstantSDNode>(N->getOperand(2)) &&
   5785         isa<ConstantSDNode>(N->getOperand(3))) {
   5786       int BitLen = N->getConstantOperandVal(2);
   5787       int BitIdx = N->getConstantOperandVal(3);
   5788       DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
   5789                          Mask);
   5790       IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5791     }
   5792     break;
   5793   case X86ISD::UNPCKH:
   5794     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5795     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5796     DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
   5797     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5798     break;
   5799   case X86ISD::UNPCKL:
   5800     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5801     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5802     DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
   5803     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5804     break;
   5805   case X86ISD::MOVHLPS:
   5806     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5807     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5808     DecodeMOVHLPSMask(NumElems, Mask);
   5809     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5810     break;
   5811   case X86ISD::MOVLHPS:
   5812     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5813     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5814     DecodeMOVLHPSMask(NumElems, Mask);
   5815     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5816     break;
   5817   case X86ISD::PALIGNR:
   5818     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
   5819     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5820     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5821     ImmN = N->getOperand(N->getNumOperands()-1);
   5822     DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
   5823                       Mask);
   5824     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5825     Ops.push_back(N->getOperand(1));
   5826     Ops.push_back(N->getOperand(0));
   5827     break;
   5828   case X86ISD::VSHLDQ:
   5829     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
   5830     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5831     ImmN = N->getOperand(N->getNumOperands() - 1);
   5832     DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
   5833                      Mask);
   5834     IsUnary = true;
   5835     break;
   5836   case X86ISD::VSRLDQ:
   5837     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
   5838     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5839     ImmN = N->getOperand(N->getNumOperands() - 1);
   5840     DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
   5841                      Mask);
   5842     IsUnary = true;
   5843     break;
   5844   case X86ISD::PSHUFD:
   5845   case X86ISD::VPERMILPI:
   5846     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5847     ImmN = N->getOperand(N->getNumOperands()-1);
   5848     DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
   5849                     cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5850     IsUnary = true;
   5851     break;
   5852   case X86ISD::PSHUFHW:
   5853     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5854     ImmN = N->getOperand(N->getNumOperands()-1);
   5855     DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
   5856                       Mask);
   5857     IsUnary = true;
   5858     break;
   5859   case X86ISD::PSHUFLW:
   5860     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5861     ImmN = N->getOperand(N->getNumOperands()-1);
   5862     DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
   5863                       Mask);
   5864     IsUnary = true;
   5865     break;
   5866   case X86ISD::VZEXT_MOVL:
   5867     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5868     DecodeZeroMoveLowMask(NumElems, Mask);
   5869     IsUnary = true;
   5870     break;
   5871   case X86ISD::VBROADCAST: {
   5872     SDValue N0 = N->getOperand(0);
   5873     // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
   5874     // add the pre-extracted value to the Ops vector.
   5875     if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
   5876         N0.getOperand(0).getValueType() == VT &&
   5877         N0.getConstantOperandVal(1) == 0)
   5878       Ops.push_back(N0.getOperand(0));
   5879 
   5880     // We only decode broadcasts of same-sized vectors, unless the broadcast
   5881     // came from an extract from the original width. If we found one, we
   5882     // pushed it to the Ops vector above.
   5883     if (N0.getValueType() == VT || !Ops.empty()) {
   5884       DecodeVectorBroadcast(NumElems, Mask);
   5885       IsUnary = true;
   5886       break;
   5887     }
   5888     return false;
   5889   }
   5890   case X86ISD::VPERMILPV: {
   5891     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5892     IsUnary = true;
   5893     SDValue MaskNode = N->getOperand(1);
   5894     unsigned MaskEltSize = VT.getScalarSizeInBits();
   5895     SmallVector<uint64_t, 32> RawMask;
   5896     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
   5897       DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
   5898       break;
   5899     }
   5900     if (auto *C = getTargetConstantFromNode(MaskNode)) {
   5901       DecodeVPERMILPMask(C, MaskEltSize, Mask);
   5902       break;
   5903     }
   5904     return false;
   5905   }
   5906   case X86ISD::PSHUFB: {
   5907     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
   5908     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5909     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5910     IsUnary = true;
   5911     SDValue MaskNode = N->getOperand(1);
   5912     SmallVector<uint64_t, 32> RawMask;
   5913     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
   5914       DecodePSHUFBMask(RawMask, Mask);
   5915       break;
   5916     }
   5917     if (auto *C = getTargetConstantFromNode(MaskNode)) {
   5918       DecodePSHUFBMask(C, Mask);
   5919       break;
   5920     }
   5921     return false;
   5922   }
   5923   case X86ISD::VPERMI:
   5924     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5925     ImmN = N->getOperand(N->getNumOperands()-1);
   5926     DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
   5927     IsUnary = true;
   5928     break;
   5929   case X86ISD::MOVSS:
   5930   case X86ISD::MOVSD:
   5931     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5932     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5933     DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
   5934     break;
   5935   case X86ISD::VPERM2X128:
   5936     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5937     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5938     ImmN = N->getOperand(N->getNumOperands()-1);
   5939     DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
   5940                          Mask);
   5941     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5942     break;
   5943   case X86ISD::SHUF128:
   5944     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5945     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5946     ImmN = N->getOperand(N->getNumOperands()-1);
   5947     decodeVSHUF64x2FamilyMask(NumElems, VT.getScalarSizeInBits(),
   5948                               cast<ConstantSDNode>(ImmN)->getZExtValue(),
   5949                               Mask);
   5950     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5951     break;
   5952   case X86ISD::MOVSLDUP:
   5953     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5954     DecodeMOVSLDUPMask(NumElems, Mask);
   5955     IsUnary = true;
   5956     break;
   5957   case X86ISD::MOVSHDUP:
   5958     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5959     DecodeMOVSHDUPMask(NumElems, Mask);
   5960     IsUnary = true;
   5961     break;
   5962   case X86ISD::MOVDDUP:
   5963     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5964     DecodeMOVDDUPMask(NumElems, Mask);
   5965     IsUnary = true;
   5966     break;
   5967   case X86ISD::VPERMIL2: {
   5968     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5969     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5970     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5971     unsigned MaskEltSize = VT.getScalarSizeInBits();
   5972     SDValue MaskNode = N->getOperand(2);
   5973     SDValue CtrlNode = N->getOperand(3);
   5974     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
   5975       unsigned CtrlImm = CtrlOp->getZExtValue();
   5976       SmallVector<uint64_t, 32> RawMask;
   5977       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
   5978         DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
   5979                             RawMask, Mask);
   5980         break;
   5981       }
   5982       if (auto *C = getTargetConstantFromNode(MaskNode)) {
   5983         DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
   5984         break;
   5985       }
   5986     }
   5987     return false;
   5988   }
   5989   case X86ISD::VPPERM: {
   5990     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   5991     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   5992     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
   5993     SDValue MaskNode = N->getOperand(2);
   5994     SmallVector<uint64_t, 32> RawMask;
   5995     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
   5996       DecodeVPPERMMask(RawMask, Mask);
   5997       break;
   5998     }
   5999     if (auto *C = getTargetConstantFromNode(MaskNode)) {
   6000       DecodeVPPERMMask(C, Mask);
   6001       break;
   6002     }
   6003     return false;
   6004   }
   6005   case X86ISD::VPERMV: {
   6006     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
   6007     IsUnary = true;
   6008     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
   6009     Ops.push_back(N->getOperand(1));
   6010     SDValue MaskNode = N->getOperand(0);
   6011     SmallVector<uint64_t, 32> RawMask;
   6012     unsigned MaskEltSize = VT.getScalarSizeInBits();
   6013     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
   6014       DecodeVPERMVMask(RawMask, Mask);
   6015       break;
   6016     }
   6017     if (auto *C = getTargetConstantFromNode(MaskNode)) {
   6018       DecodeVPERMVMask(C, MaskEltSize, Mask);
   6019       break;
   6020     }
   6021     return false;
   6022   }
   6023   case X86ISD::VPERMV3: {
   6024     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
   6025     assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
   6026     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
   6027     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
   6028     Ops.push_back(N->getOperand(0));
   6029     Ops.push_back(N->getOperand(2));
   6030     SDValue MaskNode = N->getOperand(1);
   6031     unsigned MaskEltSize = VT.getScalarSizeInBits();
   6032     if (auto *C = getTargetConstantFromNode(MaskNode)) {
   6033       DecodeVPERMV3Mask(C, MaskEltSize, Mask);
   6034       break;
   6035     }
   6036     return false;
   6037   }
   6038   default: llvm_unreachable("unknown target shuffle node");
   6039   }
   6040 
   6041   // Empty mask indicates the decode failed.
   6042   if (Mask.empty())
   6043     return false;
   6044 
   6045   // Check if we're getting a shuffle mask with zeroed elements.
   6046   if (!AllowSentinelZero)
   6047     if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
   6048       return false;
   6049 
   6050   // If we have a fake unary shuffle, the shuffle mask is spread across two
   6051   // inputs that are actually the same node. Re-map the mask to always point
   6052   // into the first input.
   6053   if (IsFakeUnary)
   6054     for (int &M : Mask)
   6055       if (M >= (int)Mask.size())
   6056         M -= Mask.size();
   6057 
   6058   // If we didn't already add operands in the opcode-specific code, default to
   6059   // adding 1 or 2 operands starting at 0.
   6060   if (Ops.empty()) {
   6061     Ops.push_back(N->getOperand(0));
   6062     if (!IsUnary || IsFakeUnary)
   6063       Ops.push_back(N->getOperand(1));
   6064   }
   6065 
   6066   return true;
   6067 }
   6068 
   6069 /// Check a target shuffle mask's inputs to see if we can set any values to
   6070 /// SM_SentinelZero - this is for elements that are known to be zero
   6071 /// (not just zeroable) from their inputs.
   6072 /// Returns true if the target shuffle mask was decoded.
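        /// For example, if one shuffle input is a build vector of constant zeros,
        /// every mask element that reads from it is replaced with SM_SentinelZero.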
   6073 static bool setTargetShuffleZeroElements(SDValue N,
   6074                                          SmallVectorImpl<int> &Mask,
   6075                                          SmallVectorImpl<SDValue> &Ops) {
   6076   bool IsUnary;
   6077   if (!isTargetShuffle(N.getOpcode()))
   6078     return false;
   6079 
   6080   MVT VT = N.getSimpleValueType();
   6081   if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
   6082     return false;
   6083 
   6084   SDValue V1 = Ops[0];
   6085   SDValue V2 = IsUnary ? V1 : Ops[1];
   6086 
   6087   V1 = peekThroughBitcasts(V1);
   6088   V2 = peekThroughBitcasts(V2);
   6089 
   6090   assert((VT.getSizeInBits() % Mask.size()) == 0 &&
   6091          "Illegal split of shuffle value type");
   6092   unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
   6093 
   6094   // Extract known constant input data.
   6095   APInt UndefSrcElts[2];
   6096   SmallVector<APInt, 32> SrcEltBits[2];
   6097   bool IsSrcConstant[2] = {
   6098       getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
   6099                                     SrcEltBits[0], true, false),
   6100       getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
   6101                                     SrcEltBits[1], true, false)};
   6102 
   6103   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   6104     int M = Mask[i];
   6105 
   6106     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
   6107     if (M < 0)
   6108       continue;
   6109 
   6110     // Determine shuffle input and normalize the mask.
   6111     unsigned SrcIdx = M / Size;
   6112     SDValue V = M < Size ? V1 : V2;
   6113     M %= Size;
   6114 
   6115     // We are referencing an UNDEF input.
   6116     if (V.isUndef()) {
   6117       Mask[i] = SM_SentinelUndef;
   6118       continue;
   6119     }
   6120 
   6121     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
   6122     // TODO: We currently only set UNDEF for integer types - floats use the same
   6123     // registers as vectors and many of the scalar folded loads rely on the
   6124     // SCALAR_TO_VECTOR pattern.
   6125     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   6126         (Size % V.getValueType().getVectorNumElements()) == 0) {
   6127       int Scale = Size / V.getValueType().getVectorNumElements();
   6128       int Idx = M / Scale;
   6129       if (Idx != 0 && !VT.isFloatingPoint())
   6130         Mask[i] = SM_SentinelUndef;
   6131       else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
   6132         Mask[i] = SM_SentinelZero;
   6133       continue;
   6134     }
   6135 
   6136     // Attempt to extract from the source's constant bits.
   6137     if (IsSrcConstant[SrcIdx]) {
   6138       if (UndefSrcElts[SrcIdx][M])
   6139         Mask[i] = SM_SentinelUndef;
   6140       else if (SrcEltBits[SrcIdx][M] == 0)
   6141         Mask[i] = SM_SentinelZero;
   6142     }
   6143   }
   6144 
   6145   assert(VT.getVectorNumElements() == Mask.size() &&
   6146          "Different mask size from vector size!");
   6147   return true;
   6148 }
   6149 
   6150 // Attempt to decode ops that could be represented as a shuffle mask.
   6151 // The decoded shuffle mask may contain a different number of elements than
   6152 // the destination value type.
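        // For example, (and v4i32 X, <-1,0,-1,0>) decodes to the 16-element per-byte
        // mask <0,1,2,3,Z,Z,Z,Z,8,9,10,11,Z,Z,Z,Z> (Z = SM_SentinelZero), with X as
        // the single input.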
   6153 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
   6154                                SmallVectorImpl<SDValue> &Ops,
   6155                                const SelectionDAG &DAG) {
   6156   Mask.clear();
   6157   Ops.clear();
   6158 
   6159   MVT VT = N.getSimpleValueType();
   6160   unsigned NumElts = VT.getVectorNumElements();
   6161   unsigned NumSizeInBits = VT.getSizeInBits();
   6162   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
   6163   assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
   6164          "Expected byte aligned value types");
   6165 
   6166   unsigned Opcode = N.getOpcode();
   6167   switch (Opcode) {
   6168   case ISD::VECTOR_SHUFFLE: {
   6169     // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
   6170     ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
   6171     if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
   6172       Mask.append(ShuffleMask.begin(), ShuffleMask.end());
   6173       Ops.push_back(N.getOperand(0));
   6174       Ops.push_back(N.getOperand(1));
   6175       return true;
   6176     }
   6177     return false;
   6178   }
   6179   case ISD::AND:
   6180   case X86ISD::ANDNP: {
   6181     // Attempt to decode as a per-byte mask.
   6182     APInt UndefElts;
   6183     SmallVector<APInt, 32> EltBits;
   6184     SDValue N0 = N.getOperand(0);
   6185     SDValue N1 = N.getOperand(1);
   6186     bool IsAndN = (X86ISD::ANDNP == Opcode);
   6187     uint64_t ZeroMask = IsAndN ? 255 : 0;
   6188     if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
   6189       return false;
   6190     for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
   6191       if (UndefElts[i]) {
   6192         Mask.push_back(SM_SentinelUndef);
   6193         continue;
   6194       }
   6195       uint64_t ByteBits = EltBits[i].getZExtValue();
   6196       if (ByteBits != 0 && ByteBits != 255)
   6197         return false;
   6198       Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
   6199     }
   6200     Ops.push_back(IsAndN ? N1 : N0);
   6201     return true;
   6202   }
   6203   case ISD::SCALAR_TO_VECTOR: {
   6204     // Match against a scalar_to_vector of an extract from a vector;
   6205     // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
   6206     SDValue N0 = N.getOperand(0);
   6207     SDValue SrcExtract;
   6208 
   6209     if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   6210          N0.getOperand(0).getValueType() == VT) ||
   6211         (N0.getOpcode() == X86ISD::PEXTRW &&
   6212          N0.getOperand(0).getValueType() == MVT::v8i16) ||
   6213         (N0.getOpcode() == X86ISD::PEXTRB &&
   6214          N0.getOperand(0).getValueType() == MVT::v16i8)) {
   6215       SrcExtract = N0;
   6216     }
   6217 
   6218     if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
   6219       return false;
   6220 
   6221     SDValue SrcVec = SrcExtract.getOperand(0);
   6222     EVT SrcVT = SrcVec.getValueType();
   6223     unsigned NumSrcElts = SrcVT.getVectorNumElements();
   6224     unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
   6225 
   6226     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
   6227     if (NumSrcElts <= SrcIdx)
   6228       return false;
   6229 
   6230     Ops.push_back(SrcVec);
   6231     Mask.push_back(SrcIdx);
   6232     Mask.append(NumZeros, SM_SentinelZero);
   6233     Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
   6234     return true;
   6235   }
   6236   case X86ISD::PINSRB:
   6237   case X86ISD::PINSRW: {
   6238     SDValue InVec = N.getOperand(0);
   6239     SDValue InScl = N.getOperand(1);
   6240     SDValue InIndex = N.getOperand(2);
   6241     if (!isa<ConstantSDNode>(InIndex) ||
   6242         cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
   6243       return false;
   6244     uint64_t InIdx = N.getConstantOperandVal(2);
   6245 
   6246     // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
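            // e.g. (X86ISD::PINSRW X, 0, 2) on v8i16 decodes to <0,1,Z,3,4,5,6,7>,
            // where Z = SM_SentinelZero.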
   6247     if (X86::isZeroNode(InScl)) {
   6248       Ops.push_back(InVec);
   6249       for (unsigned i = 0; i != NumElts; ++i)
   6250         Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
   6251       return true;
   6252     }
   6253 
   6254     // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
   6255     // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
   6256     unsigned ExOp =
   6257         (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
   6258     if (InScl.getOpcode() != ExOp)
   6259       return false;
   6260 
   6261     SDValue ExVec = InScl.getOperand(0);
   6262     SDValue ExIndex = InScl.getOperand(1);
   6263     if (!isa<ConstantSDNode>(ExIndex) ||
   6264         cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
   6265       return false;
   6266     uint64_t ExIdx = InScl.getConstantOperandVal(1);
   6267 
   6268     Ops.push_back(InVec);
   6269     Ops.push_back(ExVec);
   6270     for (unsigned i = 0; i != NumElts; ++i)
   6271       Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
   6272     return true;
   6273   }
   6274   case X86ISD::PACKSS:
   6275   case X86ISD::PACKUS: {
   6276     SDValue N0 = N.getOperand(0);
   6277     SDValue N1 = N.getOperand(1);
   6278     assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
   6279            N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
   6280            "Unexpected input value type");
   6281 
   6282     // If we know input saturation won't happen, we can treat this
   6283     // as a truncation shuffle.
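            // PACKSS avoids saturation only if every input element already fits in
            // the low NumBitsPerElt bits as a signed value (i.e. it has more than
            // NumBitsPerElt sign bits); PACKUS requires the upper NumBitsPerElt bits
            // of every element to be known zero.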
   6284     if (Opcode == X86ISD::PACKSS) {
   6285       if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
   6286           (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
   6287         return false;
   6288     } else {
   6289       APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
   6290       if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
   6291           (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
   6292         return false;
   6293     }
   6294 
   6295     bool IsUnary = (N0 == N1);
   6296 
   6297     Ops.push_back(N0);
   6298     if (!IsUnary)
   6299       Ops.push_back(N1);
   6300 
   6301     createPackShuffleMask(VT, Mask, IsUnary);
   6302     return true;
   6303   }
   6304   case X86ISD::VSHLI:
   6305   case X86ISD::VSRLI: {
   6306     uint64_t ShiftVal = N.getConstantOperandVal(1);
   6307     // Out of range bit shifts are guaranteed to be zero.
   6308     if (NumBitsPerElt <= ShiftVal) {
   6309       Mask.append(NumElts, SM_SentinelZero);
   6310       return true;
   6311     }
   6312 
   6313     // We can only decode 'whole byte' bit shifts as shuffles.
   6314     if ((ShiftVal % 8) != 0)
   6315       break;
   6316 
   6317     uint64_t ByteShift = ShiftVal / 8;
   6318     unsigned NumBytes = NumSizeInBits / 8;
   6319     unsigned NumBytesPerElt = NumBitsPerElt / 8;
   6320     Ops.push_back(N.getOperand(0));
   6321 
   6322     // Clear mask to all zeros and insert the shifted byte indices.
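            // e.g. a v2i64 VSHLI by 8 bits decodes to the byte shuffle
            // <Z,0,1,2,3,4,5,6, Z,8,9,10,11,12,13,14> (Z = SM_SentinelZero).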
   6323     Mask.append(NumBytes, SM_SentinelZero);
   6324 
   6325     if (X86ISD::VSHLI == Opcode) {
   6326       for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
   6327         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
   6328           Mask[i + j] = i + j - ByteShift;
   6329     } else {
   6330       for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
   6331         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
   6332           Mask[i + j - ByteShift] = i + j;
   6333     }
   6334     return true;
   6335   }
   6336   case ISD::ZERO_EXTEND_VECTOR_INREG:
   6337   case X86ISD::VZEXT: {
   6338     // TODO - add support for VPMOVZX with smaller input vector types.
   6339     SDValue Src = N.getOperand(0);
   6340     MVT SrcVT = Src.getSimpleValueType();
   6341     if (NumSizeInBits != SrcVT.getSizeInBits())
   6342       break;
   6343     DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
   6344                          VT.getVectorNumElements(), Mask);
   6345     Ops.push_back(Src);
   6346     return true;
   6347   }
   6348   }
   6349 
   6350   return false;
   6351 }
   6352 
   6353 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
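        /// Mask elements index into the concatenation of the inputs; when an unused
        /// input is dropped, indices that refer to later inputs are shifted down by
        /// the mask width.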
   6354 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
   6355                                               SmallVectorImpl<int> &Mask) {
   6356   int MaskWidth = Mask.size();
   6357   SmallVector<SDValue, 16> UsedInputs;
   6358   for (int i = 0, e = Inputs.size(); i < e; ++i) {
   6359     int lo = UsedInputs.size() * MaskWidth;
   6360     int hi = lo + MaskWidth;
   6361 
   6362     // Strip UNDEF input usage.
   6363     if (Inputs[i].isUndef())
   6364       for (int &M : Mask)
   6365         if ((lo <= M) && (M < hi))
   6366           M = SM_SentinelUndef;
   6367 
   6368     // Keep this input only if the mask still references it.
   6369     if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
   6370       UsedInputs.push_back(Inputs[i]);
   6371       continue;
   6372     }
   6373     for (int &M : Mask)
   6374       if (lo <= M)
   6375         M -= MaskWidth;
   6376   }
   6377   Inputs = UsedInputs;
   6378 }
   6379 
   6380 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
   6381 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
   6382 /// remaining input indices in case we now have a unary shuffle and adjust the
   6383 /// inputs accordingly.
   6384 /// Returns true if the target shuffle mask was decoded.
   6385 static bool resolveTargetShuffleInputs(SDValue Op,
   6386                                        SmallVectorImpl<SDValue> &Inputs,
   6387                                        SmallVectorImpl<int> &Mask,
   6388                                        const SelectionDAG &DAG) {
   6389   if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
   6390     if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
   6391       return false;
   6392 
   6393   resolveTargetShuffleInputsAndMask(Inputs, Mask);
   6394   return true;
   6395 }
   6396 
   6397 /// Returns the scalar element that will make up the ith
   6398 /// element of the result of the vector shuffle.
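        /// Returns SDValue() if the scalar cannot be determined (e.g. the search
        /// depth limit is reached or an unhandled node is encountered).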
   6399 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   6400                                    unsigned Depth) {
   6401   if (Depth == 6)
   6402     return SDValue();  // Limit search depth.
   6403 
   6404   SDValue V = SDValue(N, 0);
   6405   EVT VT = V.getValueType();
   6406   unsigned Opcode = V.getOpcode();
   6407 
   6408   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
   6409   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
   6410     int Elt = SV->getMaskElt(Index);
   6411 
   6412     if (Elt < 0)
   6413       return DAG.getUNDEF(VT.getVectorElementType());
   6414 
   6415     unsigned NumElems = VT.getVectorNumElements();
   6416     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
   6417                                          : SV->getOperand(1);
   6418     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   6419   }
   6420 
   6421   // Recurse into target specific vector shuffles to find scalars.
   6422   if (isTargetShuffle(Opcode)) {
   6423     MVT ShufVT = V.getSimpleValueType();
   6424     MVT ShufSVT = ShufVT.getVectorElementType();
   6425     int NumElems = (int)ShufVT.getVectorNumElements();
   6426     SmallVector<int, 16> ShuffleMask;
   6427     SmallVector<SDValue, 16> ShuffleOps;
   6428     bool IsUnary;
   6429 
   6430     if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
   6431       return SDValue();
   6432 
   6433     int Elt = ShuffleMask[Index];
   6434     if (Elt == SM_SentinelZero)
   6435       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
   6436                                  : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
   6437     if (Elt == SM_SentinelUndef)
   6438       return DAG.getUNDEF(ShufSVT);
   6439 
   6440     assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
   6441     SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
   6442     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
   6443                                Depth+1);
   6444   }
   6445 
   6446   // Actual nodes that may contain scalar elements
   6447   if (Opcode == ISD::BITCAST) {
   6448     V = V.getOperand(0);
   6449     EVT SrcVT = V.getValueType();
   6450     unsigned NumElems = VT.getVectorNumElements();
   6451 
   6452     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
   6453       return SDValue();
   6454   }
   6455 
   6456   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
   6457     return (Index == 0) ? V.getOperand(0)
   6458                         : DAG.getUNDEF(VT.getVectorElementType());
   6459 
   6460   if (V.getOpcode() == ISD::BUILD_VECTOR)
   6461     return V.getOperand(Index);
   6462 
   6463   return SDValue();
   6464 }
   6465 
   6466 // Use PINSRB/PINSRW/PINSRD to create a build vector.
   6467 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
   6468                                         unsigned NumNonZero, unsigned NumZero,
   6469                                         SelectionDAG &DAG,
   6470                                         const X86Subtarget &Subtarget) {
   6471   MVT VT = Op.getSimpleValueType();
   6472   unsigned NumElts = VT.getVectorNumElements();
   6473   assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
   6474           ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
   6475          "Illegal vector insertion");
   6476 
   6477   SDLoc dl(Op);
   6478   SDValue V;
   6479   bool First = true;
   6480 
   6481   for (unsigned i = 0; i < NumElts; ++i) {
   6482     bool IsNonZero = (NonZeros & (1 << i)) != 0;
   6483     if (!IsNonZero)
   6484       continue;
   6485 
   6486     // If the build vector contains zeros or our first insertion is not at
   6487     // index 0, insert into a zero vector to break any register dependency;
   6488     // otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
   6489     if (First) {
   6490       First = false;
   6491       if (NumZero || 0 != i)
   6492         V = getZeroVector(VT, Subtarget, DAG, dl);
   6493       else {
   6494         assert(0 == i && "Expected insertion into zero-index");
   6495         V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
   6496         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
   6497         V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
   6498         V = DAG.getBitcast(VT, V);
   6499         continue;
   6500       }
   6501     }
   6502     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
   6503                     DAG.getIntPtrConstant(i, dl));
   6504   }
   6505 
   6506   return V;
   6507 }
   6508 
   6509 /// Custom lower build_vector of v16i8.
   6510 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   6511                                      unsigned NumNonZero, unsigned NumZero,
   6512                                      SelectionDAG &DAG,
   6513                                      const X86Subtarget &Subtarget) {
   6514   if (NumNonZero > 8 && !Subtarget.hasSSE41())
   6515     return SDValue();
   6516 
   6517   // SSE4.1 - use PINSRB to insert each byte directly.
   6518   if (Subtarget.hasSSE41())
   6519     return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
   6520                                     Subtarget);
   6521 
   6522   SDLoc dl(Op);
   6523   SDValue V;
   6524   bool First = true;
   6525 
   6526   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
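          // Non-zero bytes of each adjacent pair (i-1, i) are zero-extended to i16,
          // merged as (byte[i-1] | byte[i] << 8) and inserted as 16-bit element i/2.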
   6527   for (unsigned i = 0; i < 16; ++i) {
   6528     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
   6529     if (ThisIsNonZero && First) {
   6530       if (NumZero)
   6531         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
   6532       else
   6533         V = DAG.getUNDEF(MVT::v8i16);
   6534       First = false;
   6535     }
   6536 
   6537     if ((i & 1) != 0) {
   6538       // FIXME: Investigate extending to i32 instead of just i16.
   6539       // FIXME: Investigate combining the first 4 bytes as an i32 instead.
   6540       SDValue ThisElt, LastElt;
   6541       bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
   6542       if (LastIsNonZero) {
   6543         LastElt =
   6544             DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
   6545       }
   6546       if (ThisIsNonZero) {
   6547         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
   6548         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
   6549                               DAG.getConstant(8, dl, MVT::i8));
   6550         if (LastIsNonZero)
   6551           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
   6552       } else
   6553         ThisElt = LastElt;
   6554 
   6555       if (ThisElt) {
   6556         if (1 == i) {
   6557           V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
   6558                       : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
   6559           V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
   6560           V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
   6561           V = DAG.getBitcast(MVT::v8i16, V);
   6562         } else {
   6563           V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
   6564                           DAG.getIntPtrConstant(i / 2, dl));
   6565         }
   6566       }
   6567     }
   6568   }
   6569 
   6570   return DAG.getBitcast(MVT::v16i8, V);
   6571 }
   6572 
   6573 /// Custom lower build_vector of v8i16.
   6574 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   6575                                      unsigned NumNonZero, unsigned NumZero,
   6576                                      SelectionDAG &DAG,
   6577                                      const X86Subtarget &Subtarget) {
   6578   if (NumNonZero > 4 && !Subtarget.hasSSE41())
   6579     return SDValue();
   6580 
   6581   // Use PINSRW to insert each 16-bit element directly.
   6582   return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
   6583                                   Subtarget);
   6584 }
   6585 
   6586 /// Custom lower build_vector of v4i32 or v4f32.
   6587 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
   6588                                      const X86Subtarget &Subtarget) {
   6589   // Find all zeroable elements.
   6590   std::bitset<4> Zeroable;
   6591   for (int i=0; i < 4; ++i) {
   6592     SDValue Elt = Op->getOperand(i);
   6593     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
   6594   }
   6595   assert(Zeroable.size() - Zeroable.count() > 1 &&
   6596          "We expect at least two non-zero elements!");
   6597 
   6598   // We only know how to deal with build_vector nodes where elements are either
   6599   // zeroable or extract_vector_elt with constant index.
   6600   SDValue FirstNonZero;
   6601   unsigned FirstNonZeroIdx;
   6602   for (unsigned i=0; i < 4; ++i) {
   6603     if (Zeroable[i])
   6604       continue;
   6605     SDValue Elt = Op->getOperand(i);
   6606     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   6607         !isa<ConstantSDNode>(Elt.getOperand(1)))
   6608       return SDValue();
   6609     // Make sure that this node is extracting from a 128-bit vector.
   6610     MVT VT = Elt.getOperand(0).getSimpleValueType();
   6611     if (!VT.is128BitVector())
   6612       return SDValue();
   6613     if (!FirstNonZero.getNode()) {
   6614       FirstNonZero = Elt;
   6615       FirstNonZeroIdx = i;
   6616     }
   6617   }
   6618 
   6619   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
   6620   SDValue V1 = FirstNonZero.getOperand(0);
   6621   MVT VT = V1.getSimpleValueType();
   6622 
   6623   // See if this build_vector can be lowered as a blend with zero.
   6624   SDValue Elt;
   6625   unsigned EltMaskIdx, EltIdx;
   6626   int Mask[4];
   6627   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
   6628     if (Zeroable[EltIdx]) {
   6629       // The zero vector will be on the right hand side.
   6630       Mask[EltIdx] = EltIdx+4;
   6631       continue;
   6632     }
   6633 
   6634     Elt = Op->getOperand(EltIdx);
   6635     // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
   6636     EltMaskIdx = Elt.getConstantOperandVal(1);
   6637     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
   6638       break;
   6639     Mask[EltIdx] = EltIdx;
   6640   }
   6641 
   6642   if (EltIdx == 4) {
   6643     // Let the shuffle legalizer deal with blend operations.
   6644     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
   6645     if (V1.getSimpleValueType() != VT)
   6646       V1 = DAG.getBitcast(VT, V1);
   6647     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
   6648   }
   6649 
   6650   // See if we can lower this build_vector to a INSERTPS.
   6651   if (!Subtarget.hasSSE41())
   6652     return SDValue();
   6653 
   6654   SDValue V2 = Elt.getOperand(0);
   6655   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
   6656     V1 = SDValue();
   6657 
   6658   bool CanFold = true;
   6659   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
   6660     if (Zeroable[i])
   6661       continue;
   6662 
   6663     SDValue Current = Op->getOperand(i);
   6664     SDValue SrcVector = Current->getOperand(0);
   6665     if (!V1.getNode())
   6666       V1 = SrcVector;
   6667     CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
   6668   }
   6669 
   6670   if (!CanFold)
   6671     return SDValue();
   6672 
   6673   assert(V1.getNode() && "Expected at least two non-zero elements!");
   6674   if (V1.getSimpleValueType() != MVT::v4f32)
   6675     V1 = DAG.getBitcast(MVT::v4f32, V1);
   6676   if (V2.getSimpleValueType() != MVT::v4f32)
   6677     V2 = DAG.getBitcast(MVT::v4f32, V2);
   6678 
   6679   // Ok, we can emit an INSERTPS instruction.
   6680   unsigned ZMask = Zeroable.to_ulong();
   6681 
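          // INSERTPS immediate encoding: bits[7:6] = source element index (CountS),
          // bits[5:4] = destination element index (CountD), bits[3:0] = zero mask.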
   6682   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
   6683   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
   6684   SDLoc DL(Op);
   6685   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
   6686                                DAG.getIntPtrConstant(InsertPSMask, DL));
   6687   return DAG.getBitcast(VT, Result);
   6688 }
   6689 
   6690 /// Return a vector logical shift node.
   6691 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
   6692                          SelectionDAG &DAG, const TargetLowering &TLI,
   6693                          const SDLoc &dl) {
   6694   assert(VT.is128BitVector() && "Unknown type for VShift");
   6695   MVT ShVT = MVT::v16i8;
   6696   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
   6697   SrcOp = DAG.getBitcast(ShVT, SrcOp);
   6698   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
   6699   SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
   6700   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
   6701 }
   6702 
   6703 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
   6704                                       SelectionDAG &DAG) {
   6705 
   6706   // Check whether the scalar load can be widened into a vector load and, if
   6707   // the address is "base + cst", whether the cst can be "absorbed" into
   6708   // the shuffle mask.
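          // e.g. a splat of an f32 loaded from (stack object + 8) can be widened to
          // a v4f32 load of the (suitably realigned) object and a splat of element 2.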
   6709   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
   6710     SDValue Ptr = LD->getBasePtr();
   6711     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
   6712       return SDValue();
   6713     EVT PVT = LD->getValueType(0);
   6714     if (PVT != MVT::i32 && PVT != MVT::f32)
   6715       return SDValue();
   6716 
   6717     int FI = -1;
   6718     int64_t Offset = 0;
   6719     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
   6720       FI = FINode->getIndex();
   6721       Offset = 0;
   6722     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
   6723                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
   6724       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
   6725       Offset = Ptr.getConstantOperandVal(1);
   6726       Ptr = Ptr.getOperand(0);
   6727     } else {
   6728       return SDValue();
   6729     }
   6730 
   6731     // FIXME: 256-bit vector instructions don't require a strict alignment,
   6732     // improve this code to support it better.
   6733     unsigned RequiredAlign = VT.getSizeInBits()/8;
   6734     SDValue Chain = LD->getChain();
   6735     // Make sure the stack object alignment is at least 16 or 32.
   6736     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   6737     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
   6738       if (MFI.isFixedObjectIndex(FI)) {
   6739         // Can't change the alignment. FIXME: It's possible to compute the
   6740         // exact stack offset and reference FI + adjusted offset instead,
   6741         // if someone *really* cares about this.
   6742         return SDValue();
   6743       } else {
   6744         MFI.setObjectAlignment(FI, RequiredAlign);
   6745       }
   6746     }
   6747 
   6748     // (Offset % 16 or 32) must be a multiple of 4. The address is then
   6749     // Ptr + (Offset & ~(RequiredAlign - 1)).
   6750     if (Offset < 0)
   6751       return SDValue();
   6752     if ((Offset % RequiredAlign) & 3)
   6753       return SDValue();
   6754     int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
   6755     if (StartOffset) {
   6756       SDLoc DL(Ptr);
   6757       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
   6758                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
   6759     }
   6760 
   6761     int EltNo = (Offset - StartOffset) >> 2;
   6762     unsigned NumElems = VT.getVectorNumElements();
   6763 
   6764     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
   6765     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
   6766                              LD->getPointerInfo().getWithOffset(StartOffset));
   6767 
   6768     SmallVector<int, 8> Mask(NumElems, EltNo);
   6769 
   6770     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
   6771   }
   6772 
   6773   return SDValue();
   6774 }
   6775 
   6776 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
   6777 /// elements can be replaced by a single large load which has the same value as
   6778 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
   6779 ///
   6780 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
   6781 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   6782                                         const SDLoc &DL, SelectionDAG &DAG,
   6783                                         const X86Subtarget &Subtarget,
   6784                                         bool isAfterLegalize) {
   6785   unsigned NumElems = Elts.size();
   6786 
   6787   int LastLoadedElt = -1;
   6788   SmallBitVector LoadMask(NumElems, false);
   6789   SmallBitVector ZeroMask(NumElems, false);
   6790   SmallBitVector UndefMask(NumElems, false);
   6791 
   6792   // For each element in the initializer, see if we've found a load, zero or an
   6793   // undef.
   6794   for (unsigned i = 0; i < NumElems; ++i) {
   6795     SDValue Elt = peekThroughBitcasts(Elts[i]);
   6796     if (!Elt.getNode())
   6797       return SDValue();
   6798 
   6799     if (Elt.isUndef())
   6800       UndefMask[i] = true;
   6801     else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
   6802       ZeroMask[i] = true;
   6803     else if (ISD::isNON_EXTLoad(Elt.getNode())) {
   6804       LoadMask[i] = true;
   6805       LastLoadedElt = i;
   6806       // Each loaded element must be the correct fractional portion of the
   6807       // requested vector load.
   6808       if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
   6809         return SDValue();
   6810     } else
   6811       return SDValue();
   6812   }
   6813   assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
   6814          "Incomplete element masks");
   6815 
   6816   // Handle Special Cases - all undef or undef/zero.
   6817   if (UndefMask.count() == NumElems)
   6818     return DAG.getUNDEF(VT);
   6819 
   6820   // FIXME: Should we return this as a BUILD_VECTOR instead?
   6821   if ((ZeroMask | UndefMask).count() == NumElems)
   6822     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
   6823                           : DAG.getConstantFP(0.0, DL, VT);
   6824 
   6825   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   6826   int FirstLoadedElt = LoadMask.find_first();
   6827   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
   6828   LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
   6829   EVT LDBaseVT = EltBase.getValueType();
   6830 
   6831   // Consecutive loads can contain UNDEFs but not ZERO elements.
   6832   // Consecutive loads with UNDEF and ZERO elements require an additional
   6833   // shuffle stage to clear the ZERO elements.
   6834   bool IsConsecutiveLoad = true;
   6835   bool IsConsecutiveLoadWithZeros = true;
   6836   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
   6837     if (LoadMask[i]) {
   6838       SDValue Elt = peekThroughBitcasts(Elts[i]);
   6839       LoadSDNode *LD = cast<LoadSDNode>(Elt);
   6840       if (!DAG.areNonVolatileConsecutiveLoads(
   6841               LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
   6842               i - FirstLoadedElt)) {
   6843         IsConsecutiveLoad = false;
   6844         IsConsecutiveLoadWithZeros = false;
   6845         break;
   6846       }
   6847     } else if (ZeroMask[i]) {
   6848       IsConsecutiveLoad = false;
   6849     }
   6850   }
   6851 
   6852   SmallVector<LoadSDNode *, 8> Loads;
   6853   for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
   6854     if (LoadMask[i])
   6855       Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
   6856 
   6857   auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
   6858     auto MMOFlags = LDBase->getMemOperand()->getFlags();
   6859     assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
   6860            "Cannot merge volatile loads.");
   6861     SDValue NewLd =
   6862         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
   6863                     LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
   6864     for (auto *LD : Loads)
   6865       DAG.makeEquivalentMemoryOrdering(LD, NewLd);
   6866     return NewLd;
   6867   };
   6868 
   6869   // LOAD - all consecutive load/undefs (must start/end with a load).
   6870   // If we have found an entire vector of loads and undefs, then return a large
   6871   // load of the entire vector width starting at the base pointer.
   6872   // If the vector contains zeros, then attempt to shuffle those elements.
   6873   if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
   6874       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
   6875     assert(LDBase && "Did not find base load for merging consecutive loads");
   6876     EVT EltVT = LDBase->getValueType(0);
   6877     // Ensure that the input vector size for the merged loads matches the
   6878     // cumulative size of the input elements.
   6879     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
   6880       return SDValue();
   6881 
   6882     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
   6883       return SDValue();
   6884 
   6885     // Don't create 256-bit non-temporal aligned loads without AVX2 as these
   6886     // will lower to regular temporal loads and use the cache.
   6887     if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
   6888         VT.is256BitVector() && !Subtarget.hasInt256())
   6889       return SDValue();
   6890 
   6891     if (IsConsecutiveLoad)
   6892       return CreateLoad(VT, LDBase);
   6893 
   6894     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
   6895     // vector and a zero vector to clear out the zero elements.
   6896     if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
   6897       SmallVector<int, 4> ClearMask(NumElems, -1);
   6898       for (unsigned i = 0; i < NumElems; ++i) {
   6899         if (ZeroMask[i])
   6900           ClearMask[i] = i + NumElems;
   6901         else if (LoadMask[i])
   6902           ClearMask[i] = i;
   6903       }
   6904       SDValue V = CreateLoad(VT, LDBase);
   6905       SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
   6906                                  : DAG.getConstantFP(0.0, DL, VT);
   6907       return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
   6908     }
   6909   }
   6910 
   6911   int LoadSize =
   6912       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
   6913 
   6914   // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
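          // e.g. <load i32 *a, zero, zero, zero> becomes a single 32-bit
          // X86ISD::VZEXT_LOAD, with the upper lanes implicitly zeroed.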
   6915   if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
   6916       (LoadSize == 32 || LoadSize == 64) &&
   6917       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
   6918     MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
   6919                                       : MVT::getIntegerVT(LoadSize);
   6920     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
   6921     if (TLI.isTypeLegal(VecVT)) {
   6922       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
   6923       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
   6924       SDValue ResNode =
   6925           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
   6926                                   LDBase->getPointerInfo(),
   6927                                   LDBase->getAlignment(),
   6928                                   MachineMemOperand::MOLoad);
   6929       for (auto *LD : Loads)
   6930         DAG.makeEquivalentMemoryOrdering(LD, ResNode);
   6931       return DAG.getBitcast(VT, ResNode);
   6932     }
   6933   }
   6934 
   6935   return SDValue();
   6936 }
   6937 
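        // Build an IR constant vector by splitting the SplatBitSize-wide SplatValue
        // into SplatBitSize / ScalarSize scalars of VT's scalar type.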
   6938 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
   6939                                    unsigned SplatBitSize, LLVMContext &C) {
   6940   unsigned ScalarSize = VT.getScalarSizeInBits();
   6941   unsigned NumElm = SplatBitSize / ScalarSize;
   6942 
   6943   SmallVector<Constant *, 32> ConstantVec;
   6944   for (unsigned i = 0; i < NumElm; i++) {
   6945     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
   6946     Constant *Const;
   6947     if (VT.isFloatingPoint()) {
   6948       if (ScalarSize == 32) {
   6949         Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
   6950       } else {
   6951         assert(ScalarSize == 64 && "Unsupported floating point scalar size");
   6952         Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
   6953       }
   6954     } else
   6955       Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
   6956     ConstantVec.push_back(Const);
   6957   }
   6958   return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
   6959 }
   6960 
   6961 static bool isUseOfShuffle(SDNode *N) {
   6962   for (auto *U : N->uses()) {
   6963     if (isTargetShuffle(U->getOpcode()))
   6964       return true;
   6965     // Look through bitcasts, but keep checking the remaining users.
   6966     if (U->getOpcode() == ISD::BITCAST && isUseOfShuffle(U))
              return true;
   6967   }
   6968   return false;
   6969 }
   6970 
   6971 // Check if the current node of a build vector is a zero-extended vector.
   6972 // If so, return the value that is being zero-extended.
   6973 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
   6974 // NumElt - return the number of zero-extended identical values.
   6975 // EltType - return the type of the value including the zero extension.
   6976 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
   6977                                    unsigned &NumElt, MVT &EltType) {
   6978   SDValue ExtValue = Op->getOperand(0);
   6979   unsigned NumElts = Op->getNumOperands();
   6980   unsigned Delta = NumElts;
   6981 
   6982   for (unsigned i = 1; i < NumElts; i++) {
   6983     if (Op->getOperand(i) == ExtValue) {
   6984       Delta = i;
   6985       break;
   6986     }
   6987     if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
   6988       return SDValue();
   6989   }
   6990   if (!isPowerOf2_32(Delta) || Delta == 1)
   6991     return SDValue();
   6992 
   6993   for (unsigned i = Delta; i < NumElts; i++) {
   6994     if (i % Delta == 0) {
   6995       if (Op->getOperand(i) != ExtValue)
   6996         return SDValue();
   6997     } else if (!(isNullConstant(Op->getOperand(i)) ||
   6998                  Op->getOperand(i).isUndef()))
   6999       return SDValue();
   7000   }
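          // For illustration: in the example above with i8 elements, the repeated value
          // recurs every 4 elements, so Delta is 4, EltType becomes i32 (4 x 8 bits),
          // and NumElt is NumElts / 4.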
   7001   unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
   7002   unsigned ExtVTSize = EltSize * Delta;
   7003   EltType = MVT::getIntegerVT(ExtVTSize);
   7004   NumElt = NumElts / Delta;
   7005   return ExtValue;
   7006 }
   7007 
   7008 /// Attempt to use the vbroadcast instruction to generate a splat value
   7009 /// from a splat BUILD_VECTOR which uses:
   7010 ///  a. A single scalar load, or a constant.
   7011 ///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
   7012 ///
   7013 /// The VBROADCAST node is returned when a pattern is found,
   7014 /// or SDValue() otherwise.
   7015 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
   7016                                            const X86Subtarget &Subtarget,
   7017                                            SelectionDAG &DAG) {
   7018   // VBROADCAST requires AVX.
   7019   // TODO: Splats could be generated for non-AVX CPUs using SSE
   7020   // instructions, but there's less potential gain for only 128-bit vectors.
   7021   if (!Subtarget.hasAVX())
   7022     return SDValue();
   7023 
   7024   MVT VT = BVOp->getSimpleValueType(0);
   7025   SDLoc dl(BVOp);
   7026 
   7027   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
   7028          "Unsupported vector type for broadcast.");
   7029 
   7030   BitVector UndefElements;
   7031   SDValue Ld = BVOp->getSplatValue(&UndefElements);
   7032 
   7033   // Attempt to use VBROADCASTM
   7034   // From this pattern:
   7035   // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
   7036   // b. t1 = (build_vector t0 t0)
   7037   //
   7038   // Create (VBROADCASTM v2i1 X)
   7039   if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
   7040     MVT EltType = VT.getScalarType();
   7041     unsigned NumElts = VT.getVectorNumElements();
   7042     SDValue BOperand;
   7043     SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
   7044     if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
   7045         (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
   7046          Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
   7047       if (ZeroExtended)
   7048         BOperand = ZeroExtended.getOperand(0);
   7049       else
   7050         BOperand = Ld.getOperand(0).getOperand(0);
   7051       MVT MaskVT = BOperand.getSimpleValueType();
   7052       if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
   7053           (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
   7054         SDValue Brdcst =
   7055             DAG.getNode(X86ISD::VBROADCASTM, dl,
   7056                         MVT::getVectorVT(EltType, NumElts), BOperand);
   7057         return DAG.getBitcast(VT, Brdcst);
   7058       }
   7059     }
   7060   }
   7061 
   7062   // We need a splat of a single value to use broadcast, and it doesn't
   7063   // make any sense if the value is only in one element of the vector.
   7064   if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
   7065     APInt SplatValue, Undef;
   7066     unsigned SplatBitSize;
   7067     bool HasUndef;
   7068     // Check if this is a repeated constant pattern suitable for broadcasting.
   7069     if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
   7070         SplatBitSize > VT.getScalarSizeInBits() &&
   7071         SplatBitSize < VT.getSizeInBits()) {
   7072       // Avoid replacing with a broadcast when the build_vector is used by a
   7073       // shuffle instruction, to preserve the present custom lowering of shuffles.
   7074       if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
   7075         return SDValue();
   7076       // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
   7077       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   7078       LLVMContext *Ctx = DAG.getContext();
   7079       MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
   7080       if (Subtarget.hasAVX()) {
   7081         if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
   7082             !(SplatBitSize == 64 && Subtarget.is32Bit())) {
   7083           // Splatted value can fit in one INTEGER constant in constant pool.
   7084           // Load the constant and broadcast it.
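                  // For illustration: a v8i32 build_vector repeating <0, 1> is a 64-bit
                  // splat, so it can be loaded as a single i64 constant and broadcast
                  // (e.g. as a vpbroadcastq) instead of loading a 256-bit constant.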
   7085           MVT CVT = MVT::getIntegerVT(SplatBitSize);
   7086           Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
   7087           Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
   7088           SDValue CP = DAG.getConstantPool(C, PVT);
   7089           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
   7090 
   7091           unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   7092           Ld = DAG.getLoad(
   7093               CVT, dl, DAG.getEntryNode(), CP,
   7094               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   7095               Alignment);
   7096           SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
   7097                                        MVT::getVectorVT(CVT, Repeat), Ld);
   7098           return DAG.getBitcast(VT, Brdcst);
   7099         } else if (SplatBitSize == 32 || SplatBitSize == 64) {
   7100           // Splatted value can fit in one FLOAT constant in constant pool.
   7101           // Load the constant and broadcast it.
   7102           // AVX has support for 32-bit and 64-bit broadcasts of floats only.
   7103           // There are no 64-bit integers on a 32-bit subtarget.
   7104           MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
   7105           // Lower the splat via APFloat directly, to avoid any conversion.
   7106           Constant *C =
   7107               SplatBitSize == 32
   7108                   ? ConstantFP::get(*Ctx,
   7109                                     APFloat(APFloat::IEEEsingle(), SplatValue))
   7110                   : ConstantFP::get(*Ctx,
   7111                                     APFloat(APFloat::IEEEdouble(), SplatValue));
   7112           SDValue CP = DAG.getConstantPool(C, PVT);
   7113           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
   7114 
   7115           unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   7116           Ld = DAG.getLoad(
   7117               CVT, dl, DAG.getEntryNode(), CP,
   7118               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   7119               Alignment);
   7120           SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
   7121                                        MVT::getVectorVT(CVT, Repeat), Ld);
   7122           return DAG.getBitcast(VT, Brdcst);
   7123         } else if (SplatBitSize > 64) {
   7124           // Load the vector of constants and broadcast it.
   7125           MVT CVT = VT.getScalarType();
   7126           Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
   7127                                              *Ctx);
   7128           SDValue VCP = DAG.getConstantPool(VecC, PVT);
   7129           unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
   7130           unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
   7131           Ld = DAG.getLoad(
   7132               MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
   7133               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   7134               Alignment);
   7135           SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
   7136           return DAG.getBitcast(VT, Brdcst);
   7137         }
   7138       }
   7139     }
   7140     return SDValue();
   7141   }
   7142 
   7143   bool ConstSplatVal =
   7144       (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
   7145 
   7146   // Make sure that all of the users of a non-constant load are from the
   7147   // BUILD_VECTOR node.
   7148   if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
   7149     return SDValue();
   7150 
   7151   unsigned ScalarSize = Ld.getValueSizeInBits();
   7152   bool IsGE256 = (VT.getSizeInBits() >= 256);
   7153 
   7154   // When optimizing for size, generate up to 5 extra bytes for a broadcast
   7155   // instruction to save 8 or more bytes of constant pool data.
   7156   // TODO: If multiple splats are generated to load the same constant,
   7157   // it may be detrimental to overall size. There needs to be a way to detect
   7158   // that condition to know if this is truly a size win.
   7159   bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
   7160 
   7161   // Handle broadcasting a single constant scalar from the constant pool
   7162   // into a vector.
   7163   // On Sandybridge (no AVX2), it is still better to load a constant vector
   7164   // from the constant pool and not to broadcast it from a scalar.
   7165   // But override that restriction when optimizing for size.
   7166   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
   7167   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
   7168     EVT CVT = Ld.getValueType();
   7169     assert(!CVT.isVector() && "Must not broadcast a vector type");
   7170 
   7171     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
   7172     // For size optimization, also splat v2f64 and v2i64, and for size opt
   7173     // with AVX2, also splat i8 and i16.
   7174     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
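            // For illustration: when optimizing for size with AVX2, a v16i8 splat of a
            // constant byte can be lowered as a 1-byte constant-pool load feeding
            // VBROADCAST (e.g. vpbroadcastb) instead of a full 16-byte constant.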
   7175     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
   7176         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
   7177       const Constant *C = nullptr;
   7178       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
   7179         C = CI->getConstantIntValue();
   7180       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
   7181         C = CF->getConstantFPValue();
   7182 
   7183       assert(C && "Invalid constant type");
   7184 
   7185       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   7186       SDValue CP =
   7187           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
   7188       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
   7189       Ld = DAG.getLoad(
   7190           CVT, dl, DAG.getEntryNode(), CP,
   7191           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   7192           Alignment);
   7193 
   7194       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   7195     }
   7196   }
   7197 
   7198   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
   7199 
   7200   // Handle AVX2 in-register broadcasts.
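          // For illustration: splatting an i32 value that is already in a register can
          // be lowered to VBROADCAST and typically matches vpbroadcastd from a
          // register, with no load or constant-pool traffic.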
   7201   if (!IsLoad && Subtarget.hasInt256() &&
   7202       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
   7203     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   7204 
   7205   // The scalar source must be a normal load.
   7206   if (!IsLoad)
   7207     return SDValue();
   7208 
   7209   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
   7210       (Subtarget.hasVLX() && ScalarSize == 64))
   7211     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   7212 
   7213   // The integer check is needed for the 64-bit into 128-bit case, so that it
   7214   // doesn't match double, since there is no vbroadcastsd xmm.
   7215   if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
   7216     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
   7217       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
   7218   }
   7219 
   7220   // Unsupported broadcast.
   7221   return SDValue();
   7222 }
   7223 
   7224 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
   7225 /// underlying vector and index.
   7226 ///
   7227 /// Modifies \p ExtractedFromVec to the real vector and returns the real
   7228 /// index.
   7229 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
   7230                                          SDValue ExtIdx) {
   7231   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
   7232   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
   7233     return Idx;
   7234 
   7235   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
   7236   // lowered this:
   7237   //   (extract_vector_elt (v8f32 %1), Constant<6>)
   7238   // to:
   7239   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
   7240   //                           (extract_subvector (v8f32 %0), Constant<4>),
   7241   //                           undef)
   7242   //                       Constant<0>)
   7243   // In this case the vector is the extract_subvector expression and the index
   7244   // is 2, as specified by the shuffle.
   7245   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
   7246   SDValue ShuffleVec = SVOp->getOperand(0);
   7247   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
   7248   assert(ShuffleVecVT.getVectorElementType() ==
   7249          ExtractedFromVec.getSimpleValueType().getVectorElementType());
   7250 
   7251   int ShuffleIdx = SVOp->getMaskElt(Idx);
   7252   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
   7253     ExtractedFromVec = ShuffleVec;
   7254     return ShuffleIdx;
   7255   }
   7256   return Idx;
   7257 }
   7258 
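        // For illustration: a build_vector whose elements are mostly
        // extract_vector_elt nodes from one or two source vectors, plus a few other
        // scalars, can be rebuilt as a vector_shuffle of those sources followed by
        // insert_vector_elt of the remaining scalars.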
   7259 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
   7260   MVT VT = Op.getSimpleValueType();
   7261 
   7262   // Skip if insert_vec_elt is not supported.
   7263   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   7264   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
   7265     return SDValue();
   7266 
   7267   SDLoc DL(Op);
   7268   unsigned NumElems = Op.getNumOperands();
   7269 
   7270   SDValue VecIn1;
   7271   SDValue VecIn2;
   7272   SmallVector<unsigned, 4> InsertIndices;
   7273   SmallVector<int, 8> Mask(NumElems, -1);
   7274 
   7275   for (unsigned i = 0; i != NumElems; ++i) {
   7276     unsigned Opc = Op.getOperand(i).getOpcode();
   7277 
   7278     if (Opc == ISD::UNDEF)
   7279       continue;
   7280 
   7281     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
   7282       // Quit if more than 1 element needs inserting.
   7283       if (InsertIndices.size() > 1)
   7284         return SDValue();
   7285 
   7286       InsertIndices.push_back(i);
   7287       continue;
   7288     }
   7289 
   7290     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
   7291     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
   7292 
   7293     // Quit if non-constant index.
   7294     if (!isa<ConstantSDNode>(ExtIdx))
   7295       return SDValue();
   7296     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
   7297 
   7298     // Quit if extracted from vector of different type.
   7299     if (ExtractedFromVec.getValueType() != VT)
   7300       return SDValue();
   7301 
   7302     if (!VecIn1.getNode())
   7303       VecIn1 = ExtractedFromVec;
   7304     else if (VecIn1 != ExtractedFromVec) {
   7305       if (!VecIn2.getNode())
   7306         VecIn2 = ExtractedFromVec;
   7307       else if (VecIn2 != ExtractedFromVec)
   7308         // Quit if more than 2 vectors to shuffle
   7309         return SDValue();
   7310     }
   7311 
   7312     if (ExtractedFromVec == VecIn1)
   7313       Mask[i] = Idx;
   7314     else if (ExtractedFromVec == VecIn2)
   7315       Mask[i] = Idx + NumElems;
   7316   }
   7317 
   7318   if (!VecIn1.getNode())
   7319     return SDValue();
   7320 
   7321   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
   7322   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
   7323 
   7324   for (unsigned Idx : InsertIndices)
   7325     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
   7326                      DAG.getIntPtrConstant(Idx, DL));
   7327 
   7328   return NV;
   7329 }
   7330 
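        // For illustration: a constant v8i1 build_vector <1,0,1,1,0,0,0,1> is packed
        // into the i8 immediate 0b10001101, with bit i holding element i.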
   7331 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
   7332   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
   7333          Op.getScalarValueSizeInBits() == 1 &&
   7334          "Can not convert non-constant vector");
   7335   uint64_t Immediate = 0;
   7336   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
   7337     SDValue In = Op.getOperand(idx);
   7338     if (!In.isUndef())
   7339       Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
   7340   }
   7341   SDLoc dl(Op);
   7342   MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
   7343   return DAG.getConstant(Immediate, dl, VT);
   7344 }
   7345 // Lower BUILD_VECTOR operation for vXi1 (i1 mask) types.
   7346 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
   7347                                      const X86Subtarget &Subtarget) {
   7348 
   7349   MVT VT = Op.getSimpleValueType();
   7350   assert((VT.getVectorElementType() == MVT::i1) &&
   7351          "Unexpected type in LowerBUILD_VECTORvXi1!");
   7352 
   7353   SDLoc dl(Op);
   7354   if (ISD::isBuildVectorAllZeros(Op.getNode()))
   7355     return Op;
   7356 
   7357   if (ISD::isBuildVectorAllOnes(Op.getNode()))
   7358     return Op;
   7359 
   7360   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
   7361     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
   7362       // Split the pieces.
   7363       SDValue Lower =
   7364           DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
   7365       SDValue Upper =
   7366           DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
   7367       // We have to manually lower both halves so getNode doesn't try to
   7368       // reassemble the build_vector.
   7369       Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
   7370       Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
   7371       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
   7372     }
   7373     SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
   7374     if (Imm.getValueSizeInBits() == VT.getSizeInBits())
   7375       return DAG.getBitcast(VT, Imm);
   7376     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
   7377     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
   7378                         DAG.getIntPtrConstant(0, dl));
   7379   }
   7380 
   7381   // Vector has one or more non-const elements
   7382   uint64_t Immediate = 0;
   7383   SmallVector<unsigned, 16> NonConstIdx;
   7384   bool IsSplat = true;
   7385   bool HasConstElts = false;
   7386   int SplatIdx = -1;
   7387   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
   7388     SDValue In = Op.getOperand(idx);
   7389     if (In.isUndef())
   7390       continue;
   7391     if (!isa<ConstantSDNode>(In))
   7392       NonConstIdx.push_back(idx);
   7393     else {
   7394       Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
   7395       HasConstElts = true;
   7396     }
   7397     if (SplatIdx < 0)
   7398       SplatIdx = idx;
   7399     else if (In != Op.getOperand(SplatIdx))
   7400       IsSplat = false;
   7401   }
   7402 
   7403   // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
   7404   if (IsSplat)
   7405     return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
   7406                          DAG.getConstant(1, dl, VT),
   7407                          DAG.getConstant(0, dl, VT));
   7408 
   7409   // Insert the non-constant elements one by one.
   7410   SDValue DstVec;
   7411   SDValue Imm;
   7412   if (Immediate) {
   7413     MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
   7414     Imm = DAG.getConstant(Immediate, dl, ImmVT);
   7415   }
   7416   else if (HasConstElts)
   7417     Imm = DAG.getConstant(0, dl, VT);
   7418   else
   7419     Imm = DAG.getUNDEF(VT);
   7420   if (Imm.getValueSizeInBits() == VT.getSizeInBits())
   7421     DstVec = DAG.getBitcast(VT, Imm);
   7422   else {
   7423     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
   7424     DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
   7425                          DAG.getIntPtrConstant(0, dl));
   7426   }
   7427 
   7428   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
   7429     unsigned InsertIdx = NonConstIdx[i];
   7430     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
   7431                          Op.getOperand(InsertIdx),
   7432                          DAG.getIntPtrConstant(InsertIdx, dl));
   7433   }
   7434   return DstVec;
   7435 }
   7436 
   7437 /// Return true if \p N implements a horizontal binop and return the
   7438 /// operands for the horizontal binop into V0 and V1.
   7439 ///
   7440 /// This is a helper function of LowerToHorizontalOp().
   7441 /// This function checks that the input build_vector \p N implements a
   7442 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
   7443 /// operation to match.
   7444 /// For example, if \p Opcode is equal to ISD::ADD, then this function
   7445 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
   7446 /// is equal to ISD::SUB, then this function checks if this is a horizontal
   7447 /// arithmetic sub.
   7448 ///
   7449 /// This function only analyzes elements of \p N whose indices are
   7450 /// in range [BaseIdx, LastIdx).
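        /// For illustration: with \p Opcode = ISD::ADD, a v4i32 build_vector of
        ///   (add (extract A,0), (extract A,1)), (add (extract A,2), (extract A,3)),
        ///   (add (extract B,0), (extract B,1)), (add (extract B,2), (extract B,3))
        /// matches over [0, 4) with V0 = A and V1 = B, i.e. a HADD of A and B.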
   7451 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
   7452                               SelectionDAG &DAG,
   7453                               unsigned BaseIdx, unsigned LastIdx,
   7454                               SDValue &V0, SDValue &V1) {
   7455   EVT VT = N->getValueType(0);
   7456 
   7457   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
   7458   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
   7459          "Invalid Vector in input!");
   7460 
   7461   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
   7462   bool CanFold = true;
   7463   unsigned ExpectedVExtractIdx = BaseIdx;
   7464   unsigned NumElts = LastIdx - BaseIdx;
   7465   V0 = DAG.getUNDEF(VT);
   7466   V1 = DAG.getUNDEF(VT);
   7467 
   7468   // Check if N implements a horizontal binop.
   7469   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
   7470     SDValue Op = N->getOperand(i + BaseIdx);
   7471 
   7472     // Skip UNDEFs.
   7473     if (Op->isUndef()) {
   7474       // Update the expected vector extract index.
   7475       if (i * 2 == NumElts)
   7476         ExpectedVExtractIdx = BaseIdx;
   7477       ExpectedVExtractIdx += 2;
   7478       continue;
   7479     }
   7480 
   7481     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
   7482 
   7483     if (!CanFold)
   7484       break;
   7485 
   7486     SDValue Op0 = Op.getOperand(0);
   7487     SDValue Op1 = Op.getOperand(1);
   7488 
   7489     // Try to match the following pattern:
   7490     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
   7491     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   7492         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   7493         Op0.getOperand(0) == Op1.getOperand(0) &&
   7494         isa<ConstantSDNode>(Op0.getOperand(1)) &&
   7495         isa<ConstantSDNode>(Op1.getOperand(1)));
   7496     if (!CanFold)
   7497       break;
   7498 
   7499     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   7500     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
   7501 
   7502     if (i * 2 < NumElts) {
   7503       if (V0.isUndef()) {
   7504         V0 = Op0.getOperand(0);
   7505         if (V0.getValueType() != VT)
   7506           return false;
   7507       }
   7508     } else {
   7509       if (V1.isUndef()) {
   7510         V1 = Op0.getOperand(0);
   7511         if (V1.getValueType() != VT)
   7512           return false;
   7513       }
   7514       if (i * 2 == NumElts)
   7515         ExpectedVExtractIdx = BaseIdx;
   7516     }
   7517 
   7518     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
   7519     if (I0 == ExpectedVExtractIdx)
   7520       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
   7521     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
   7522       // Try to match the following dag sequence:
   7523       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
   7524       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
   7525     } else
   7526       CanFold = false;
   7527 
   7528     ExpectedVExtractIdx += 2;
   7529   }
   7530 
   7531   return CanFold;
   7532 }
   7533 
   7534 /// Emit a sequence of two 128-bit horizontal add/sub followed by
   7535 /// a concat_vector.
   7536 ///
   7537 /// This is a helper function of LowerToHorizontalOp().
   7538 /// This function expects two 256-bit vectors called V0 and V1.
   7539 /// At first, each vector is split into two separate 128-bit vectors.
   7540 /// Then, the resulting 128-bit vectors are used to implement two
   7541 /// horizontal binary operations.
   7542 ///
   7543 /// The kind of horizontal binary operation is defined by \p X86Opcode.
   7544 ///
   7545 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
   7546 /// the two new horizontal binops.
   7547 /// When Mode is set, the first horizontal binop dag node would take as input
   7548 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
   7549 /// horizontal binop dag node would take as input the lower 128-bit of V1
   7550 /// and the upper 128-bit of V1.
   7551 ///   Example:
   7552 ///     HADD V0_LO, V0_HI
   7553 ///     HADD V1_LO, V1_HI
   7554 ///
   7555 /// Otherwise, the first horizontal binop dag node takes as input the lower
   7556 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
   7557 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
   7558 ///   Example:
   7559 ///     HADD V0_LO, V1_LO
   7560 ///     HADD V0_HI, V1_HI
   7561 ///
   7562 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
   7563 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
   7564 /// the upper 128-bits of the result.
   7565 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
   7566                                      const SDLoc &DL, SelectionDAG &DAG,
   7567                                      unsigned X86Opcode, bool Mode,
   7568                                      bool isUndefLO, bool isUndefHI) {
   7569   MVT VT = V0.getSimpleValueType();
   7570   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
   7571          "Invalid nodes in input!");
   7572 
   7573   unsigned NumElts = VT.getVectorNumElements();
   7574   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
   7575   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
   7576   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
   7577   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
   7578   MVT NewVT = V0_LO.getSimpleValueType();
   7579 
   7580   SDValue LO = DAG.getUNDEF(NewVT);
   7581   SDValue HI = DAG.getUNDEF(NewVT);
   7582 
   7583   if (Mode) {
   7584     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   7585     if (!isUndefLO && !V0->isUndef())
   7586       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
   7587     if (!isUndefHI && !V1->isUndef())
   7588       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
   7589   } else {
   7590     // Don't emit a horizontal binop if the result is expected to be UNDEF.
   7591     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
   7592       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
   7593 
   7594     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
   7595       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
   7596   }
   7597 
   7598   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
   7599 }
   7600 
   7601 /// Returns true iff \p BV builds a vector with the result equivalent to
   7602 /// the result of ADDSUB/SUBADD operation.
   7603 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
   7604 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
   7605 /// \p Opnd0 and \p Opnd1.
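        /// For illustration: a v4f32 build_vector of
        ///   (fsub (extract A,0),(extract B,0)), (fadd (extract A,1),(extract B,1)),
        ///   (fsub (extract A,2),(extract B,2)), (fadd (extract A,3),(extract B,3))
        /// matches with \p Opnd0 = A, \p Opnd1 = B and \p IsSubAdd = false (an ADDSUB).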
   7606 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
   7607                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
   7608                              SDValue &Opnd0, SDValue &Opnd1,
   7609                              unsigned &NumExtracts,
   7610                              bool &IsSubAdd) {
   7611 
   7612   MVT VT = BV->getSimpleValueType(0);
   7613   if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
   7614     return false;
   7615 
   7616   unsigned NumElts = VT.getVectorNumElements();
   7617   SDValue InVec0 = DAG.getUNDEF(VT);
   7618   SDValue InVec1 = DAG.getUNDEF(VT);
   7619 
   7620   NumExtracts = 0;
   7621 
   7622   // Odd-numbered elements in the input build vector are obtained from
   7623   // adding/subtracting two integer/float elements.
   7624   // Even-numbered elements in the input build vector are obtained from
   7625   // subtracting/adding two integer/float elements.
   7626   unsigned Opc[2] {0, 0};
   7627   for (unsigned i = 0, e = NumElts; i != e; ++i) {
   7628     SDValue Op = BV->getOperand(i);
   7629 
   7630     // Skip 'undef' values.
   7631     unsigned Opcode = Op.getOpcode();
   7632     if (Opcode == ISD::UNDEF)
   7633       continue;
   7634 
   7635     // Early exit if we found an unexpected opcode.
   7636     if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
   7637       return false;
   7638 
   7639     SDValue Op0 = Op.getOperand(0);
   7640     SDValue Op1 = Op.getOperand(1);
   7641 
   7642     // Try to match the following pattern:
   7643     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
   7644     // Early exit if we cannot match that sequence.
   7645     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   7646         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   7647         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
   7648         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
   7649         Op0.getOperand(1) != Op1.getOperand(1))
   7650       return false;
   7651 
   7652     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
   7653     if (I0 != i)
   7654       return false;
   7655 
   7656     // We found a valid add/sub node; make sure it's the same opcode as previous
   7657     // elements for this parity.
   7658     if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
   7659       return false;
   7660     Opc[i % 2] = Opcode;
   7661 
   7662     // Update InVec0 and InVec1.
   7663     if (InVec0.isUndef()) {
   7664       InVec0 = Op0.getOperand(0);
   7665       if (InVec0.getSimpleValueType() != VT)
   7666         return false;
   7667     }
   7668     if (InVec1.isUndef()) {
   7669       InVec1 = Op1.getOperand(0);
   7670       if (InVec1.getSimpleValueType() != VT)
   7671         return false;
   7672     }
   7673 
   7674     // Make sure that the operands of each add/sub node always
   7675     // come from the same pair of vectors.
   7676     if (InVec0 != Op0.getOperand(0)) {
   7677       if (Opcode == ISD::FSUB)
   7678         return false;
   7679 
   7680       // FADD is commutable. Try to commute the operands
   7681       // and then test again.
   7682       std::swap(Op0, Op1);
   7683       if (InVec0 != Op0.getOperand(0))
   7684         return false;
   7685     }
   7686 
   7687     if (InVec1 != Op1.getOperand(0))
   7688       return false;
   7689 
   7690     // Increment the number of extractions done.
   7691     ++NumExtracts;
   7692   }
   7693 
   7694   // Ensure we have found an opcode for both parities and that they are
   7695   // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
   7696   // inputs are undef.
   7697   if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
   7698       InVec0.isUndef() || InVec1.isUndef())
   7699     return false;
   7700 
   7701   IsSubAdd = Opc[0] == ISD::FADD;
   7702 
   7703   Opnd0 = InVec0;
   7704   Opnd1 = InVec1;
   7705   return true;
   7706 }
   7707 
   7708 /// Returns true if it is possible to fold MUL and an idiom that has already been
   7709 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
   7710 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
   7711 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
   7712 ///
   7713 /// Prior to calling this function it should be known that there is some
   7714 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
   7715 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
   7716 /// before replacement of such SDNode with ADDSUB operation. Thus the number
   7717 /// of \p Opnd0 uses is expected to be equal to 2.
   7718 /// For example, this function may be called for the following IR:
   7719 ///    %AB = fmul fast <2 x double> %A, %B
   7720 ///    %Sub = fsub fast <2 x double> %AB, %C
   7721 ///    %Add = fadd fast <2 x double> %AB, %C
   7722 ///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
   7723 ///                            <2 x i32> <i32 0, i32 3>
   7724 /// There is a def for %Addsub here, which potentially can be replaced by
   7725 /// X86ISD::ADDSUB operation:
   7726 ///    %Addsub = X86ISD::ADDSUB %AB, %C
   7727 /// and such ADDSUB can further be replaced with FMADDSUB:
   7728 ///    %Addsub = FMADDSUB %A, %B, %C.
   7729 ///
   7730 /// The main reason why this method is called before the replacement of the
   7731 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
   7732 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
   7733 /// FMADDSUB is.
   7734 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
   7735                                  SelectionDAG &DAG,
   7736                                  SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
   7737                                  unsigned ExpectedUses) {
   7738   if (Opnd0.getOpcode() != ISD::FMUL ||
   7739       !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
   7740     return false;
   7741 
   7742   // FIXME: These checks must match the similar ones in
   7743   // DAGCombiner::visitFADDForFMACombine. It would be good to have one
   7744   // function that would answer if it is Ok to fuse MUL + ADD to FMADD
   7745   // or MUL + ADDSUB to FMADDSUB.
   7746   const TargetOptions &Options = DAG.getTarget().Options;
   7747   bool AllowFusion =
   7748       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
   7749   if (!AllowFusion)
   7750     return false;
   7751 
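          // For illustration: if the recognized ADDSUB computes (A * B) +- C, then on
          // entry Opnd0 is the FMUL and Opnd1 is C; below they are rewritten so that
          // Opnd0 = A, Opnd1 = B and Opnd2 = C, i.e. FMADDSUB/FMSUBADD(A, B, C).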
   7752   Opnd2 = Opnd1;
   7753   Opnd1 = Opnd0.getOperand(1);
   7754   Opnd0 = Opnd0.getOperand(0);
   7755 
   7756   return true;
   7757 }
   7758 
   7759 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub', or
   7760 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB, or
   7761 /// X86ISD::FMSUBADD node.
   7762 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
   7763                                        const X86Subtarget &Subtarget,
   7764                                        SelectionDAG &DAG) {
   7765   SDValue Opnd0, Opnd1;
   7766   unsigned NumExtracts;
   7767   bool IsSubAdd;
   7768   if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
   7769                         IsSubAdd))
   7770     return SDValue();
   7771 
   7772   MVT VT = BV->getSimpleValueType(0);
   7773   SDLoc DL(BV);
   7774 
   7775   // Try to generate X86ISD::FMADDSUB node here.
   7776   SDValue Opnd2;
   7777   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
   7778     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
   7779     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
   7780   }
   7781 
   7782   // We only support ADDSUB.
   7783   if (IsSubAdd)
   7784     return SDValue();
   7785 
   7786   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
   7787   // the ADDSUB idiom has been successfully recognized. There are no known
   7788   // X86 targets with 512-bit ADDSUB instructions!
   7789   // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
   7790   // recognition.
   7791   if (VT.is512BitVector())
   7792     return SDValue();
   7793 
   7794   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
   7795 }
   7796 
   7797 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
   7798 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
   7799                                    const X86Subtarget &Subtarget,
   7800                                    SelectionDAG &DAG) {
   7801   MVT VT = BV->getSimpleValueType(0);
   7802   unsigned NumElts = VT.getVectorNumElements();
   7803   unsigned NumUndefsLO = 0;
   7804   unsigned NumUndefsHI = 0;
   7805   unsigned Half = NumElts/2;
   7806 
   7807   // Count the number of UNDEF operands in the input build_vector.
   7808   for (unsigned i = 0, e = Half; i != e; ++i)
   7809     if (BV->getOperand(i)->isUndef())
   7810       NumUndefsLO++;
   7811 
   7812   for (unsigned i = Half, e = NumElts; i != e; ++i)
   7813     if (BV->getOperand(i)->isUndef())
   7814       NumUndefsHI++;
   7815 
   7816   // Early exit if this is either a build_vector of all UNDEFs or all the
   7817   // operands but one are UNDEF.
   7818   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
   7819     return SDValue();
   7820 
   7821   SDLoc DL(BV);
   7822   SDValue InVec0, InVec1;
   7823   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
   7824     // Try to match an SSE3 float HADD/HSUB.
   7825     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   7826       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   7827 
   7828     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   7829       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   7830   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
   7831     // Try to match an SSSE3 integer HADD/HSUB.
   7832     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   7833       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
   7834 
   7835     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   7836       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
   7837   }
   7838 
   7839   if (!Subtarget.hasAVX())
   7840     return SDValue();
   7841 
   7842   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
   7843     // Try to match an AVX horizontal add/sub of packed single/double
   7844     // precision floating point values from 256-bit vectors.
   7845     SDValue InVec2, InVec3;
   7846     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
   7847         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
   7848         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
   7849         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
   7850       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
   7851 
   7852     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
   7853         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
   7854         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
   7855         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
   7856       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   7857   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
   7858     // Try to match an AVX2 horizontal add/sub of signed integers.
   7859     SDValue InVec2, InVec3;
   7860     unsigned X86Opcode;
   7861     bool CanFold = true;
   7862 
   7863     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
   7864         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
   7865         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
   7866         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
   7867       X86Opcode = X86ISD::HADD;
   7868     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
   7869         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
   7870         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
   7871         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
   7872       X86Opcode = X86ISD::HSUB;
   7873     else
   7874       CanFold = false;
   7875 
   7876     if (CanFold) {
   7877       // Fold this build_vector into a single horizontal add/sub.
   7878       // Do this only if the target has AVX2.
   7879       if (Subtarget.hasAVX2())
   7880         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
   7881 
   7882       // Do not try to expand this build_vector into a pair of horizontal
   7883       // add/sub if we can emit a pair of scalar add/sub.
   7884       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   7885         return SDValue();
   7886 
   7887       // Convert this build_vector into a pair of horizontal binop followed by
   7888       // a concat vector.
   7889       bool isUndefLO = NumUndefsLO == Half;
   7890       bool isUndefHI = NumUndefsHI == Half;
   7891       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
   7892                                    isUndefLO, isUndefHI);
   7893     }
   7894   }
   7895 
   7896   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
   7897        VT == MVT::v16i16) && Subtarget.hasAVX()) {
   7898     unsigned X86Opcode;
   7899     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
   7900       X86Opcode = X86ISD::HADD;
   7901     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
   7902       X86Opcode = X86ISD::HSUB;
   7903     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
   7904       X86Opcode = X86ISD::FHADD;
   7905     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
   7906       X86Opcode = X86ISD::FHSUB;
   7907     else
   7908       return SDValue();
   7909 
   7910     // Don't try to expand this build_vector into a pair of horizontal add/sub
   7911     // if we can simply emit a pair of scalar add/sub.
   7912     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
   7913       return SDValue();
   7914 
   7915     // Convert this build_vector into two horizontal add/sub followed by
   7916     // a concat vector.
   7917     bool isUndefLO = NumUndefsLO == Half;
   7918     bool isUndefHI = NumUndefsHI == Half;
   7919     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
   7920                                  isUndefLO, isUndefHI);
   7921   }
   7922 
   7923   return SDValue();
   7924 }
   7925 
   7926 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
   7927 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
   7928 /// just apply the bit operation to the vectors.
   7929 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
   7930 /// from this, but enough scalar bit operations are created by the later
   7931 /// legalization + scalarization stages to need basic support.
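        /// For illustration: (build_vector (xor a, 1), (xor b, 2), (xor c, 4),
        /// (xor d, 8)) can be lowered as (xor (build_vector a, b, c, d),
        /// (build_vector 1, 2, 4, 8)).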
   7932 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
   7933                                        SelectionDAG &DAG) {
   7934   SDLoc DL(Op);
   7935   MVT VT = Op->getSimpleValueType(0);
   7936   unsigned NumElems = VT.getVectorNumElements();
   7937   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   7938 
   7939   // Check that all elements have the same opcode.
   7940   // TODO: Should we allow UNDEFS and if so how many?
   7941   unsigned Opcode = Op->getOperand(0).getOpcode();
   7942   for (unsigned i = 1; i < NumElems; ++i)
   7943     if (Opcode != Op->getOperand(i).getOpcode())
   7944       return SDValue();
   7945 
   7946   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
   7947   switch (Opcode) {
   7948   default:
   7949     return SDValue();
   7950   case ISD::AND:
   7951   case ISD::XOR:
   7952   case ISD::OR:
   7953     // Don't do this if the buildvector is a splat - we'd replace one
   7954     // constant with an entire vector.
   7955     if (Op->getSplatValue())
   7956       return SDValue();
   7957     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
   7958       return SDValue();
   7959     break;
   7960   }
   7961 
   7962   SmallVector<SDValue, 4> LHSElts, RHSElts;
   7963   for (SDValue Elt : Op->ops()) {
   7964     SDValue LHS = Elt.getOperand(0);
   7965     SDValue RHS = Elt.getOperand(1);
   7966 
   7967     // We expect the canonicalized RHS operand to be the constant.
   7968     if (!isa<ConstantSDNode>(RHS))
   7969       return SDValue();
   7970     LHSElts.push_back(LHS);
   7971     RHSElts.push_back(RHS);
   7972   }
   7973 
   7974   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
   7975   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
   7976   return DAG.getNode(Opcode, DL, VT, LHS, RHS);
   7977 }
   7978 
   7979 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
   7980 /// functionality to do this, so it's all zeros, all ones, or some derivation
   7981 /// that is cheap to calculate.
   7982 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
   7983                                          const X86Subtarget &Subtarget) {
   7984   SDLoc DL(Op);
   7985   MVT VT = Op.getSimpleValueType();
   7986 
   7987   // Vectors containing all zeros can be matched by pxor and xorps.
   7988   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
   7989     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
   7990     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
   7991     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
   7992       return Op;
   7993 
   7994     return getZeroVector(VT, Subtarget, DAG, DL);
   7995   }
   7996 
   7997   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   7998   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
   7999   // vpcmpeqd on 256-bit vectors.
   8000   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
   8001     if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
   8002         (VT == MVT::v8i32 && Subtarget.hasInt256()))
   8003       return Op;
   8004 
   8005     return getOnesVector(VT, DAG, DL);
   8006   }
   8007 
   8008   return SDValue();
   8009 }
   8010 
   8011 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
   8012 /// from a vector of source values and a vector of extraction indices.
   8013 /// The vectors might be manipulated to match the type of the permute op.
   8014 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
   8015                                      SDLoc &DL, SelectionDAG &DAG,
   8016                                      const X86Subtarget &Subtarget) {
   8017   MVT ShuffleVT = VT;
   8018   EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
   8019   unsigned NumElts = VT.getVectorNumElements();
   8020   unsigned SizeInBits = VT.getSizeInBits();
   8021 
   8022   // Adjust IndicesVec to match VT size.
   8023   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
   8024          "Illegal variable permute mask size");
   8025   if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
   8026     IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
   8027                                   NumElts * VT.getScalarSizeInBits());
   8028   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
   8029 
   8030   // Handle a SrcVec whose size doesn't match VT.
   8031   if (SrcVec.getValueSizeInBits() != SizeInBits) {
   8032     if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
   8033       // Handle larger SrcVec by treating it as a larger permute.
   8034       unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
   8035       VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
   8036       IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
   8037       IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
   8038                                   Subtarget, DAG, SDLoc(IndicesVec));
   8039       return extractSubVector(
   8040           createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
   8041           DAG, DL, SizeInBits);
   8042     } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
   8043       // Widen smaller SrcVec to match VT.
   8044       SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
   8045     } else
   8046       return SDValue();
   8047   }
   8048 
   8049   auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
   8050     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
   8051     EVT SrcVT = Idx.getValueType();
   8052     unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
   8053     uint64_t IndexScale = 0;
   8054     uint64_t IndexOffset = 0;
   8055 
   8056     // If we're scaling a smaller permute op, then we need to repeat the
   8057     // indices, scaling and offsetting them as well.
   8058     // e.g. v4i32 -> v16i8 (Scale = 4)
   8059     // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
   8060     // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
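            // For illustration: scaling v4i32 indices to v16i8 maps index j to the byte
            // indices 4*j, 4*j+1, 4*j+2 and 4*j+3 within its 32-bit lane.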
   8061     for (uint64_t i = 0; i != Scale; ++i) {
   8062       IndexScale |= Scale << (i * NumDstBits);
   8063       IndexOffset |= i << (i * NumDstBits);
   8064     }
   8065 
   8066     Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
   8067                       DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
   8068     Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
   8069                       DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
   8070     return Idx;
   8071   };
   8072 
   8073   unsigned Opcode = 0;
   8074   switch (VT.SimpleTy) {
   8075   default:
   8076     break;
   8077   case MVT::v16i8:
   8078     if (Subtarget.hasSSSE3())
   8079       Opcode = X86ISD::PSHUFB;
   8080     break;
   8081   case MVT::v8i16:
   8082     if (Subtarget.hasVLX() && Subtarget.hasBWI())
   8083       Opcode = X86ISD::VPERMV;
   8084     else if (Subtarget.hasSSSE3()) {
   8085       Opcode = X86ISD::PSHUFB;
   8086       ShuffleVT = MVT::v16i8;
   8087     }
   8088     break;
   8089   case MVT::v4f32:
   8090   case MVT::v4i32:
   8091     if (Subtarget.hasAVX()) {
   8092       Opcode = X86ISD::VPERMILPV;
   8093       ShuffleVT = MVT::v4f32;
   8094     } else if (Subtarget.hasSSSE3()) {
   8095       Opcode = X86ISD::PSHUFB;
   8096       ShuffleVT = MVT::v16i8;
   8097     }
   8098     break;
   8099   case MVT::v2f64:
   8100   case MVT::v2i64:
   8101     if (Subtarget.hasAVX()) {
   8102       // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
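              // For illustration: adding IndicesVec to itself doubles each index, moving
              // the 0/1 selector from bit 0 into bit 1 as VPERMILPD expects.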
   8103       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
   8104       Opcode = X86ISD::VPERMILPV;
   8105       ShuffleVT = MVT::v2f64;
   8106     } else if (Subtarget.hasSSE41()) {
   8107       // SSE41 can compare v2i64 - select between indices 0 and 1.
   8108       return DAG.getSelectCC(
   8109           DL, IndicesVec,
   8110           getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
   8111           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
   8112           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
   8113           ISD::CondCode::SETEQ);
   8114     }
   8115     break;
   8116   case MVT::v32i8:
   8117     if (Subtarget.hasVLX() && Subtarget.hasVBMI())
   8118       Opcode = X86ISD::VPERMV;
   8119     else if (Subtarget.hasXOP()) {
   8120       SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
   8121       SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
   8122       SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
   8123       SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
   8124       return DAG.getNode(
   8125           ISD::CONCAT_VECTORS, DL, VT,
   8126           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
   8127           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
   8128     } else if (Subtarget.hasAVX()) {
   8129       SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
   8130       SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
   8131       SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
   8132       SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
   8133       auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   8134                               ArrayRef<SDValue> Ops) {
   8135         // Permute Lo and Hi and then select based on index range.
   8136         // This works because PSHUFB uses bits[3:0] to permute elements and we
   8137         // don't care about bit[7], as it's just an index vector.
   8138         SDValue Idx = Ops[2];
   8139         EVT VT = Idx.getValueType();
   8140         return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
   8141                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
   8142                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
   8143                                ISD::CondCode::SETGT);
   8144       };
   8145       SDValue Ops[] = {LoLo, HiHi, IndicesVec};
   8146       return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
   8147                               PSHUFBBuilder);
   8148     }
   8149     break;
   8150   case MVT::v16i16:
   8151     if (Subtarget.hasVLX() && Subtarget.hasBWI())
   8152       Opcode = X86ISD::VPERMV;
   8153     else if (Subtarget.hasAVX()) {
   8154       // Scale to v32i8 and perform as v32i8.
   8155       IndicesVec = ScaleIndices(IndicesVec, 2);
   8156       return DAG.getBitcast(
   8157           VT, createVariablePermute(
   8158                   MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
   8159                   DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
   8160     }
   8161     break;
   8162   case MVT::v8f32:
   8163   case MVT::v8i32:
   8164     if (Subtarget.hasAVX2())
   8165       Opcode = X86ISD::VPERMV;
   8166     else if (Subtarget.hasAVX()) {
   8167       SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
   8168       SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
   8169                                           {0, 1, 2, 3, 0, 1, 2, 3});
   8170       SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
   8171                                           {4, 5, 6, 7, 4, 5, 6, 7});
   8172       if (Subtarget.hasXOP())
   8173         return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
   8174                                               LoLo, HiHi, IndicesVec,
   8175                                               DAG.getConstant(0, DL, MVT::i8)));
   8176       // Permute Lo and Hi and then select based on index range.
   8177       // This works as VPERMILPS only uses index bits[0:1] to permute elements.
   8178       SDValue Res = DAG.getSelectCC(
   8179           DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
   8180           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
   8181           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
   8182           ISD::CondCode::SETGT);
   8183       return DAG.getBitcast(VT, Res);
   8184     }
   8185     break;
   8186   case MVT::v4i64:
   8187   case MVT::v4f64:
   8188     if (Subtarget.hasAVX512()) {
   8189       if (!Subtarget.hasVLX()) {
   8190         MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
   8191         SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
   8192                                 SDLoc(SrcVec));
   8193         IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
   8194                                     DAG, SDLoc(IndicesVec));
   8195         SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
   8196                                             DAG, Subtarget);
   8197         return extract256BitVector(Res, 0, DAG, DL);
   8198       }
   8199       Opcode = X86ISD::VPERMV;
   8200     } else if (Subtarget.hasAVX()) {
   8201       SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
   8202       SDValue LoLo =
   8203           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
   8204       SDValue HiHi =
   8205           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
   8206       // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
   8207       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
   8208       if (Subtarget.hasXOP())
   8209         return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
   8210                                               LoLo, HiHi, IndicesVec,
   8211                                               DAG.getConstant(0, DL, MVT::i8)));
   8212       // Permute Lo and Hi and then select based on index range.
   8213       // This works as VPERMILPD only uses index bit[1] to permute elements.
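               // For example, an original index of 3 is scaled to 6, which compares
               // greater than 2, so HiHi is selected; bit[1] of 6 is set, so VPERMILPD
               // reads the upper element of that 128-bit lane, i.e. original element 3.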
   8214       SDValue Res = DAG.getSelectCC(
   8215           DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
   8216           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
   8217           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
   8218           ISD::CondCode::SETGT);
   8219       return DAG.getBitcast(VT, Res);
   8220     }
   8221     break;
   8222   case MVT::v64i8:
   8223     if (Subtarget.hasVBMI())
   8224       Opcode = X86ISD::VPERMV;
   8225     break;
   8226   case MVT::v32i16:
   8227     if (Subtarget.hasBWI())
   8228       Opcode = X86ISD::VPERMV;
   8229     break;
   8230   case MVT::v16f32:
   8231   case MVT::v16i32:
   8232   case MVT::v8f64:
   8233   case MVT::v8i64:
   8234     if (Subtarget.hasAVX512())
   8235       Opcode = X86ISD::VPERMV;
   8236     break;
   8237   }
   8238   if (!Opcode)
   8239     return SDValue();
   8240 
   8241   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
   8242          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
   8243          "Illegal variable permute shuffle type");
   8244 
   8245   uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
   8246   if (Scale > 1)
   8247     IndicesVec = ScaleIndices(IndicesVec, Scale);
   8248 
   8249   EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
   8250   IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
   8251 
   8252   SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
   8253   SDValue Res = Opcode == X86ISD::VPERMV
   8254                     ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
   8255                     : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
   8256   return DAG.getBitcast(VT, Res);
   8257 }
   8258 
   8259 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
   8260 // reasoned to be a permutation of a vector by indices in a non-constant vector.
   8261 // (build_vector (extract_elt V, (extract_elt I, 0)),
   8262 //               (extract_elt V, (extract_elt I, 1)),
   8263 //                    ...
   8264 // ->
   8265 // (vpermv I, V)
   8266 //
   8267 // TODO: Handle undefs
   8268 // TODO: Utilize pshufb and zero mask blending to support more efficient
   8269 // construction of vectors with constant-0 elements.
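         // For example, for v4i32 this also matches index extracts that have been
         // extended, e.g.:
         //   (build_vector (extract_elt V, (zext (extract_elt I, 0))),
         //                 (extract_elt V, (zext (extract_elt I, 1))), ...)
         // since zero/sign extensions of the extracted indices are peeked through.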
   8270 static SDValue
   8271 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
   8272                                    const X86Subtarget &Subtarget) {
   8273   SDValue SrcVec, IndicesVec;
   8274   // Check for a match of the permute source vector and permute index elements.
   8275   // This is done by checking that the i-th build_vector operand is of the form:
   8276   // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
   8277   for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
   8278     SDValue Op = V.getOperand(Idx);
   8279     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   8280       return SDValue();
   8281 
   8282     // If this is the first extract encountered in V, set the source vector,
   8283     // otherwise verify the extract is from the previously defined source
   8284     // vector.
   8285     if (!SrcVec)
   8286       SrcVec = Op.getOperand(0);
   8287     else if (SrcVec != Op.getOperand(0))
   8288       return SDValue();
   8289     SDValue ExtractedIndex = Op->getOperand(1);
   8290     // Peek through extends.
   8291     if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
   8292         ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
   8293       ExtractedIndex = ExtractedIndex.getOperand(0);
   8294     if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   8295       return SDValue();
   8296 
   8297     // If this is the first extract from the index vector candidate, set the
   8298     // indices vector, otherwise verify the extract is from the previously
   8299     // defined indices vector.
   8300     if (!IndicesVec)
   8301       IndicesVec = ExtractedIndex.getOperand(0);
   8302     else if (IndicesVec != ExtractedIndex.getOperand(0))
   8303       return SDValue();
   8304 
   8305     auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
   8306     if (!PermIdx || PermIdx->getZExtValue() != Idx)
   8307       return SDValue();
   8308   }
   8309 
   8310   SDLoc DL(V);
   8311   MVT VT = V.getSimpleValueType();
   8312   return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
   8313 }
   8314 
   8315 SDValue
   8316 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   8317   SDLoc dl(Op);
   8318 
   8319   MVT VT = Op.getSimpleValueType();
   8320   MVT EltVT = VT.getVectorElementType();
   8321   unsigned NumElems = Op.getNumOperands();
   8322 
   8323   // Generate vectors for predicate vectors.
   8324   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
   8325     return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
   8326 
   8327   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
   8328     return VectorConstant;
   8329 
   8330   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
   8331   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
   8332     return AddSub;
   8333   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
   8334     return HorizontalOp;
   8335   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
   8336     return Broadcast;
   8337   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
   8338     return BitOp;
   8339 
   8340   unsigned EVTBits = EltVT.getSizeInBits();
   8341 
   8342   unsigned NumZero  = 0;
   8343   unsigned NumNonZero = 0;
   8344   uint64_t NonZeros = 0;
   8345   bool IsAllConstants = true;
   8346   SmallSet<SDValue, 8> Values;
   8347   unsigned NumConstants = NumElems;
   8348   for (unsigned i = 0; i < NumElems; ++i) {
   8349     SDValue Elt = Op.getOperand(i);
   8350     if (Elt.isUndef())
   8351       continue;
   8352     Values.insert(Elt);
   8353     if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
   8354       IsAllConstants = false;
   8355       NumConstants--;
   8356     }
   8357     if (X86::isZeroNode(Elt))
   8358       NumZero++;
   8359     else {
   8360       assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
   8361       NonZeros |= ((uint64_t)1 << i);
   8362       NumNonZero++;
   8363     }
   8364   }
   8365 
   8366   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
   8367   if (NumNonZero == 0)
   8368     return DAG.getUNDEF(VT);
   8369 
   8370   // If we are inserting one variable into a vector of non-zero constants, try
   8371   // to avoid loading each constant element as a scalar. Load the constants as a
   8372   // vector and then insert the variable scalar element. If insertion is not
   8373   // supported, we assume that we will fall back to a shuffle to get the scalar
   8374   // blended with the constants. Insertion into a zero vector is handled as a
   8375   // special-case somewhere below here.
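           // For example, (build_vector 1, 2, X, 4) becomes a constant-pool load of
           // <1, 2, undef, 4> followed by (insert_vector_elt Ld, X, 2).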
   8376   if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
   8377       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
   8378        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
   8379     // Create an all-constant vector. The variable element in the old
   8380     // build vector is replaced by undef in the constant vector. Save the
   8381     // variable scalar element and its index for use in the insertelement.
   8382     LLVMContext &Context = *DAG.getContext();
   8383     Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
   8384     SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
   8385     SDValue VarElt;
   8386     SDValue InsIndex;
   8387     for (unsigned i = 0; i != NumElems; ++i) {
   8388       SDValue Elt = Op.getOperand(i);
   8389       if (auto *C = dyn_cast<ConstantSDNode>(Elt))
   8390         ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
   8391       else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
   8392         ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
   8393       else if (!Elt.isUndef()) {
   8394         assert(!VarElt.getNode() && !InsIndex.getNode() &&
   8395                "Expected one variable element in this vector");
   8396         VarElt = Elt;
   8397         InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
   8398       }
   8399     }
   8400     Constant *CV = ConstantVector::get(ConstVecOps);
   8401     SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
   8402 
    8403     // The constants we just created may not be legal (e.g., floating point). We
    8404     // must lower the vector right here because we cannot guarantee that we'll
   8405     // legalize it before loading it. This is also why we could not just create
   8406     // a new build vector here. If the build vector contains illegal constants,
   8407     // it could get split back up into a series of insert elements.
   8408     // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
   8409     SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
   8410     MachineFunction &MF = DAG.getMachineFunction();
   8411     MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
   8412     SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
   8413     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
   8414   }
   8415 
    8416   // Special case for a single non-zero, non-undef element.
   8417   if (NumNonZero == 1) {
   8418     unsigned Idx = countTrailingZeros(NonZeros);
   8419     SDValue Item = Op.getOperand(Idx);
   8420 
   8421     // If we have a constant or non-constant insertion into the low element of
   8422     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
   8423     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
   8424     // depending on what the source datatype is.
   8425     if (Idx == 0) {
   8426       if (NumZero == 0)
   8427         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   8428 
   8429       if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
   8430           (EltVT == MVT::i64 && Subtarget.is64Bit())) {
   8431         assert((VT.is128BitVector() || VT.is256BitVector() ||
   8432                 VT.is512BitVector()) &&
   8433                "Expected an SSE value type!");
   8434         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   8435         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
   8436         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   8437       }
   8438 
   8439       // We can't directly insert an i8 or i16 into a vector, so zero extend
   8440       // it to i32 first.
   8441       if (EltVT == MVT::i16 || EltVT == MVT::i8) {
   8442         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
   8443         if (VT.getSizeInBits() >= 256) {
   8444           MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
   8445           if (Subtarget.hasAVX()) {
   8446             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
   8447             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   8448           } else {
    8449             // Without AVX, we need to extend to a 128-bit vector and then
    8450             // insert it into the wider vector.
   8451             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   8452             SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
   8453             Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
   8454           }
   8455         } else {
   8456           assert(VT.is128BitVector() && "Expected an SSE value type!");
   8457           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
   8458           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
   8459         }
   8460         return DAG.getBitcast(VT, Item);
   8461       }
   8462     }
   8463 
   8464     // Is it a vector logical left shift?
   8465     if (NumElems == 2 && Idx == 1 &&
   8466         X86::isZeroNode(Op.getOperand(0)) &&
   8467         !X86::isZeroNode(Op.getOperand(1))) {
   8468       unsigned NumBits = VT.getSizeInBits();
   8469       return getVShift(true, VT,
   8470                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   8471                                    VT, Op.getOperand(1)),
   8472                        NumBits/2, DAG, *this, dl);
   8473     }
   8474 
   8475     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
   8476       return SDValue();
   8477 
   8478     // Otherwise, if this is a vector with i32 or f32 elements, and the element
   8479     // is a non-constant being inserted into an element other than the low one,
   8480     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
   8481     // movd/movss) to move this into the low element, then shuffle it into
   8482     // place.
   8483     if (EVTBits == 32) {
   8484       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
   8485       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
   8486     }
   8487   }
   8488 
   8489   // Splat is obviously ok. Let legalizer expand it to a shuffle.
   8490   if (Values.size() == 1) {
   8491     if (EVTBits == 32) {
   8492       // Instead of a shuffle like this:
   8493       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
   8494       // Check if it's possible to issue this instead.
    8495       // shuffle (vload ptr), undef, <1, 1, 1, 1>
   8496       unsigned Idx = countTrailingZeros(NonZeros);
   8497       SDValue Item = Op.getOperand(Idx);
   8498       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
   8499         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
   8500     }
   8501     return SDValue();
   8502   }
   8503 
   8504   // A vector full of immediates; various special cases are already
   8505   // handled, so this is best done with a single constant-pool load.
   8506   if (IsAllConstants)
   8507     return SDValue();
   8508 
   8509   if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
   8510       return V;
   8511 
   8512   // See if we can use a vector load to get all of the elements.
   8513   {
   8514     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
   8515     if (SDValue LD =
   8516             EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
   8517       return LD;
   8518   }
   8519 
   8520   // If this is a splat of pairs of 32-bit elements, we can use a narrower
   8521   // build_vector and broadcast it.
   8522   // TODO: We could probably generalize this more.
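           // For example, v8i32 <a, b, a, b, a, b, a, b> becomes a v4i32 build_vector
           // <a, b, undef, undef>, bitcast to v2i64, broadcast to v4i64, and bitcast
           // back to v8i32.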
   8523   if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
   8524     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
   8525                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
   8526     auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
   8527       // Make sure all the even/odd operands match.
   8528       for (unsigned i = 2; i != NumElems; ++i)
   8529         if (Ops[i % 2] != Op.getOperand(i))
   8530           return false;
   8531       return true;
   8532     };
   8533     if (CanSplat(Op, NumElems, Ops)) {
   8534       MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
   8535       MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
   8536       // Create a new build vector and cast to v2i64/v2f64.
   8537       SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
   8538                                      DAG.getBuildVector(NarrowVT, dl, Ops));
   8539       // Broadcast from v2i64/v2f64 and cast to final VT.
   8540       MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
   8541       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
   8542                                             NewBV));
   8543     }
   8544   }
   8545 
   8546   // For AVX-length vectors, build the individual 128-bit pieces and use
   8547   // shuffles to put them in place.
   8548   if (VT.getSizeInBits() > 128) {
   8549     MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
   8550 
   8551     // Build both the lower and upper subvector.
   8552     SDValue Lower =
   8553         DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
   8554     SDValue Upper = DAG.getBuildVector(
   8555         HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
   8556 
   8557     // Recreate the wider vector with the lower and upper part.
   8558     return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
   8559                             VT.getSizeInBits() / 2);
   8560   }
   8561 
   8562   // Let legalizer expand 2-wide build_vectors.
   8563   if (EVTBits == 64) {
   8564     if (NumNonZero == 1) {
   8565       // One half is zero or undef.
   8566       unsigned Idx = countTrailingZeros(NonZeros);
   8567       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
   8568                                Op.getOperand(Idx));
   8569       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
   8570     }
   8571     return SDValue();
   8572   }
   8573 
   8574   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   8575   if (EVTBits == 8 && NumElems == 16)
   8576     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
   8577                                           DAG, Subtarget))
   8578       return V;
   8579 
   8580   if (EVTBits == 16 && NumElems == 8)
   8581     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
   8582                                           DAG, Subtarget))
   8583       return V;
   8584 
   8585   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
   8586   if (EVTBits == 32 && NumElems == 4)
   8587     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
   8588       return V;
   8589 
   8590   // If element VT is == 32 bits, turn it into a number of shuffles.
   8591   if (NumElems == 4 && NumZero > 0) {
   8592     SmallVector<SDValue, 8> Ops(NumElems);
   8593     for (unsigned i = 0; i < 4; ++i) {
   8594       bool isZero = !(NonZeros & (1ULL << i));
   8595       if (isZero)
   8596         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
   8597       else
   8598         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   8599     }
   8600 
   8601     for (unsigned i = 0; i < 2; ++i) {
   8602       switch ((NonZeros >> (i*2)) & 0x3) {
   8603         default: llvm_unreachable("Unexpected NonZero count");
   8604         case 0:
   8605           Ops[i] = Ops[i*2];  // Must be a zero vector.
   8606           break;
   8607         case 1:
   8608           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
   8609           break;
   8610         case 2:
   8611           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
   8612           break;
   8613         case 3:
   8614           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
   8615           break;
   8616       }
   8617     }
   8618 
   8619     bool Reverse1 = (NonZeros & 0x3) == 2;
   8620     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
   8621     int MaskVec[] = {
   8622       Reverse1 ? 1 : 0,
   8623       Reverse1 ? 0 : 1,
   8624       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
   8625       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
   8626     };
   8627     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
   8628   }
   8629 
   8630   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
   8631 
   8632   // Check for a build vector from mostly shuffle plus few inserting.
   8633   if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
   8634     return Sh;
   8635 
   8636   // For SSE 4.1, use insertps to put the high elements into the low element.
   8637   if (Subtarget.hasSSE41()) {
   8638     SDValue Result;
   8639     if (!Op.getOperand(0).isUndef())
   8640       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
   8641     else
   8642       Result = DAG.getUNDEF(VT);
   8643 
   8644     for (unsigned i = 1; i < NumElems; ++i) {
   8645       if (Op.getOperand(i).isUndef()) continue;
   8646       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
   8647                            Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
   8648     }
   8649     return Result;
   8650   }
   8651 
   8652   // Otherwise, expand into a number of unpckl*, start by extending each of
   8653   // our (non-undef) elements to the full vector width with the element in the
   8654   // bottom slot of the vector (which generates no code for SSE).
   8655   SmallVector<SDValue, 8> Ops(NumElems);
   8656   for (unsigned i = 0; i < NumElems; ++i) {
   8657     if (!Op.getOperand(i).isUndef())
   8658       Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
   8659     else
   8660       Ops[i] = DAG.getUNDEF(VT);
   8661   }
   8662 
   8663   // Next, we iteratively mix elements, e.g. for v4f32:
   8664   //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
   8665   //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
   8666   //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
   8667   for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
   8668     // Generate scaled UNPCKL shuffle mask.
   8669     SmallVector<int, 16> Mask;
   8670     for(unsigned i = 0; i != Scale; ++i)
   8671       Mask.push_back(i);
   8672     for (unsigned i = 0; i != Scale; ++i)
   8673       Mask.push_back(NumElems+i);
   8674     Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
   8675 
   8676     for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
   8677       Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
   8678   }
   8679   return Ops[0];
   8680 }
   8681 
   8682 // 256-bit AVX can use the vinsertf128 instruction
   8683 // to create 256-bit vectors from two other 128-bit ones.
   8684 // TODO: Detect subvector broadcast here instead of DAG combine?
   8685 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
   8686                                       const X86Subtarget &Subtarget) {
   8687   SDLoc dl(Op);
   8688   MVT ResVT = Op.getSimpleValueType();
   8689 
   8690   assert((ResVT.is256BitVector() ||
   8691           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
   8692 
   8693   unsigned NumOperands = Op.getNumOperands();
   8694   unsigned NumZero = 0;
   8695   unsigned NumNonZero = 0;
   8696   unsigned NonZeros = 0;
   8697   for (unsigned i = 0; i != NumOperands; ++i) {
   8698     SDValue SubVec = Op.getOperand(i);
   8699     if (SubVec.isUndef())
   8700       continue;
   8701     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
   8702       ++NumZero;
   8703     else {
   8704       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
   8705       NonZeros |= 1 << i;
   8706       ++NumNonZero;
   8707     }
   8708   }
   8709 
   8710   // If we have more than 2 non-zeros, build each half separately.
   8711   if (NumNonZero > 2) {
   8712     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
   8713                                   ResVT.getVectorNumElements()/2);
   8714     ArrayRef<SDUse> Ops = Op->ops();
   8715     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
   8716                              Ops.slice(0, NumOperands/2));
   8717     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
   8718                              Ops.slice(NumOperands/2));
   8719     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
   8720   }
   8721 
   8722   // Otherwise, build it up through insert_subvectors.
   8723   SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
   8724                         : DAG.getUNDEF(ResVT);
   8725 
   8726   MVT SubVT = Op.getOperand(0).getSimpleValueType();
   8727   unsigned NumSubElems = SubVT.getVectorNumElements();
   8728   for (unsigned i = 0; i != NumOperands; ++i) {
   8729     if ((NonZeros & (1 << i)) == 0)
   8730       continue;
   8731 
   8732     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
   8733                       Op.getOperand(i),
   8734                       DAG.getIntPtrConstant(i * NumSubElems, dl));
   8735   }
   8736 
   8737   return Vec;
   8738 }
   8739 
   8740 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
   8741 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
   8742 static bool isExpandWithZeros(const SDValue &Op) {
   8743   assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
   8744          "Expand with zeros only possible in CONCAT_VECTORS nodes!");
   8745 
   8746   for (unsigned i = 1; i < Op.getNumOperands(); i++)
   8747     if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
   8748       return false;
   8749 
   8750   return true;
   8751 }
   8752 
    8753 // If the given node is a type promotion (by concatenating i1 zeros) of the
    8754 // result of a node that already zeroes all upper bits of a k-register,
    8755 // returns that result; otherwise returns an empty SDValue.
   8756 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
   8757   unsigned Opc = Op.getOpcode();
   8758 
   8759   assert(Opc == ISD::CONCAT_VECTORS &&
   8760          Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
   8761          "Unexpected node to check for type promotion!");
   8762 
    8763   // As long as we are concatenating zeros to the upper part of a previous node
    8764   // result, climb up the tree until a node with a different opcode is
    8765   // encountered.
   8766   while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
   8767     if (Opc == ISD::INSERT_SUBVECTOR) {
   8768       if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
   8769           Op.getConstantOperandVal(2) == 0)
   8770         Op = Op.getOperand(1);
   8771       else
   8772         return SDValue();
   8773     } else { // Opc == ISD::CONCAT_VECTORS
   8774       if (isExpandWithZeros(Op))
   8775         Op = Op.getOperand(0);
   8776       else
   8777         return SDValue();
   8778     }
   8779     Opc = Op.getOpcode();
   8780   }
   8781 
   8782   // Check if the first inserted node zeroes the upper bits, or an 'and' result
   8783   // of a node that zeros the upper bits (its masked version).
   8784   if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
   8785       (Op.getOpcode() == ISD::AND &&
   8786        (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
   8787         isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
   8788     return Op;
   8789   }
   8790 
   8791   return SDValue();
   8792 }
   8793 
   8794 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
   8795 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
   8796                                        const X86Subtarget &Subtarget,
   8797                                        SelectionDAG & DAG) {
   8798   SDLoc dl(Op);
   8799   MVT ResVT = Op.getSimpleValueType();
   8800   unsigned NumOperands = Op.getNumOperands();
   8801 
   8802   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
   8803          "Unexpected number of operands in CONCAT_VECTORS");
   8804 
    8805   // If this node promotes - by concatenating zeroes - the result of a node
    8806   // whose instruction already zeroes all upper (irrelevant) bits of the
    8807   // output register, mark it as legal and catch the pattern in instruction
    8808   // selection to avoid emitting extra instructions (for zeroing upper bits).
   8809   if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
   8810     return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
   8811 
   8812   unsigned NumZero = 0;
   8813   unsigned NumNonZero = 0;
   8814   uint64_t NonZeros = 0;
   8815   for (unsigned i = 0; i != NumOperands; ++i) {
   8816     SDValue SubVec = Op.getOperand(i);
   8817     if (SubVec.isUndef())
   8818       continue;
   8819     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
   8820       ++NumZero;
   8821     else {
   8822       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
   8823       NonZeros |= (uint64_t)1 << i;
   8824       ++NumNonZero;
   8825     }
   8826   }
   8827 
   8828 
   8829   // If there are zero or one non-zeros we can handle this very simply.
   8830   if (NumNonZero <= 1) {
   8831     SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
   8832                           : DAG.getUNDEF(ResVT);
   8833     if (!NumNonZero)
   8834       return Vec;
   8835     unsigned Idx = countTrailingZeros(NonZeros);
   8836     SDValue SubVec = Op.getOperand(Idx);
   8837     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
   8838     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
   8839                        DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
   8840   }
   8841 
   8842   if (NumOperands > 2) {
   8843     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
   8844                                   ResVT.getVectorNumElements()/2);
   8845     ArrayRef<SDUse> Ops = Op->ops();
   8846     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
   8847                              Ops.slice(0, NumOperands/2));
   8848     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
   8849                              Ops.slice(NumOperands/2));
   8850     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
   8851   }
   8852 
   8853   assert(NumNonZero == 2 && "Simple cases not handled?");
   8854 
   8855   if (ResVT.getVectorNumElements() >= 16)
   8856     return Op; // The operation is legal with KUNPCK
   8857 
   8858   SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
   8859                             DAG.getUNDEF(ResVT), Op.getOperand(0),
   8860                             DAG.getIntPtrConstant(0, dl));
   8861   unsigned NumElems = ResVT.getVectorNumElements();
   8862   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
   8863                      DAG.getIntPtrConstant(NumElems/2, dl));
   8864 }
   8865 
   8866 static SDValue LowerCONCAT_VECTORS(SDValue Op,
   8867                                    const X86Subtarget &Subtarget,
   8868                                    SelectionDAG &DAG) {
   8869   MVT VT = Op.getSimpleValueType();
   8870   if (VT.getVectorElementType() == MVT::i1)
   8871     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
   8872 
   8873   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
   8874          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
   8875           Op.getNumOperands() == 4)));
   8876 
   8877   // AVX can use the vinsertf128 instruction to create 256-bit vectors
   8878   // from two other 128-bit ones.
   8879 
   8880   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
   8881   return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
   8882 }
   8883 
   8884 //===----------------------------------------------------------------------===//
   8885 // Vector shuffle lowering
   8886 //
   8887 // This is an experimental code path for lowering vector shuffles on x86. It is
   8888 // designed to handle arbitrary vector shuffles and blends, gracefully
   8889 // degrading performance as necessary. It works hard to recognize idiomatic
   8890 // shuffles and lower them to optimal instruction patterns without leaving
   8891 // a framework that allows reasonably efficient handling of all vector shuffle
   8892 // patterns.
   8893 //===----------------------------------------------------------------------===//
   8894 
   8895 /// Tiny helper function to identify a no-op mask.
   8896 ///
   8897 /// This is a somewhat boring predicate function. It checks whether the mask
   8898 /// array input, which is assumed to be a single-input shuffle mask of the kind
   8899 /// used by the X86 shuffle instructions (not a fully general
    8900 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
    8901 /// in-place shuffle are no-ops.
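         /// For example, <-1, 1, 2, -1> is a no-op mask, while <1, 0, 2, 3> is not.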
   8902 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
   8903   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   8904     assert(Mask[i] >= -1 && "Out of bound mask element!");
   8905     if (Mask[i] >= 0 && Mask[i] != i)
   8906       return false;
   8907   }
   8908   return true;
   8909 }
   8910 
   8911 /// Test whether there are elements crossing 128-bit lanes in this
   8912 /// shuffle mask.
   8913 ///
   8914 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
   8915 /// and we routinely test for these.
   8916 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
   8917   int LaneSize = 128 / VT.getScalarSizeInBits();
   8918   int Size = Mask.size();
   8919   for (int i = 0; i < Size; ++i)
   8920     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
   8921       return true;
   8922   return false;
   8923 }
   8924 
   8925 /// Test whether a shuffle mask is equivalent within each sub-lane.
   8926 ///
   8927 /// This checks a shuffle mask to see if it is performing the same
   8928 /// lane-relative shuffle in each sub-lane. This trivially implies
   8929 /// that it is also not lane-crossing. It may however involve a blend from the
   8930 /// same lane of a second vector.
   8931 ///
   8932 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
   8933 /// non-trivial to compute in the face of undef lanes. The representation is
   8934 /// suitable for use with existing 128-bit shuffles as entries from the second
   8935 /// vector have been remapped to [LaneSize, 2*LaneSize).
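         /// For example, the v8f32 mask <0, 9, 2, 11, 4, 13, 6, 15> repeats the
         /// 128-bit lane mask <0, 5, 2, 7> (second-vector entries remapped to [4, 8)).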
   8936 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
   8937                                   ArrayRef<int> Mask,
   8938                                   SmallVectorImpl<int> &RepeatedMask) {
   8939   auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
   8940   RepeatedMask.assign(LaneSize, -1);
   8941   int Size = Mask.size();
   8942   for (int i = 0; i < Size; ++i) {
   8943     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
   8944     if (Mask[i] < 0)
   8945       continue;
   8946     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
   8947       // This entry crosses lanes, so there is no way to model this shuffle.
   8948       return false;
   8949 
   8950     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
   8951     // Adjust second vector indices to start at LaneSize instead of Size.
   8952     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
   8953                                 : Mask[i] % LaneSize + LaneSize;
   8954     if (RepeatedMask[i % LaneSize] < 0)
   8955       // This is the first non-undef entry in this slot of a 128-bit lane.
   8956       RepeatedMask[i % LaneSize] = LocalM;
   8957     else if (RepeatedMask[i % LaneSize] != LocalM)
   8958       // Found a mismatch with the repeated mask.
   8959       return false;
   8960   }
   8961   return true;
   8962 }
   8963 
   8964 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
   8965 static bool
   8966 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   8967                                 SmallVectorImpl<int> &RepeatedMask) {
   8968   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
   8969 }
   8970 
   8971 static bool
   8972 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
   8973   SmallVector<int, 32> RepeatedMask;
   8974   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
   8975 }
   8976 
   8977 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
   8978 static bool
   8979 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   8980                                 SmallVectorImpl<int> &RepeatedMask) {
   8981   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
   8982 }
   8983 
   8984 /// Test whether a target shuffle mask is equivalent within each sub-lane.
   8985 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
   8986 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
   8987                                         ArrayRef<int> Mask,
   8988                                         SmallVectorImpl<int> &RepeatedMask) {
   8989   int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
   8990   RepeatedMask.assign(LaneSize, SM_SentinelUndef);
   8991   int Size = Mask.size();
   8992   for (int i = 0; i < Size; ++i) {
   8993     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
   8994     if (Mask[i] == SM_SentinelUndef)
   8995       continue;
   8996     if (Mask[i] == SM_SentinelZero) {
   8997       if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
   8998         return false;
   8999       RepeatedMask[i % LaneSize] = SM_SentinelZero;
   9000       continue;
   9001     }
   9002     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
   9003       // This entry crosses lanes, so there is no way to model this shuffle.
   9004       return false;
   9005 
   9006     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
   9007     // Adjust second vector indices to start at LaneSize instead of Size.
   9008     int LocalM =
   9009         Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
   9010     if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
   9011       // This is the first non-undef entry in this slot of a 128-bit lane.
   9012       RepeatedMask[i % LaneSize] = LocalM;
   9013     else if (RepeatedMask[i % LaneSize] != LocalM)
   9014       // Found a mismatch with the repeated mask.
   9015       return false;
   9016   }
   9017   return true;
   9018 }
   9019 
   9020 /// Checks whether a shuffle mask is equivalent to an explicit list of
   9021 /// arguments.
   9022 ///
   9023 /// This is a fast way to test a shuffle mask against a fixed pattern:
   9024 ///
    9025 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
    9026 ///
    9027 /// It returns true if the mask is exactly as wide as ExpectedMask, and each
    9028 /// element of the mask is either -1 (signifying undef) or the value given in
    9029 /// ExpectedMask.
   9030 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
   9031                                 ArrayRef<int> ExpectedMask) {
   9032   if (Mask.size() != ExpectedMask.size())
   9033     return false;
   9034 
   9035   int Size = Mask.size();
   9036 
   9037   // If the values are build vectors, we can look through them to find
   9038   // equivalent inputs that make the shuffles equivalent.
   9039   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
   9040   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
   9041 
   9042   for (int i = 0; i < Size; ++i) {
   9043     assert(Mask[i] >= -1 && "Out of bound mask element!");
   9044     if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
   9045       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
   9046       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
   9047       if (!MaskBV || !ExpectedBV ||
   9048           MaskBV->getOperand(Mask[i] % Size) !=
   9049               ExpectedBV->getOperand(ExpectedMask[i] % Size))
   9050         return false;
   9051     }
   9052   }
   9053 
   9054   return true;
   9055 }
   9056 
   9057 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
   9058 ///
   9059 /// The masks must be exactly the same width.
   9060 ///
   9061 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
   9062 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
   9063 ///
   9064 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
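         /// For example, <-1, 2, SM_SentinelZero> matches the expected mask
         /// <5, 2, SM_SentinelZero>, but <0, 2, -1> does not match <1, 2, -1>.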
   9065 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
   9066                                       ArrayRef<int> ExpectedMask) {
   9067   int Size = Mask.size();
   9068   if (Size != (int)ExpectedMask.size())
   9069     return false;
   9070 
   9071   for (int i = 0; i < Size; ++i)
   9072     if (Mask[i] == SM_SentinelUndef)
   9073       continue;
   9074     else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
   9075       return false;
   9076     else if (Mask[i] != ExpectedMask[i])
   9077       return false;
   9078 
   9079   return true;
   9080 }
   9081 
   9082 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
   9083 // mask.
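         // For example, Mask <0, 1, 2, 3> with Zeroable bit 1 set becomes
         // <0, SM_SentinelZero, 2, 3>.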
   9084 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
   9085                                                     const APInt &Zeroable) {
   9086   int NumElts = Mask.size();
   9087   assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
   9088 
   9089   SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
   9090   for (int i = 0; i != NumElts; ++i) {
   9091     int M = Mask[i];
   9092     if (M == SM_SentinelUndef)
   9093       continue;
   9094     assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
   9095     TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
   9096   }
   9097   return TargetMask;
   9098 }
   9099 
   9100 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
   9101 // instructions.
   9102 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
   9103   if (VT != MVT::v8i32 && VT != MVT::v8f32)
   9104     return false;
   9105 
   9106   SmallVector<int, 8> Unpcklwd;
   9107   createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
   9108                           /* Unary = */ false);
   9109   SmallVector<int, 8> Unpckhwd;
   9110   createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
   9111                           /* Unary = */ false);
   9112   bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
   9113                          isTargetShuffleEquivalent(Mask, Unpckhwd));
   9114   return IsUnpackwdMask;
   9115 }
   9116 
   9117 /// Get a 4-lane 8-bit shuffle immediate for a mask.
   9118 ///
   9119 /// This helper function produces an 8-bit shuffle immediate corresponding to
   9120 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
   9121 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
   9122 /// example.
   9123 ///
   9124 /// NB: We rely heavily on "undef" masks preserving the input lane.
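         /// For example, the mask <3, 1, 2, 0> produces the immediate
         /// 3 | (1 << 2) | (2 << 4) | (0 << 6) == 0b00100111 (0x27).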
   9125 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
   9126   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
   9127   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
   9128   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
   9129   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
   9130   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
   9131 
   9132   unsigned Imm = 0;
   9133   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
   9134   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
   9135   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
   9136   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
   9137   return Imm;
   9138 }
   9139 
   9140 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
   9141                                           SelectionDAG &DAG) {
   9142   return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
   9143 }
   9144 
   9145 /// Compute whether each element of a shuffle is zeroable.
   9146 ///
   9147 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
   9148 /// Either it is an undef element in the shuffle mask, the element of the input
   9149 /// referenced is undef, or the element of the input referenced is known to be
   9150 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
   9151 /// as many lanes with this technique as possible to simplify the remaining
   9152 /// shuffle.
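         /// For example, shuffling a v2i64 V1 with an all-zeros V2 using the mask
         /// <0, 3> makes element 1 zeroable, since it reads from the zero vector.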
   9153 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
   9154                                             SDValue V1, SDValue V2) {
   9155   APInt Zeroable(Mask.size(), 0);
   9156   V1 = peekThroughBitcasts(V1);
   9157   V2 = peekThroughBitcasts(V2);
   9158 
   9159   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
   9160   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
   9161 
   9162   int VectorSizeInBits = V1.getValueSizeInBits();
   9163   int ScalarSizeInBits = VectorSizeInBits / Mask.size();
   9164   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
   9165 
   9166   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   9167     int M = Mask[i];
   9168     // Handle the easy cases.
   9169     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
   9170       Zeroable.setBit(i);
   9171       continue;
   9172     }
   9173 
   9174     // Determine shuffle input and normalize the mask.
   9175     SDValue V = M < Size ? V1 : V2;
   9176     M %= Size;
   9177 
   9178     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
   9179     if (V.getOpcode() != ISD::BUILD_VECTOR)
   9180       continue;
   9181 
    9182     // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
    9183     // the (larger) source element must be UNDEF/ZERO.
   9184     if ((Size % V.getNumOperands()) == 0) {
   9185       int Scale = Size / V->getNumOperands();
   9186       SDValue Op = V.getOperand(M / Scale);
   9187       if (Op.isUndef() || X86::isZeroNode(Op))
   9188         Zeroable.setBit(i);
   9189       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
   9190         APInt Val = Cst->getAPIntValue();
   9191         Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
   9192         Val = Val.getLoBits(ScalarSizeInBits);
   9193         if (Val == 0)
   9194           Zeroable.setBit(i);
   9195       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
   9196         APInt Val = Cst->getValueAPF().bitcastToAPInt();
   9197         Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
   9198         Val = Val.getLoBits(ScalarSizeInBits);
   9199         if (Val == 0)
   9200           Zeroable.setBit(i);
   9201       }
   9202       continue;
   9203     }
   9204 
    9205     // If the BUILD_VECTOR has more elements, then all the (smaller) source
    9206     // elements must be UNDEF or ZERO.
   9207     if ((V.getNumOperands() % Size) == 0) {
   9208       int Scale = V->getNumOperands() / Size;
   9209       bool AllZeroable = true;
   9210       for (int j = 0; j < Scale; ++j) {
   9211         SDValue Op = V.getOperand((M * Scale) + j);
   9212         AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
   9213       }
   9214       if (AllZeroable)
   9215         Zeroable.setBit(i);
   9216       continue;
   9217     }
   9218   }
   9219 
   9220   return Zeroable;
   9221 }
   9222 
    9223 // The shuffle result has the form:
    9224 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
    9225 // ascending order. Each element of Zeroable corresponds to a particular
    9226 // element of Mask, as described in computeZeroableShuffleElements.
    9227 //
    9228 // The function looks for a sub-mask whose non-zero elements are in
    9229 // increasing order; if such a sub-mask exists, it returns true.
   9230 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
   9231                                      ArrayRef<int> Mask, const EVT &VectorType,
   9232                                      bool &IsZeroSideLeft) {
   9233   int NextElement = -1;
   9234   // Check if the Mask's nonzero elements are in increasing order.
   9235   for (int i = 0, e = Mask.size(); i < e; i++) {
    9236     // Checks if the mask's zero elements are built from only zeros.
   9237     assert(Mask[i] >= -1 && "Out of bound mask element!");
   9238     if (Mask[i] < 0)
   9239       return false;
   9240     if (Zeroable[i])
   9241       continue;
    9242     // Find the lowest non-zero element.
   9243     if (NextElement < 0) {
   9244       NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
   9245       IsZeroSideLeft = NextElement != 0;
   9246     }
    9247     // Exit if the mask's non-zero elements are not in increasing order.
   9248     if (NextElement != Mask[i])
   9249       return false;
   9250     NextElement++;
   9251   }
   9252   return true;
   9253 }
   9254 
   9255 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
   9256 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
   9257                                             ArrayRef<int> Mask, SDValue V1,
   9258                                             SDValue V2,
   9259                                             const APInt &Zeroable,
   9260                                             const X86Subtarget &Subtarget,
   9261                                             SelectionDAG &DAG) {
   9262   int Size = Mask.size();
   9263   int LaneSize = 128 / VT.getScalarSizeInBits();
   9264   const int NumBytes = VT.getSizeInBits() / 8;
   9265   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
   9266 
   9267   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
   9268          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
   9269          (Subtarget.hasBWI() && VT.is512BitVector()));
   9270 
   9271   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
   9272   // Sign bit set in i8 mask means zero element.
   9273   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
   9274 
   9275   SDValue V;
   9276   for (int i = 0; i < NumBytes; ++i) {
   9277     int M = Mask[i / NumEltBytes];
   9278     if (M < 0) {
   9279       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
   9280       continue;
   9281     }
   9282     if (Zeroable[i / NumEltBytes]) {
   9283       PSHUFBMask[i] = ZeroMask;
   9284       continue;
   9285     }
   9286 
   9287     // We can only use a single input of V1 or V2.
   9288     SDValue SrcV = (M >= Size ? V2 : V1);
   9289     if (V && V != SrcV)
   9290       return SDValue();
   9291     V = SrcV;
   9292     M %= Size;
   9293 
   9294     // PSHUFB can't cross lanes, ensure this doesn't happen.
   9295     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
   9296       return SDValue();
   9297 
   9298     M = M % LaneSize;
   9299     M = M * NumEltBytes + (i % NumEltBytes);
   9300     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
   9301   }
   9302   assert(V && "Failed to find a source input");
   9303 
   9304   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
   9305   return DAG.getBitcast(
   9306       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
   9307                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
   9308 }
   9309 
   9310 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
   9311                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
   9312                            const SDLoc &dl);
   9313 
    9314 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
   9315 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
   9316                                           const APInt &Zeroable,
   9317                                           ArrayRef<int> Mask, SDValue &V1,
   9318                                           SDValue &V2, SelectionDAG &DAG,
   9319                                           const X86Subtarget &Subtarget) {
   9320   bool IsLeftZeroSide = true;
   9321   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
   9322                                 IsLeftZeroSide))
   9323     return SDValue();
   9324   unsigned VEXPANDMask = (~Zeroable).getZExtValue();
   9325   MVT IntegerType =
   9326       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
   9327   SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
   9328   unsigned NumElts = VT.getVectorNumElements();
   9329   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
   9330          "Unexpected number of vector elements");
   9331   SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
   9332                               Subtarget, DAG, DL);
   9333   SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
   9334   SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
   9335   return DAG.getSelect(DL, VT, VMask,
   9336                        DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
   9337                        ZeroVector);
   9338 }
   9339 
   9340 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
   9341                                         unsigned &UnpackOpcode, bool IsUnary,
   9342                                         ArrayRef<int> TargetMask,
   9343                                         const SDLoc &DL, SelectionDAG &DAG,
   9344                                         const X86Subtarget &Subtarget) {
   9345   int NumElts = VT.getVectorNumElements();
   9346 
   9347   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
   9348   for (int i = 0; i != NumElts; i += 2) {
   9349     int M1 = TargetMask[i + 0];
   9350     int M2 = TargetMask[i + 1];
   9351     Undef1 &= (SM_SentinelUndef == M1);
   9352     Undef2 &= (SM_SentinelUndef == M2);
   9353     Zero1 &= isUndefOrZero(M1);
   9354     Zero2 &= isUndefOrZero(M2);
   9355   }
   9356   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
   9357          "Zeroable shuffle detected");
   9358 
   9359   // Attempt to match the target mask against the unpack lo/hi mask patterns.
   9360   SmallVector<int, 64> Unpckl, Unpckh;
   9361   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
   9362   if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
   9363     UnpackOpcode = X86ISD::UNPCKL;
   9364     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
   9365     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
   9366     return true;
   9367   }
   9368 
   9369   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
   9370   if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
   9371     UnpackOpcode = X86ISD::UNPCKH;
   9372     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
   9373     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
   9374     return true;
   9375   }
   9376 
    9377   // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
   9378   if (IsUnary && (Zero1 || Zero2)) {
   9379     // Don't bother if we can blend instead.
   9380     if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
   9381         isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
   9382       return false;
   9383 
   9384     bool MatchLo = true, MatchHi = true;
   9385     for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
   9386       int M = TargetMask[i];
   9387 
   9388       // Ignore if the input is known to be zero or the index is undef.
   9389       if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
   9390           (M == SM_SentinelUndef))
   9391         continue;
   9392 
   9393       MatchLo &= (M == Unpckl[i]);
   9394       MatchHi &= (M == Unpckh[i]);
   9395     }
   9396 
   9397     if (MatchLo || MatchHi) {
   9398       UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
   9399       V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
   9400       V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
   9401       return true;
   9402     }
   9403   }
   9404 
   9405   // If a binary shuffle, commute and try again.
   9406   if (!IsUnary) {
   9407     ShuffleVectorSDNode::commuteMask(Unpckl);
   9408     if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
   9409       UnpackOpcode = X86ISD::UNPCKL;
   9410       std::swap(V1, V2);
   9411       return true;
   9412     }
   9413 
   9414     ShuffleVectorSDNode::commuteMask(Unpckh);
   9415     if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
   9416       UnpackOpcode = X86ISD::UNPCKH;
   9417       std::swap(V1, V2);
   9418       return true;
   9419     }
   9420   }
   9421 
   9422   return false;
   9423 }
   9424 
   9425 // X86 has dedicated unpack instructions that can handle specific blend
   9426 // operations: UNPCKH and UNPCKL.
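         // e.g. for v4i32 the UNPCKL shuffle mask is <0,4,1,5> and the UNPCKH
         // shuffle mask is <2,6,3,7>.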
   9427 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
   9428                                            ArrayRef<int> Mask, SDValue V1,
   9429                                            SDValue V2, SelectionDAG &DAG) {
   9430   SmallVector<int, 8> Unpckl;
   9431   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
   9432   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
   9433     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
   9434 
   9435   SmallVector<int, 8> Unpckh;
   9436   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
   9437   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
   9438     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
   9439 
   9440   // Commute and try again.
   9441   ShuffleVectorSDNode::commuteMask(Unpckl);
   9442   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
   9443     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
   9444 
   9445   ShuffleVectorSDNode::commuteMask(Unpckh);
   9446   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
   9447     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
   9448 
   9449   return SDValue();
   9450 }
   9451 
   9452 static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
   9453                                          int Delta) {
   9454   int Size = (int)Mask.size();
   9455   int Split = Size / Delta;
   9456   int TruncatedVectorStart = SwappedOps ? Size : 0;
   9457 
   9458   // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
   9459   if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
   9460     return false;
   9461 
   9462   // The rest of the mask should not refer to the truncated vector's elements.
   9463   if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
   9464                    TruncatedVectorStart + Size))
   9465     return false;
   9466 
   9467   return true;
   9468 }
   9469 
   9470 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
   9471 //
   9472 // An example is the following:
   9473 //
   9474 // t0: ch = EntryToken
   9475 //           t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
   9476 //         t25: v4i32 = truncate t2
   9477 //       t41: v8i16 = bitcast t25
   9478 //       t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
   9479 //       Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
   9480 //     t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
   9481 //   t18: v2i64 = bitcast t51
   9482 //
   9483 // Without avx512vl, this is lowered to:
   9484 //
   9485 // vpmovqd %zmm0, %ymm0
   9486 // vpshufb {{.*#+}} xmm0 =
   9487 // xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
   9488 //
   9489 // But when avx512vl is available, one can just use a single vpmovdw
   9490 // instruction.
   9491 static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
   9492                                            MVT VT, SDValue V1, SDValue V2,
   9493                                            SelectionDAG &DAG,
   9494                                            const X86Subtarget &Subtarget) {
   9495   if (VT != MVT::v16i8 && VT != MVT::v8i16)
   9496     return SDValue();
   9497 
   9498   if (Mask.size() != VT.getVectorNumElements())
   9499     return SDValue();
   9500 
   9501   bool SwappedOps = false;
   9502 
   9503   if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
   9504     if (!ISD::isBuildVectorAllZeros(V1.getNode()))
   9505       return SDValue();
   9506 
   9507     std::swap(V1, V2);
   9508     SwappedOps = true;
   9509   }
   9510 
   9511   // Look for:
   9512   //
   9513   // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
   9514   // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
   9515   //
   9516   // and similar ones.
   9517   if (V1.getOpcode() != ISD::BITCAST)
   9518     return SDValue();
   9519   if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
   9520     return SDValue();
   9521 
   9522   SDValue Src = V1.getOperand(0).getOperand(0);
   9523   MVT SrcVT = Src.getSimpleValueType();
   9524 
    9525   // The vptrunc** instructions truncating 128-bit and 256-bit vectors
   9526   // are only available with avx512vl.
   9527   if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
   9528     return SDValue();
   9529 
   9530   // Down Convert Word to Byte is only available with avx512bw. The case with
   9531   // 256-bit output doesn't contain a shuffle and is therefore not handled here.
   9532   if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
   9533       !Subtarget.hasBWI())
   9534     return SDValue();
   9535 
    9536   // element of the truncated and bitcast vector.
   9537   // element of the vector truncated and bitcasted.
   9538   if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
   9539       !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
   9540     return SDValue();
   9541 
   9542   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
   9543 }
   9544 
   9545 // X86 has dedicated pack instructions that can handle specific truncation
   9546 // operations: PACKSS and PACKUS.
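         // e.g. packing two v4i32 inputs into a v8i16 result corresponds to the
         // shuffle mask <0,2,4,6,8,10,12,14> (the low word of each dword of V1,
         // then of V2).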
   9547 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
   9548                                        SDValue &V2, unsigned &PackOpcode,
   9549                                        ArrayRef<int> TargetMask,
   9550                                        SelectionDAG &DAG,
   9551                                        const X86Subtarget &Subtarget) {
   9552   unsigned NumElts = VT.getVectorNumElements();
   9553   unsigned BitSize = VT.getScalarSizeInBits();
   9554   MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
   9555   MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
   9556 
   9557   auto MatchPACK = [&](SDValue N1, SDValue N2) {
   9558     SDValue VV1 = DAG.getBitcast(PackVT, N1);
   9559     SDValue VV2 = DAG.getBitcast(PackVT, N2);
   9560     if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
   9561       APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
   9562       if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
   9563           (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
   9564         V1 = VV1;
   9565         V2 = VV2;
   9566         SrcVT = PackVT;
   9567         PackOpcode = X86ISD::PACKUS;
   9568         return true;
   9569       }
   9570     }
   9571     if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
   9572         (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
   9573       V1 = VV1;
   9574       V2 = VV2;
   9575       SrcVT = PackVT;
   9576       PackOpcode = X86ISD::PACKSS;
   9577       return true;
   9578     }
   9579     return false;
   9580   };
   9581 
   9582   // Try binary shuffle.
   9583   SmallVector<int, 32> BinaryMask;
   9584   createPackShuffleMask(VT, BinaryMask, false);
   9585   if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
   9586     if (MatchPACK(V1, V2))
   9587       return true;
   9588 
   9589   // Try unary shuffle.
   9590   SmallVector<int, 32> UnaryMask;
   9591   createPackShuffleMask(VT, UnaryMask, true);
   9592   if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
   9593     if (MatchPACK(V1, V1))
   9594       return true;
   9595 
   9596   return false;
   9597 }
   9598 
   9599 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
   9600                                           ArrayRef<int> Mask, SDValue V1,
   9601                                           SDValue V2, SelectionDAG &DAG,
   9602                                           const X86Subtarget &Subtarget) {
   9603   MVT PackVT;
   9604   unsigned PackOpcode;
   9605   if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
   9606                                  Subtarget))
   9607     return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
   9608                        DAG.getBitcast(PackVT, V2));
   9609 
   9610   return SDValue();
   9611 }
   9612 
   9613 /// Try to emit a bitmask instruction for a shuffle.
   9614 ///
   9615 /// This handles cases where we can model a blend exactly as a bitmask due to
   9616 /// one of the inputs being zeroable.
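         ///
         /// e.g. the v4i32 mask <0, 5, 2, 7> with elements 1 and 3 zeroable is
         /// lowered as V1 & <-1, 0, -1, 0>.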
   9617 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
   9618                                            SDValue V2, ArrayRef<int> Mask,
   9619                                            const APInt &Zeroable,
   9620                                            SelectionDAG &DAG) {
   9621   assert(!VT.isFloatingPoint() && "Floating point types are not supported");
   9622   MVT EltVT = VT.getVectorElementType();
   9623   SDValue Zero = DAG.getConstant(0, DL, EltVT);
   9624   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
   9625   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
   9626   SDValue V;
   9627   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   9628     if (Zeroable[i])
   9629       continue;
   9630     if (Mask[i] % Size != i)
   9631       return SDValue(); // Not a blend.
   9632     if (!V)
   9633       V = Mask[i] < Size ? V1 : V2;
   9634     else if (V != (Mask[i] < Size ? V1 : V2))
   9635       return SDValue(); // Can only let one input through the mask.
   9636 
   9637     VMaskOps[i] = AllOnes;
   9638   }
   9639   if (!V)
   9640     return SDValue(); // No non-zeroable elements!
   9641 
   9642   SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
   9643   return DAG.getNode(ISD::AND, DL, VT, V, VMask);
   9644 }
   9645 
   9646 /// Try to emit a blend instruction for a shuffle using bit math.
   9647 ///
   9648 /// This is used as a fallback approach when first class blend instructions are
   9649 /// unavailable. Currently it is only suitable for integer vectors, but could
   9650 /// be generalized for floating point vectors if desirable.
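         ///
         /// e.g. the v8i16 blend mask <0, 9, 2, 11, 4, 13, 6, 15> becomes
         /// (V1 & M) | (V2 & ~M) with M = <-1, 0, -1, 0, -1, 0, -1, 0>.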
   9651 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
   9652                                             SDValue V2, ArrayRef<int> Mask,
   9653                                             SelectionDAG &DAG) {
   9654   assert(VT.isInteger() && "Only supports integer vector types!");
   9655   MVT EltVT = VT.getVectorElementType();
   9656   SDValue Zero = DAG.getConstant(0, DL, EltVT);
   9657   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
   9658   SmallVector<SDValue, 16> MaskOps;
   9659   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   9660     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
   9661       return SDValue(); // Shuffled input!
   9662     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
   9663   }
   9664 
   9665   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
   9666   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
   9667   // We have to cast V2 around.
   9668   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
   9669   V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
   9670                                       DAG.getBitcast(MaskVT, V1Mask),
   9671                                       DAG.getBitcast(MaskVT, V2)));
   9672   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
   9673 }
   9674 
   9675 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
   9676                                     SDValue PreservedSrc,
   9677                                     const X86Subtarget &Subtarget,
   9678                                     SelectionDAG &DAG);
   9679 
   9680 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
   9681                                       MutableArrayRef<int> TargetMask,
   9682                                       bool &ForceV1Zero, bool &ForceV2Zero,
   9683                                       uint64_t &BlendMask) {
   9684   bool V1IsZeroOrUndef =
   9685       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
   9686   bool V2IsZeroOrUndef =
   9687       V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
   9688 
   9689   BlendMask = 0;
   9690   ForceV1Zero = false, ForceV2Zero = false;
   9691   assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
   9692 
   9693   // Attempt to generate the binary blend mask. If an input is zero then
   9694   // we can use any lane.
   9695   // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
   9696   for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
   9697     int M = TargetMask[i];
   9698     if (M == SM_SentinelUndef)
   9699       continue;
   9700     if (M == i)
   9701       continue;
   9702     if (M == i + Size) {
   9703       BlendMask |= 1ull << i;
   9704       continue;
   9705     }
   9706     if (M == SM_SentinelZero) {
   9707       if (V1IsZeroOrUndef) {
   9708         ForceV1Zero = true;
   9709         TargetMask[i] = i;
   9710         continue;
   9711       }
   9712       if (V2IsZeroOrUndef) {
   9713         ForceV2Zero = true;
   9714         BlendMask |= 1ull << i;
   9715         TargetMask[i] = i + Size;
   9716         continue;
   9717       }
   9718     }
   9719     return false;
   9720   }
   9721   return true;
   9722 }
   9723 
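         /// Repeat each blend mask bit 'Scale' times, e.g. scaling the 4-bit
         /// blend mask 0b0101 by 2 gives the 8-bit blend mask 0b00110011.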
   9724 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
   9725                                             int Scale) {
   9726   uint64_t ScaledMask = 0;
   9727   for (int i = 0; i != Size; ++i)
   9728     if (BlendMask & (1ull << i))
   9729       ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
   9730   return ScaledMask;
   9731 }
   9732 
   9733 /// Try to emit a blend instruction for a shuffle.
   9734 ///
   9735 /// This doesn't do any checks for the availability of instructions for blending
   9736 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
   9737 /// be matched in the backend with the type given. What it does check for is
   9738 /// that the shuffle mask is a blend, or convertible into a blend with zero.
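         ///
         /// e.g. the v4i32 mask <0, 5, 2, 7> takes elements 1 and 3 from V2 and
         /// yields the immediate blend mask 0b1010.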
   9739 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
   9740                                          SDValue V2, ArrayRef<int> Original,
   9741                                          const APInt &Zeroable,
   9742                                          const X86Subtarget &Subtarget,
   9743                                          SelectionDAG &DAG) {
   9744   SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
   9745 
   9746   uint64_t BlendMask = 0;
   9747   bool ForceV1Zero = false, ForceV2Zero = false;
   9748   if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
   9749                                  BlendMask))
   9750     return SDValue();
   9751 
   9752   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
   9753   if (ForceV1Zero)
   9754     V1 = getZeroVector(VT, Subtarget, DAG, DL);
   9755   if (ForceV2Zero)
   9756     V2 = getZeroVector(VT, Subtarget, DAG, DL);
   9757 
   9758   switch (VT.SimpleTy) {
   9759   case MVT::v2f64:
   9760   case MVT::v4f32:
   9761   case MVT::v4f64:
   9762   case MVT::v8f32:
   9763     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
   9764                        DAG.getConstant(BlendMask, DL, MVT::i8));
   9765 
   9766   case MVT::v4i64:
   9767   case MVT::v8i32:
   9768     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
   9769     LLVM_FALLTHROUGH;
   9770   case MVT::v2i64:
   9771   case MVT::v4i32:
   9772     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
   9773     // that instruction.
   9774     if (Subtarget.hasAVX2()) {
   9775       // Scale the blend by the number of 32-bit dwords per element.
   9776       int Scale =  VT.getScalarSizeInBits() / 32;
   9777       BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
   9778       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
   9779       V1 = DAG.getBitcast(BlendVT, V1);
   9780       V2 = DAG.getBitcast(BlendVT, V2);
   9781       return DAG.getBitcast(
   9782           VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
   9783                           DAG.getConstant(BlendMask, DL, MVT::i8)));
   9784     }
   9785     LLVM_FALLTHROUGH;
   9786   case MVT::v8i16: {
   9787     // For integer shuffles we need to expand the mask and cast the inputs to
   9788     // v8i16s prior to blending.
   9789     int Scale = 8 / VT.getVectorNumElements();
   9790     BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
   9791     V1 = DAG.getBitcast(MVT::v8i16, V1);
   9792     V2 = DAG.getBitcast(MVT::v8i16, V2);
   9793     return DAG.getBitcast(VT,
   9794                           DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
   9795                                       DAG.getConstant(BlendMask, DL, MVT::i8)));
   9796   }
   9797 
   9798   case MVT::v16i16: {
   9799     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
   9800     SmallVector<int, 8> RepeatedMask;
   9801     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
   9802       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
   9803       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
   9804       BlendMask = 0;
   9805       for (int i = 0; i < 8; ++i)
   9806         if (RepeatedMask[i] >= 8)
   9807           BlendMask |= 1ull << i;
   9808       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
   9809                          DAG.getConstant(BlendMask, DL, MVT::i8));
   9810     }
   9811     LLVM_FALLTHROUGH;
   9812   }
   9813   case MVT::v16i8:
   9814   case MVT::v32i8: {
   9815     assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
   9816            "256-bit byte-blends require AVX2 support!");
   9817 
   9818     if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
   9819       MVT IntegerType =
   9820           MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
   9821       SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
   9822       return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
   9823     }
   9824 
   9825     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
   9826     if (SDValue Masked =
   9827             lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
   9828       return Masked;
   9829 
   9830     // Scale the blend by the number of bytes per element.
   9831     int Scale = VT.getScalarSizeInBits() / 8;
   9832 
   9833     // This form of blend is always done on bytes. Compute the byte vector
   9834     // type.
   9835     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
   9836 
   9837     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
   9838     // mix of LLVM's code generator and the x86 backend. We tell the code
   9839     // generator that boolean values in the elements of an x86 vector register
   9840     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
   9841     // mapping a select to operand #1, and 'false' mapping to operand #2. The
   9842     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
   9843     // of the element (the remaining are ignored) and 0 in that high bit would
   9844     // mean operand #1 while 1 in the high bit would mean operand #2. So while
   9845     // the LLVM model for boolean values in vector elements gets the relevant
   9846     // bit set, it is set backwards and over constrained relative to x86's
   9847     // actual model.
   9848     SmallVector<SDValue, 32> VSELECTMask;
   9849     for (int i = 0, Size = Mask.size(); i < Size; ++i)
   9850       for (int j = 0; j < Scale; ++j)
   9851         VSELECTMask.push_back(
   9852             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
   9853                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
   9854                                           MVT::i8));
   9855 
   9856     V1 = DAG.getBitcast(BlendVT, V1);
   9857     V2 = DAG.getBitcast(BlendVT, V2);
   9858     return DAG.getBitcast(
   9859         VT,
   9860         DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
   9861                       V1, V2));
   9862   }
   9863   case MVT::v16f32:
   9864   case MVT::v8f64:
   9865   case MVT::v8i64:
   9866   case MVT::v16i32:
   9867   case MVT::v32i16:
   9868   case MVT::v64i8: {
   9869     MVT IntegerType =
   9870         MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
   9871     SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
   9872     return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
   9873   }
   9874   default:
   9875     llvm_unreachable("Not a supported integer vector type!");
   9876   }
   9877 }
   9878 
   9879 /// Try to lower as a blend of elements from two inputs followed by
   9880 /// a single-input permutation.
   9881 ///
   9882 /// This matches the pattern where we can blend elements from two inputs and
   9883 /// then reduce the shuffle to a single-input permutation.
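         ///
         /// e.g. the 4-element mask <2, 5, 0, 7> is lowered as the blend
         /// <0, 5, 2, 7> followed by the single-input permutation <2, 1, 0, 3>.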
   9884 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
   9885                                                    SDValue V1, SDValue V2,
   9886                                                    ArrayRef<int> Mask,
   9887                                                    SelectionDAG &DAG) {
   9888   // We build up the blend mask while checking whether a blend is a viable way
   9889   // to reduce the shuffle.
   9890   SmallVector<int, 32> BlendMask(Mask.size(), -1);
   9891   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
   9892 
   9893   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
   9894     if (Mask[i] < 0)
   9895       continue;
   9896 
   9897     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
   9898 
   9899     if (BlendMask[Mask[i] % Size] < 0)
   9900       BlendMask[Mask[i] % Size] = Mask[i];
   9901     else if (BlendMask[Mask[i] % Size] != Mask[i])
   9902       return SDValue(); // Can't blend in the needed input!
   9903 
   9904     PermuteMask[i] = Mask[i] % Size;
   9905   }
   9906 
   9907   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
   9908   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
   9909 }
   9910 
   9911 /// Generic routine to decompose a shuffle and blend into independent
   9912 /// blends and permutes.
   9913 ///
   9914 /// This matches the extremely common pattern for handling combined
   9915 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
   9916 /// operations. It will try to pick the best arrangement of shuffles and
   9917 /// blends.
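         ///
         /// e.g. the 4-element mask <3, 7, 1, 5> decomposes into the V1 shuffle
         /// <3, -1, 1, -1>, the V2 shuffle <-1, 3, -1, 1> and the blend
         /// <0, 5, 2, 7>.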
   9918 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
   9919                                                           MVT VT, SDValue V1,
   9920                                                           SDValue V2,
   9921                                                           ArrayRef<int> Mask,
   9922                                                           SelectionDAG &DAG) {
   9923   // Shuffle the input elements into the desired positions in V1 and V2 and
   9924   // blend them together.
   9925   SmallVector<int, 32> V1Mask(Mask.size(), -1);
   9926   SmallVector<int, 32> V2Mask(Mask.size(), -1);
   9927   SmallVector<int, 32> BlendMask(Mask.size(), -1);
   9928   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   9929     if (Mask[i] >= 0 && Mask[i] < Size) {
   9930       V1Mask[i] = Mask[i];
   9931       BlendMask[i] = i;
   9932     } else if (Mask[i] >= Size) {
   9933       V2Mask[i] = Mask[i] - Size;
   9934       BlendMask[i] = i + Size;
   9935     }
   9936 
   9937   // Try to lower with the simpler initial blend strategy unless one of the
   9938   // input shuffles would be a no-op. We prefer to shuffle inputs as the
   9939   // shuffle may be able to fold with a load or other benefit. However, when
   9940   // we'll have to do 2x as many shuffles in order to achieve this, blending
   9941   // first is a better strategy.
   9942   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
   9943     if (SDValue BlendPerm =
   9944             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
   9945       return BlendPerm;
   9946 
   9947   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
   9948   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
   9949   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
   9950 }
   9951 
   9952 /// Try to lower a vector shuffle as a rotation.
   9953 ///
    9954 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
   9955 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
   9956                                       ArrayRef<int> Mask) {
   9957   int NumElts = Mask.size();
   9958 
   9959   // We need to detect various ways of spelling a rotation:
   9960   //   [11, 12, 13, 14, 15,  0,  1,  2]
   9961   //   [-1, 12, 13, 14, -1, -1,  1, -1]
   9962   //   [-1, -1, -1, -1, -1, -1,  1,  2]
   9963   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
   9964   //   [-1,  4,  5,  6, -1, -1,  9, -1]
   9965   //   [-1,  4,  5,  6, -1, -1, -1, -1]
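           // e.g. for the first mask above this returns a rotation of 3 with
           // Lo = V1 and Hi = V2.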
   9966   int Rotation = 0;
   9967   SDValue Lo, Hi;
   9968   for (int i = 0; i < NumElts; ++i) {
   9969     int M = Mask[i];
   9970     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
   9971            "Unexpected mask index.");
   9972     if (M < 0)
   9973       continue;
   9974 
   9975     // Determine where a rotated vector would have started.
   9976     int StartIdx = i - (M % NumElts);
   9977     if (StartIdx == 0)
   9978       // The identity rotation isn't interesting, stop.
   9979       return -1;
   9980 
    9981     // If we found the tail of a vector, the rotation must be the missing
    9982     // front. If we found the head of a vector, the rotation is how many of
    9983     // the head's elements are present.
   9984     int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
   9985 
   9986     if (Rotation == 0)
   9987       Rotation = CandidateRotation;
   9988     else if (Rotation != CandidateRotation)
   9989       // The rotations don't match, so we can't match this mask.
   9990       return -1;
   9991 
   9992     // Compute which value this mask is pointing at.
   9993     SDValue MaskV = M < NumElts ? V1 : V2;
   9994 
   9995     // Compute which of the two target values this index should be assigned
   9996     // to. This reflects whether the high elements are remaining or the low
   9997     // elements are remaining.
   9998     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
   9999 
   10000     // Either set up this value if we've not encountered it before, or check
   10001     // that it remains consistent.
   10002     if (!TargetV)
   10003       TargetV = MaskV;
   10004     else if (TargetV != MaskV)
   10005       // This may be a rotation, but it pulls from the inputs in some
   10006       // unsupported interleaving.
   10007       return -1;
   10008   }
   10009 
   10010   // Check that we successfully analyzed the mask, and normalize the results.
   10011   assert(Rotation != 0 && "Failed to locate a viable rotation!");
   10012   assert((Lo || Hi) && "Failed to find a rotated input vector!");
   10013   if (!Lo)
   10014     Lo = Hi;
   10015   else if (!Hi)
   10016     Hi = Lo;
   10017 
   10018   V1 = Lo;
   10019   V2 = Hi;
   10020 
   10021   return Rotation;
   10022 }
   10023 
   10024 /// Try to lower a vector shuffle as a byte rotation.
   10025 ///
   10026 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
   10027 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
   10028 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
    10029 /// try to generically lower a vector shuffle through such a pattern. It
   10030 /// does not check for the profitability of lowering either as PALIGNR or
   10031 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
   10032 /// This matches shuffle vectors that look like:
   10033 ///
   10034 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
   10035 ///
   10036 /// Essentially it concatenates V1 and V2, shifts right by some number of
   10037 /// elements, and takes the low elements as the result. Note that while this is
   10038 /// specified as a *right shift* because x86 is little-endian, it is a *left
   10039 /// rotate* of the vector lanes.
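          ///
          /// e.g. for the v8i16 mask above the element rotation of 3 scales to a
          /// byte rotation of 3 * 2 = 6.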
   10040 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
   10041                                           ArrayRef<int> Mask) {
    10042   // Don't accept any shuffles with zeroable elements.
   10043   if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
   10044     return -1;
   10045 
   10046   // PALIGNR works on 128-bit lanes.
   10047   SmallVector<int, 16> RepeatedMask;
   10048   if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
   10049     return -1;
   10050 
   10051   int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
   10052   if (Rotation <= 0)
   10053     return -1;
   10054 
   10055   // PALIGNR rotates bytes, so we need to scale the
   10056   // rotation based on how many bytes are in the vector lane.
   10057   int NumElts = RepeatedMask.size();
   10058   int Scale = 16 / NumElts;
   10059   return Rotation * Scale;
   10060 }
   10061 
   10062 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
   10063                                               SDValue V1, SDValue V2,
   10064                                               ArrayRef<int> Mask,
   10065                                               const X86Subtarget &Subtarget,
   10066                                               SelectionDAG &DAG) {
   10067   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
   10068 
   10069   SDValue Lo = V1, Hi = V2;
   10070   int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
   10071   if (ByteRotation <= 0)
   10072     return SDValue();
   10073 
   10074   // Cast the inputs to i8 vector of correct length to match PALIGNR or
   10075   // PSLLDQ/PSRLDQ.
   10076   MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
   10077   Lo = DAG.getBitcast(ByteVT, Lo);
   10078   Hi = DAG.getBitcast(ByteVT, Hi);
   10079 
   10080   // SSSE3 targets can use the palignr instruction.
   10081   if (Subtarget.hasSSSE3()) {
   10082     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
   10083            "512-bit PALIGNR requires BWI instructions");
   10084     return DAG.getBitcast(
   10085         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
   10086                         DAG.getConstant(ByteRotation, DL, MVT::i8)));
   10087   }
   10088 
   10089   assert(VT.is128BitVector() &&
   10090          "Rotate-based lowering only supports 128-bit lowering!");
   10091   assert(Mask.size() <= 16 &&
   10092          "Can shuffle at most 16 bytes in a 128-bit vector!");
   10093   assert(ByteVT == MVT::v16i8 &&
   10094          "SSE2 rotate lowering only needed for v16i8!");
   10095 
   10096   // Default SSE2 implementation
   10097   int LoByteShift = 16 - ByteRotation;
   10098   int HiByteShift = ByteRotation;
   10099 
   10100   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
   10101                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
   10102   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
   10103                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
   10104   return DAG.getBitcast(VT,
   10105                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
   10106 }
   10107 
   10108 /// Try to lower a vector shuffle as a dword/qword rotation.
   10109 ///
    10110 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
    10111 /// rotation of the concatenation of two vectors; this routine will
    10112 /// try to generically lower a vector shuffle through such a pattern.
   10113 ///
   10114 /// Essentially it concatenates V1 and V2, shifts right by some number of
   10115 /// elements, and takes the low elements as the result. Note that while this is
   10116 /// specified as a *right shift* because x86 is little-endian, it is a *left
   10117 /// rotate* of the vector lanes.
   10118 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
   10119                                           SDValue V1, SDValue V2,
   10120                                           ArrayRef<int> Mask,
   10121                                           const X86Subtarget &Subtarget,
   10122                                           SelectionDAG &DAG) {
   10123   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
   10124          "Only 32-bit and 64-bit elements are supported!");
   10125 
   10126   // 128/256-bit vectors are only supported with VLX.
   10127   assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
   10128          && "VLX required for 128/256-bit vectors");
   10129 
   10130   SDValue Lo = V1, Hi = V2;
   10131   int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
   10132   if (Rotation <= 0)
   10133     return SDValue();
   10134 
   10135   return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
   10136                      DAG.getConstant(Rotation, DL, MVT::i8));
   10137 }
   10138 
   10139 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
   10140 ///
   10141 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
   10142 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
   10143 /// matches elements from one of the input vectors shuffled to the left or
   10144 /// right with zeroable elements 'shifted in'. It handles both the strictly
   10145 /// bit-wise element shifts and the byte shift across an entire 128-bit double
   10146 /// quad word lane.
   10147 ///
    10148 /// PSLL : (little-endian) left bit shift.
   10149 /// [ zz, 0, zz,  2 ]
   10150 /// [ -1, 4, zz, -1 ]
   10151 /// PSRL : (little-endian) right bit shift.
   10152 /// [  1, zz,  3, zz]
   10153 /// [ -1, -1,  7, zz]
   10154 /// PSLLDQ : (little-endian) left byte shift
   10155 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
   10156 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
   10157 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
   10158 /// PSRLDQ : (little-endian) right byte shift
   10159 /// [  5, 6,  7, zz, zz, zz, zz, zz]
   10160 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
   10161 /// [  1, 2, -1, -1, -1, -1, zz, zz]
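          ///
          /// e.g. the v4i32 mask [zz, 0, zz, 2] above matches a 32-bit VSHLI of
          /// v2i64 elements (a PSLLQ by 32 bits).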
   10162 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
   10163                                      unsigned ScalarSizeInBits,
   10164                                      ArrayRef<int> Mask, int MaskOffset,
   10165                                      const APInt &Zeroable,
   10166                                      const X86Subtarget &Subtarget) {
   10167   int Size = Mask.size();
   10168   unsigned SizeInBits = Size * ScalarSizeInBits;
   10169 
   10170   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
   10171     for (int i = 0; i < Size; i += Scale)
   10172       for (int j = 0; j < Shift; ++j)
   10173         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
   10174           return false;
   10175 
   10176     return true;
   10177   };
   10178 
   10179   auto MatchShift = [&](int Shift, int Scale, bool Left) {
   10180     for (int i = 0; i != Size; i += Scale) {
   10181       unsigned Pos = Left ? i + Shift : i;
   10182       unsigned Low = Left ? i : i + Shift;
   10183       unsigned Len = Scale - Shift;
   10184       if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
   10185         return -1;
   10186     }
   10187 
   10188     int ShiftEltBits = ScalarSizeInBits * Scale;
   10189     bool ByteShift = ShiftEltBits > 64;
   10190     Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
   10191                   : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
   10192     int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
   10193 
   10194     // Normalize the scale for byte shifts to still produce an i64 element
   10195     // type.
   10196     Scale = ByteShift ? Scale / 2 : Scale;
   10197 
   10198     // We need to round trip through the appropriate type for the shift.
   10199     MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
   10200     ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
   10201                         : MVT::getVectorVT(ShiftSVT, Size / Scale);
   10202     return (int)ShiftAmt;
   10203   };
   10204 
   10205   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
   10206   // keep doubling the size of the integer elements up to that. We can
   10207   // then shift the elements of the integer vector by whole multiples of
   10208   // their width within the elements of the larger integer vector. Test each
   10209   // multiple to see if we can find a match with the moved element indices
   10210   // and that the shifted in elements are all zeroable.
   10211   unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
   10212   for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
   10213     for (int Shift = 1; Shift != Scale; ++Shift)
   10214       for (bool Left : {true, false})
   10215         if (CheckZeros(Shift, Scale, Left)) {
   10216           int ShiftAmt = MatchShift(Shift, Scale, Left);
   10217           if (0 < ShiftAmt)
   10218             return ShiftAmt;
   10219         }
   10220 
   10221   // no match
   10222   return -1;
   10223 }
   10224 
   10225 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
   10226                                          SDValue V2, ArrayRef<int> Mask,
   10227                                          const APInt &Zeroable,
   10228                                          const X86Subtarget &Subtarget,
   10229                                          SelectionDAG &DAG) {
   10230   int Size = Mask.size();
   10231   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
   10232 
   10233   MVT ShiftVT;
   10234   SDValue V = V1;
   10235   unsigned Opcode;
   10236 
   10237   // Try to match shuffle against V1 shift.
   10238   int ShiftAmt = matchVectorShuffleAsShift(
   10239       ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
   10240 
   10241   // If V1 failed, try to match shuffle against V2 shift.
   10242   if (ShiftAmt < 0) {
   10243     ShiftAmt =
   10244         matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
   10245                                   Mask, Size, Zeroable, Subtarget);
   10246     V = V2;
   10247   }
   10248 
   10249   if (ShiftAmt < 0)
   10250     return SDValue();
   10251 
   10252   assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
   10253          "Illegal integer vector type");
   10254   V = DAG.getBitcast(ShiftVT, V);
   10255   V = DAG.getNode(Opcode, DL, ShiftVT, V,
   10256                   DAG.getConstant(ShiftAmt, DL, MVT::i8));
   10257   return DAG.getBitcast(VT, V);
   10258 }
   10259 
   10260 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
   10261 // Remainder of lower half result is zero and upper half is all undef.
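          // e.g. the v8i16 mask <2, 3, zz, zz, u, u, u, u> is matched with
          // BitLen = 32 and BitIdx = 32.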
   10262 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
   10263                                       ArrayRef<int> Mask, uint64_t &BitLen,
   10264                                       uint64_t &BitIdx, const APInt &Zeroable) {
   10265   int Size = Mask.size();
   10266   int HalfSize = Size / 2;
   10267   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
   10268   assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
   10269 
   10270   // Upper half must be undefined.
   10271   if (!isUndefInRange(Mask, HalfSize, HalfSize))
   10272     return false;
   10273 
   10274   // Determine the extraction length from the part of the
   10275   // lower half that isn't zeroable.
   10276   int Len = HalfSize;
   10277   for (; Len > 0; --Len)
   10278     if (!Zeroable[Len - 1])
   10279       break;
   10280   assert(Len > 0 && "Zeroable shuffle mask");
   10281 
   10282   // Attempt to match first Len sequential elements from the lower half.
   10283   SDValue Src;
   10284   int Idx = -1;
   10285   for (int i = 0; i != Len; ++i) {
   10286     int M = Mask[i];
   10287     if (M == SM_SentinelUndef)
   10288       continue;
   10289     SDValue &V = (M < Size ? V1 : V2);
   10290     M = M % Size;
   10291 
   10292     // The extracted elements must start at a valid index and all mask
   10293     // elements must be in the lower half.
   10294     if (i > M || M >= HalfSize)
   10295       return false;
   10296 
   10297     if (Idx < 0 || (Src == V && Idx == (M - i))) {
   10298       Src = V;
   10299       Idx = M - i;
   10300       continue;
   10301     }
   10302     return false;
   10303   }
   10304 
   10305   if (!Src || Idx < 0)
   10306     return false;
   10307 
   10308   assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
   10309   BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
   10310   BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
   10311   V1 = Src;
   10312   return true;
   10313 }
   10314 
   10315 // INSERTQ: Extract lowest Len elements from lower half of second source and
   10316 // insert over first source, starting at Idx.
   10317 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
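          // e.g. the v8i16 mask <0, 8, 9, 3, u, u, u, u> is matched with
          // BitLen = 32 and BitIdx = 16 (V2[0..1] inserted over V1[1..2]).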
   10318 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
   10319                                         ArrayRef<int> Mask, uint64_t &BitLen,
   10320                                         uint64_t &BitIdx) {
   10321   int Size = Mask.size();
   10322   int HalfSize = Size / 2;
   10323   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
   10324 
   10325   // Upper half must be undefined.
   10326   if (!isUndefInRange(Mask, HalfSize, HalfSize))
   10327     return false;
   10328 
   10329   for (int Idx = 0; Idx != HalfSize; ++Idx) {
   10330     SDValue Base;
   10331 
   10332     // Attempt to match first source from mask before insertion point.
   10333     if (isUndefInRange(Mask, 0, Idx)) {
   10334       /* EMPTY */
   10335     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
   10336       Base = V1;
   10337     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
   10338       Base = V2;
   10339     } else {
   10340       continue;
   10341     }
   10342 
   10343     // Extend the extraction length looking to match both the insertion of
   10344     // the second source and the remaining elements of the first.
   10345     for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
   10346       SDValue Insert;
   10347       int Len = Hi - Idx;
   10348 
   10349       // Match insertion.
   10350       if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
   10351         Insert = V1;
   10352       } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
   10353         Insert = V2;
   10354       } else {
   10355         continue;
   10356       }
   10357 
   10358       // Match the remaining elements of the lower half.
   10359       if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
   10360         /* EMPTY */
   10361       } else if ((!Base || (Base == V1)) &&
   10362                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
   10363         Base = V1;
   10364       } else if ((!Base || (Base == V2)) &&
   10365                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
   10366                                             Size + Hi)) {
   10367         Base = V2;
   10368       } else {
   10369         continue;
   10370       }
   10371 
   10372       BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
   10373       BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
   10374       V1 = Base;
   10375       V2 = Insert;
   10376       return true;
   10377     }
   10378   }
   10379 
   10380   return false;
   10381 }
   10382 
   10383 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
   10384 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
   10385                                            SDValue V2, ArrayRef<int> Mask,
   10386                                            const APInt &Zeroable,
   10387                                            SelectionDAG &DAG) {
   10388   uint64_t BitLen, BitIdx;
   10389   if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
   10390     return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
   10391                        DAG.getConstant(BitLen, DL, MVT::i8),
   10392                        DAG.getConstant(BitIdx, DL, MVT::i8));
   10393 
   10394   if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
   10395     return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
   10396                        V2 ? V2 : DAG.getUNDEF(VT),
   10397                        DAG.getConstant(BitLen, DL, MVT::i8),
   10398                        DAG.getConstant(BitIdx, DL, MVT::i8));
   10399 
   10400   return SDValue();
   10401 }
   10402 
   10403 /// Lower a vector shuffle as a zero or any extension.
   10404 ///
   10405 /// Given a specific number of elements, element bit width, and extension
   10406 /// stride, produce either a zero or any extension based on the available
   10407 /// features of the subtarget. The extended elements are consecutive and
    10408 /// may begin at an offset element index in the input; to avoid excess
    10409 /// shuffling, the offset must either be in the bottom lane or at the
    10410 /// start of a higher lane. All extended elements must be from the
    10411 /// same lane.
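          ///
          /// e.g. with SSE4.1, a v16i8 zero extension with Scale == 4 and
          /// Offset == 0 (mask <0,z,z,z,1,z,z,z, ...>) lowers to a single
          /// PMOVZXBD (a v16i8 to v4i32 VZEXT).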
   10412 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   10413     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
   10414     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   10415   assert(Scale > 1 && "Need a scale to extend.");
   10416   int EltBits = VT.getScalarSizeInBits();
   10417   int NumElements = VT.getVectorNumElements();
   10418   int NumEltsPerLane = 128 / EltBits;
   10419   int OffsetLane = Offset / NumEltsPerLane;
   10420   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
   10421          "Only 8, 16, and 32 bit elements can be extended.");
   10422   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
   10423   assert(0 <= Offset && "Extension offset must be positive.");
   10424   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
   10425          "Extension offset must be in the first lane or start an upper lane.");
   10426 
   10427   // Check that an index is in same lane as the base offset.
   10428   auto SafeOffset = [&](int Idx) {
   10429     return OffsetLane == (Idx / NumEltsPerLane);
   10430   };
   10431 
   10432   // Shift along an input so that the offset base moves to the first element.
   10433   auto ShuffleOffset = [&](SDValue V) {
   10434     if (!Offset)
   10435       return V;
   10436 
   10437     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
   10438     for (int i = 0; i * Scale < NumElements; ++i) {
   10439       int SrcIdx = i + Offset;
   10440       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
   10441     }
   10442     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
   10443   };
   10444 
   10445   // Found a valid zext mask! Try various lowering strategies based on the
   10446   // input type and available ISA extensions.
   10447   if (Subtarget.hasSSE41()) {
    10448     // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
   10449     // PUNPCK will catch this in a later shuffle match.
   10450     if (Offset && Scale == 2 && VT.is128BitVector())
   10451       return SDValue();
   10452     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
   10453                                  NumElements / Scale);
   10454     InputV = ShuffleOffset(InputV);
   10455     InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
   10456     return DAG.getBitcast(VT, InputV);
   10457   }
   10458 
   10459   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
   10460 
   10461   // For any extends we can cheat for larger element sizes and use shuffle
   10462   // instructions that can fold with a load and/or copy.
   10463   if (AnyExt && EltBits == 32) {
   10464     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
   10465                          -1};
   10466     return DAG.getBitcast(
   10467         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   10468                         DAG.getBitcast(MVT::v4i32, InputV),
   10469                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   10470   }
   10471   if (AnyExt && EltBits == 16 && Scale > 2) {
   10472     int PSHUFDMask[4] = {Offset / 2, -1,
   10473                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
   10474     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
   10475                          DAG.getBitcast(MVT::v4i32, InputV),
   10476                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
   10477     int PSHUFWMask[4] = {1, -1, -1, -1};
   10478     unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
   10479     return DAG.getBitcast(
   10480         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
   10481                         DAG.getBitcast(MVT::v8i16, InputV),
   10482                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
   10483   }
   10484 
   10485   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
   10486   // to 64-bits.
   10487   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
   10488     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
   10489     assert(VT.is128BitVector() && "Unexpected vector width!");
   10490 
   10491     int LoIdx = Offset * EltBits;
   10492     SDValue Lo = DAG.getBitcast(
   10493         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
   10494                                 DAG.getConstant(EltBits, DL, MVT::i8),
   10495                                 DAG.getConstant(LoIdx, DL, MVT::i8)));
   10496 
   10497     if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
   10498         !SafeOffset(Offset + 1))
   10499       return DAG.getBitcast(VT, Lo);
   10500 
   10501     int HiIdx = (Offset + 1) * EltBits;
   10502     SDValue Hi = DAG.getBitcast(
   10503         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
   10504                                 DAG.getConstant(EltBits, DL, MVT::i8),
   10505                                 DAG.getConstant(HiIdx, DL, MVT::i8)));
   10506     return DAG.getBitcast(VT,
   10507                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
   10508   }
   10509 
   10510   // If this would require more than 2 unpack instructions to expand, use
   10511   // pshufb when available. We can only use more than 2 unpack instructions
   10512   // when zero extending i8 elements which also makes it easier to use pshufb.
   10513   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
   10514     assert(NumElements == 16 && "Unexpected byte vector width!");
   10515     SDValue PSHUFBMask[16];
   10516     for (int i = 0; i < 16; ++i) {
   10517       int Idx = Offset + (i / Scale);
   10518       PSHUFBMask[i] = DAG.getConstant(
   10519           (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
   10520     }
   10521     InputV = DAG.getBitcast(MVT::v16i8, InputV);
   10522     return DAG.getBitcast(
   10523         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
   10524                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
   10525   }
   10526 
   10527   // If we are extending from an offset, ensure we start on a boundary that
   10528   // we can unpack from.
   10529   int AlignToUnpack = Offset % (NumElements / Scale);
   10530   if (AlignToUnpack) {
   10531     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
   10532     for (int i = AlignToUnpack; i < NumElements; ++i)
   10533       ShMask[i - AlignToUnpack] = i;
   10534     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
   10535     Offset -= AlignToUnpack;
   10536   }
   10537 
   10538   // Otherwise emit a sequence of unpacks.
   10539   do {
   10540     unsigned UnpackLoHi = X86ISD::UNPCKL;
   10541     if (Offset >= (NumElements / 2)) {
   10542       UnpackLoHi = X86ISD::UNPCKH;
   10543       Offset -= (NumElements / 2);
   10544     }
   10545 
   10546     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
   10547     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
   10548                          : getZeroVector(InputVT, Subtarget, DAG, DL);
   10549     InputV = DAG.getBitcast(InputVT, InputV);
   10550     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
   10551     Scale /= 2;
   10552     EltBits *= 2;
   10553     NumElements /= 2;
   10554   } while (Scale > 1);
   10555   return DAG.getBitcast(VT, InputV);
   10556 }
   10557 
   10558 /// Try to lower a vector shuffle as a zero extension on any microarch.
   10559 ///
   10560 /// This routine will try to do everything in its power to cleverly lower
   10561 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
   10562 /// check for the profitability of this lowering,  it tries to aggressively
    10563 /// check for the profitability of this lowering; it tries to aggressively
   10564 /// can to emit an efficient lowering. It handles both blends with all-zero
    10565 /// inputs that explicitly zero-extend, and undef lanes (sometimes undef due
    10566 /// to being masked out later).
   10567 ///
   10568 /// The reason we have dedicated lowering for zext-style shuffles is that they
   10569 /// are both incredibly common and often quite performance sensitive.
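          ///
          /// e.g. the v8i16 mask <0, z, 1, z, 2, z, 3, z> matches a scale of 2
          /// and, with SSE4.1, lowers to a single PMOVZXWD.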
   10570 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
   10571     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   10572     const APInt &Zeroable, const X86Subtarget &Subtarget,
   10573     SelectionDAG &DAG) {
   10574   int Bits = VT.getSizeInBits();
   10575   int NumLanes = Bits / 128;
   10576   int NumElements = VT.getVectorNumElements();
   10577   int NumEltsPerLane = NumElements / NumLanes;
   10578   assert(VT.getScalarSizeInBits() <= 32 &&
   10579          "Exceeds 32-bit integer zero extension limit");
   10580   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
   10581 
   10582   // Define a helper function to check a particular ext-scale and lower to it if
   10583   // valid.
   10584   auto Lower = [&](int Scale) -> SDValue {
   10585     SDValue InputV;
   10586     bool AnyExt = true;
   10587     int Offset = 0;
   10588     int Matches = 0;
   10589     for (int i = 0; i < NumElements; ++i) {
   10590       int M = Mask[i];
   10591       if (M < 0)
   10592         continue; // Valid anywhere but doesn't tell us anything.
   10593       if (i % Scale != 0) {
    10594         // Each of the extended elements needs to be zeroable.
   10595         if (!Zeroable[i])
   10596           return SDValue();
   10597 
    10598         // We are no longer in the anyext case.
   10599         AnyExt = false;
   10600         continue;
   10601       }
   10602 
    10603       // The base elements must be consecutive indices into the
   10604       // same input vector.
   10605       SDValue V = M < NumElements ? V1 : V2;
   10606       M = M % NumElements;
   10607       if (!InputV) {
   10608         InputV = V;
   10609         Offset = M - (i / Scale);
   10610       } else if (InputV != V)
   10611         return SDValue(); // Flip-flopping inputs.
   10612 
   10613       // Offset must start in the lowest 128-bit lane or at the start of an
   10614       // upper lane.
   10615       // FIXME: Is it ever worth allowing a negative base offset?
   10616       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
   10617             (Offset % NumEltsPerLane) == 0))
   10618         return SDValue();
   10619 
   10620       // If we are offsetting, all referenced entries must come from the same
   10621       // lane.
   10622       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
   10623         return SDValue();
   10624 
   10625       if ((M % NumElements) != (Offset + (i / Scale)))
   10626         return SDValue(); // Non-consecutive strided elements.
   10627       Matches++;
   10628     }
   10629 
   10630     // If we fail to find an input, we have a zero-shuffle which should always
   10631     // have already been handled.
   10632     // FIXME: Maybe handle this here in case during blending we end up with one?
   10633     if (!InputV)
   10634       return SDValue();
   10635 
    10636     // If we are offsetting, don't extend if we only match a single input; we
   10637     // can always do better by using a basic PSHUF or PUNPCK.
   10638     if (Offset != 0 && Matches < 2)
   10639       return SDValue();
   10640 
   10641     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   10642         DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
   10643   };
   10644 
   10645   // The widest scale possible for extending is to a 64-bit integer.
   10646   assert(Bits % 64 == 0 &&
   10647          "The number of bits in a vector must be divisible by 64 on x86!");
   10648   int NumExtElements = Bits / 64;
   10649 
   10650   // Each iteration, try extending the elements half as much, but into twice as
   10651   // many elements.
   10652   for (; NumExtElements < NumElements; NumExtElements *= 2) {
   10653     assert(NumElements % NumExtElements == 0 &&
   10654            "The input vector size must be divisible by the extended size.");
   10655     if (SDValue V = Lower(NumElements / NumExtElements))
   10656       return V;
   10657   }
   10658 
   10659   // General extends failed, but 128-bit vectors may be able to use MOVQ.
   10660   if (Bits != 128)
   10661     return SDValue();
   10662 
   10663   // Returns one of the source operands if the shuffle can be reduced to a
   10664   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
   10665   auto CanZExtLowHalf = [&]() {
   10666     for (int i = NumElements / 2; i != NumElements; ++i)
   10667       if (!Zeroable[i])
   10668         return SDValue();
   10669     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
   10670       return V1;
   10671     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
   10672       return V2;
   10673     return SDValue();
   10674   };
   10675 
   10676   if (SDValue V = CanZExtLowHalf()) {
   10677     V = DAG.getBitcast(MVT::v2i64, V);
   10678     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
   10679     return DAG.getBitcast(VT, V);
   10680   }
   10681 
   10682   // No viable ext lowering found.
   10683   return SDValue();
   10684 }
   10685 
   10686 /// Try to get a scalar value for a specific element of a vector.
   10687 ///
   10688 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
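          /// For example, if V is a bitcast of (build_vector a, b, c, d) with matching
          /// element sizes, asking for index 2 returns the scalar c (bitcast to the
          /// destination element type).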
   10689 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
   10690                                               SelectionDAG &DAG) {
   10691   MVT VT = V.getSimpleValueType();
   10692   MVT EltVT = VT.getVectorElementType();
   10693   V = peekThroughBitcasts(V);
   10694 
   10695   // If the bitcasts shift the element size, we can't extract an equivalent
   10696   // element from it.
   10697   MVT NewVT = V.getSimpleValueType();
   10698   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
   10699     return SDValue();
   10700 
   10701   if (V.getOpcode() == ISD::BUILD_VECTOR ||
   10702       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
   10703     // Ensure the scalar operand is the same size as the destination.
   10704     // FIXME: Add support for scalar truncation where possible.
   10705     SDValue S = V.getOperand(Idx);
   10706     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
   10707       return DAG.getBitcast(EltVT, S);
   10708   }
   10709 
   10710   return SDValue();
   10711 }
   10712 
   10713 /// Helper to test for a load that can be folded with x86 shuffles.
   10714 ///
   10715 /// This is particularly important because the set of instructions varies
   10716 /// significantly based on whether the operand is a load or not.
   10717 static bool isShuffleFoldableLoad(SDValue V) {
   10718   V = peekThroughBitcasts(V);
   10719   return ISD::isNON_EXTLoad(V.getNode());
   10720 }
   10721 
   10722 /// Try to lower insertion of a single element into a zero vector.
   10723 ///
    10724 /// This is a common pattern for which we have especially efficient lowering
    10725 /// patterns across all subtarget feature sets.
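          /// For example, a v4f32 mask of <4, Z, Z, Z> (Z zeroable) inserts the low
          /// element of V2 into an otherwise zero vector and becomes a single
          /// VZEXT_MOVL (MOVSS-style) node.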
   10726 static SDValue lowerVectorShuffleAsElementInsertion(
   10727     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   10728     const APInt &Zeroable, const X86Subtarget &Subtarget,
   10729     SelectionDAG &DAG) {
   10730   MVT ExtVT = VT;
   10731   MVT EltVT = VT.getVectorElementType();
   10732 
   10733   int V2Index =
   10734       find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
   10735       Mask.begin();
   10736   bool IsV1Zeroable = true;
   10737   for (int i = 0, Size = Mask.size(); i < Size; ++i)
   10738     if (i != V2Index && !Zeroable[i]) {
   10739       IsV1Zeroable = false;
   10740       break;
   10741     }
   10742 
   10743   // Check for a single input from a SCALAR_TO_VECTOR node.
   10744   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
   10745   // all the smarts here sunk into that routine. However, the current
   10746   // lowering of BUILD_VECTOR makes that nearly impossible until the old
   10747   // vector shuffle lowering is dead.
   10748   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
   10749                                                DAG);
   10750   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
   10751     // We need to zext the scalar if it is smaller than an i32.
   10752     V2S = DAG.getBitcast(EltVT, V2S);
   10753     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
   10754       // Using zext to expand a narrow element won't work for non-zero
   10755       // insertions.
   10756       if (!IsV1Zeroable)
   10757         return SDValue();
   10758 
   10759       // Zero-extend directly to i32.
   10760       ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
   10761       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
   10762     }
   10763     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
   10764   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
   10765              EltVT == MVT::i16) {
   10766     // Either not inserting from the low element of the input or the input
   10767     // element size is too small to use VZEXT_MOVL to clear the high bits.
   10768     return SDValue();
   10769   }
   10770 
   10771   if (!IsV1Zeroable) {
   10772     // If V1 can't be treated as a zero vector we have fewer options to lower
   10773     // this. We can't support integer vectors or non-zero targets cheaply, and
   10774     // the V1 elements can't be permuted in any way.
   10775     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
   10776     if (!VT.isFloatingPoint() || V2Index != 0)
   10777       return SDValue();
   10778     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
   10779     V1Mask[V2Index] = -1;
   10780     if (!isNoopShuffleMask(V1Mask))
   10781       return SDValue();
   10782     if (!VT.is128BitVector())
   10783       return SDValue();
   10784 
   10785     // Otherwise, use MOVSD or MOVSS.
   10786     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
   10787            "Only two types of floating point element types to handle!");
   10788     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
   10789                        ExtVT, V1, V2);
   10790   }
   10791 
   10792   // This lowering only works for the low element with floating point vectors.
   10793   if (VT.isFloatingPoint() && V2Index != 0)
   10794     return SDValue();
   10795 
   10796   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
   10797   if (ExtVT != VT)
   10798     V2 = DAG.getBitcast(VT, V2);
   10799 
   10800   if (V2Index != 0) {
   10801     // If we have 4 or fewer lanes we can cheaply shuffle the element into
   10802     // the desired position. Otherwise it is more efficient to do a vector
   10803     // shift left. We know that we can do a vector shift left because all
   10804     // the inputs are zero.
   10805     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
   10806       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
   10807       V2Shuffle[V2Index] = 0;
   10808       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
   10809     } else {
   10810       V2 = DAG.getBitcast(MVT::v16i8, V2);
   10811       V2 = DAG.getNode(
   10812           X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
   10813           DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
   10814       V2 = DAG.getBitcast(VT, V2);
   10815     }
   10816   }
   10817   return V2;
   10818 }
   10819 
   10820 /// Try to lower broadcast of a single - truncated - integer element,
   10821 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
   10822 ///
   10823 /// This assumes we have AVX2.
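          /// For example, broadcasting byte 0 of a v4i32 build_vector becomes a
          /// TRUNCATE of the i32 scalar at index 0 followed by a VBROADCAST
          /// (typically VPBROADCASTB); a non-zero byte offset within that scalar adds
          /// an SRL first.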
   10824 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
   10825                                                   SDValue V0, int BroadcastIdx,
   10826                                                   const X86Subtarget &Subtarget,
   10827                                                   SelectionDAG &DAG) {
   10828   assert(Subtarget.hasAVX2() &&
   10829          "We can only lower integer broadcasts with AVX2!");
   10830 
   10831   EVT EltVT = VT.getVectorElementType();
   10832   EVT V0VT = V0.getValueType();
   10833 
   10834   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
   10835   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
   10836 
   10837   EVT V0EltVT = V0VT.getVectorElementType();
   10838   if (!V0EltVT.isInteger())
   10839     return SDValue();
   10840 
   10841   const unsigned EltSize = EltVT.getSizeInBits();
   10842   const unsigned V0EltSize = V0EltVT.getSizeInBits();
   10843 
   10844   // This is only a truncation if the original element type is larger.
   10845   if (V0EltSize <= EltSize)
   10846     return SDValue();
   10847 
   10848   assert(((V0EltSize % EltSize) == 0) &&
   10849          "Scalar type sizes must all be powers of 2 on x86!");
   10850 
   10851   const unsigned V0Opc = V0.getOpcode();
   10852   const unsigned Scale = V0EltSize / EltSize;
   10853   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
   10854 
   10855   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
   10856       V0Opc != ISD::BUILD_VECTOR)
   10857     return SDValue();
   10858 
   10859   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
   10860 
   10861   // If we're extracting non-least-significant bits, shift so we can truncate.
   10862   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
   10863   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
   10864   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
   10865   if (const int OffsetIdx = BroadcastIdx % Scale)
   10866     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
   10867                          DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
   10868 
   10869   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
   10870                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
   10871 }
   10872 
   10873 /// Try to lower broadcast of a single element.
   10874 ///
   10875 /// For convenience, this code also bundles all of the subtarget feature set
   10876 /// filtering. While a little annoying to re-dispatch on type here, there isn't
   10877 /// a convenient way to factor it out.
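          /// For example, a v8f32 shuffle with the mask <0,0,0,0,0,0,0,0> becomes a
          /// single VBROADCASTSS, either folding a scalar load (AVX) or broadcasting
          /// straight from a register (AVX2).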
   10878 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
   10879                                              SDValue V1, SDValue V2,
   10880                                              ArrayRef<int> Mask,
   10881                                              const X86Subtarget &Subtarget,
   10882                                              SelectionDAG &DAG) {
   10883   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
   10884         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
   10885         (Subtarget.hasAVX2() && VT.isInteger())))
   10886     return SDValue();
   10887 
    10888   // With MOVDDUP (v2f64) we can broadcast from a register or a load; otherwise
    10889   // we can only broadcast from a register when AVX2 is available.
   10890   unsigned NumElts = Mask.size();
   10891   unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
   10892                         ? X86ISD::MOVDDUP
   10893                         : X86ISD::VBROADCAST;
   10894   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
   10895 
   10896   // Check that the mask is a broadcast.
   10897   int BroadcastIdx = -1;
   10898   for (int i = 0; i != (int)NumElts; ++i) {
   10899     SmallVector<int, 8> BroadcastMask(NumElts, i);
   10900     if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
   10901       BroadcastIdx = i;
   10902       break;
   10903     }
   10904   }
   10905 
   10906   if (BroadcastIdx < 0)
   10907     return SDValue();
   10908   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
   10909                                             "a sorted mask where the broadcast "
   10910                                             "comes from V1.");
   10911 
   10912   // Go up the chain of (vector) values to find a scalar load that we can
   10913   // combine with the broadcast.
   10914   SDValue V = V1;
   10915   for (;;) {
   10916     switch (V.getOpcode()) {
   10917     case ISD::BITCAST: {
   10918       // Peek through bitcasts as long as BroadcastIdx can be adjusted.
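                // For example, peeking through a v2i64 bitcast of a v4i32 value
                // doubles BroadcastIdx (64 / 32); the opposite direction halves it
                // when the index is a multiple of the ratio.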
   10919       SDValue VSrc = V.getOperand(0);
   10920       unsigned NumEltBits = V.getScalarValueSizeInBits();
   10921       unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
   10922       if ((NumEltBits % NumSrcBits) == 0)
   10923         BroadcastIdx *= (NumEltBits / NumSrcBits);
   10924       else if ((NumSrcBits % NumEltBits) == 0 &&
   10925                (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
   10926         BroadcastIdx /= (NumSrcBits / NumEltBits);
   10927       else
   10928         break;
   10929       V = VSrc;
   10930       continue;
   10931     }
   10932     case ISD::CONCAT_VECTORS: {
   10933       int OperandSize = Mask.size() / V.getNumOperands();
   10934       V = V.getOperand(BroadcastIdx / OperandSize);
   10935       BroadcastIdx %= OperandSize;
   10936       continue;
   10937     }
   10938     case ISD::INSERT_SUBVECTOR: {
   10939       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
   10940       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
   10941       if (!ConstantIdx)
   10942         break;
   10943 
   10944       int BeginIdx = (int)ConstantIdx->getZExtValue();
   10945       int EndIdx =
   10946           BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
   10947       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
   10948         BroadcastIdx -= BeginIdx;
   10949         V = VInner;
   10950       } else {
   10951         V = VOuter;
   10952       }
   10953       continue;
   10954     }
   10955     }
   10956     break;
   10957   }
   10958 
   10959   // Ensure the source vector and BroadcastIdx are for a suitable type.
   10960   if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
   10961     unsigned NumEltBits = VT.getScalarSizeInBits();
   10962     unsigned NumSrcBits = V.getScalarValueSizeInBits();
   10963     if ((NumSrcBits % NumEltBits) == 0)
   10964       BroadcastIdx *= (NumSrcBits / NumEltBits);
   10965     else if ((NumEltBits % NumSrcBits) == 0 &&
   10966              (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
   10967       BroadcastIdx /= (NumEltBits / NumSrcBits);
   10968     else
   10969       return SDValue();
   10970 
   10971     unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
   10972     MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
   10973     V = DAG.getBitcast(SrcVT, V);
   10974   }
   10975 
   10976   // Check if this is a broadcast of a scalar. We special case lowering
   10977   // for scalars so that we can more effectively fold with loads.
   10978   // First, look through bitcast: if the original value has a larger element
   10979   // type than the shuffle, the broadcast element is in essence truncated.
   10980   // Make that explicit to ease folding.
   10981   if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
   10982     if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
   10983             DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
   10984       return TruncBroadcast;
   10985 
   10986   MVT BroadcastVT = VT;
   10987 
   10988   // Peek through any bitcast (only useful for loads).
   10989   SDValue BC = peekThroughBitcasts(V);
   10990 
   10991   // Also check the simpler case, where we can directly reuse the scalar.
   10992   if (V.getOpcode() == ISD::BUILD_VECTOR ||
   10993       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
   10994     V = V.getOperand(BroadcastIdx);
   10995 
   10996     // If we can't broadcast from a register, check that the input is a load.
   10997     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
   10998       return SDValue();
   10999   } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
   11000     // 32-bit targets need to load i64 as a f64 and then bitcast the result.
   11001     if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
   11002       BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
   11003       Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
   11004                    ? X86ISD::MOVDDUP
   11005                    : Opcode;
   11006     }
   11007 
   11008     // If we are broadcasting a load that is only used by the shuffle
   11009     // then we can reduce the vector load to the broadcasted scalar load.
   11010     LoadSDNode *Ld = cast<LoadSDNode>(BC);
   11011     SDValue BaseAddr = Ld->getOperand(1);
   11012     EVT SVT = BroadcastVT.getScalarType();
   11013     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
   11014     SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
   11015     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
   11016                     DAG.getMachineFunction().getMachineMemOperand(
   11017                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
   11018     DAG.makeEquivalentMemoryOrdering(Ld, V);
   11019   } else if (!BroadcastFromReg) {
   11020     // We can't broadcast from a vector register.
   11021     return SDValue();
   11022   } else if (BroadcastIdx != 0) {
   11023     // We can only broadcast from the zero-element of a vector register,
   11024     // but it can be advantageous to broadcast from the zero-element of a
   11025     // subvector.
   11026     if (!VT.is256BitVector() && !VT.is512BitVector())
   11027       return SDValue();
   11028 
   11029     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
   11030     if (VT == MVT::v4f64 || VT == MVT::v4i64)
   11031       return SDValue();
   11032 
   11033     // Only broadcast the zero-element of a 128-bit subvector.
   11034     unsigned EltSize = VT.getScalarSizeInBits();
   11035     if (((BroadcastIdx * EltSize) % 128) != 0)
   11036       return SDValue();
   11037 
   11038     // The shuffle input might have been a bitcast we looked through; look at
   11039     // the original input vector.  Emit an EXTRACT_SUBVECTOR of that type; we'll
   11040     // later bitcast it to BroadcastVT.
   11041     assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
   11042            "Unexpected vector element size");
   11043     assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
   11044            "Unexpected vector size");
   11045     V = extract128BitVector(V, BroadcastIdx, DAG, DL);
   11046   }
   11047 
   11048   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
   11049     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
   11050                     DAG.getBitcast(MVT::f64, V));
   11051 
   11052   // Bitcast back to the same scalar type as BroadcastVT.
   11053   MVT SrcVT = V.getSimpleValueType();
   11054   if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
   11055     assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
   11056            "Unexpected vector element size");
   11057     if (SrcVT.isVector()) {
   11058       unsigned NumSrcElts = SrcVT.getVectorNumElements();
   11059       SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
   11060     } else {
   11061       SrcVT = BroadcastVT.getScalarType();
   11062     }
   11063     V = DAG.getBitcast(SrcVT, V);
   11064   }
   11065 
   11066   // 32-bit targets need to load i64 as a f64 and then bitcast the result.
   11067   if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
   11068     V = DAG.getBitcast(MVT::f64, V);
   11069     unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
   11070     BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
   11071   }
   11072 
   11073   // We only support broadcasting from 128-bit vectors to minimize the
   11074   // number of patterns we need to deal with in isel. So extract down to
   11075   // 128-bits, removing as many bitcasts as possible.
   11076   if (SrcVT.getSizeInBits() > 128) {
   11077     MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
   11078                                  128 / SrcVT.getScalarSizeInBits());
   11079     V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
   11080     V = DAG.getBitcast(ExtVT, V);
   11081   }
   11082 
   11083   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
   11084 }
   11085 
    11086 // Check whether we can use INSERTPS to perform the shuffle. We only use
   11087 // INSERTPS when the V1 elements are already in the correct locations
   11088 // because otherwise we can just always use two SHUFPS instructions which
   11089 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
   11090 // perform INSERTPS if a single V1 element is out of place and all V2
   11091 // elements are zeroable.
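          // For example, the v4f32 mask <0, 1, 6, 3> (with nothing zeroable) inserts
          // V2[2] into lane 2 of V1, giving an INSERTPS immediate of
          // (2 << 6) | (2 << 4) == 0xA0.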
   11092 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
   11093                                          unsigned &InsertPSMask,
   11094                                          const APInt &Zeroable,
   11095                                          ArrayRef<int> Mask,
   11096                                          SelectionDAG &DAG) {
   11097   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
   11098   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
   11099   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   11100 
   11101   // Attempt to match INSERTPS with one element from VA or VB being
   11102   // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
   11103   // are updated.
   11104   auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
   11105                              ArrayRef<int> CandidateMask) {
   11106     unsigned ZMask = 0;
   11107     int VADstIndex = -1;
   11108     int VBDstIndex = -1;
   11109     bool VAUsedInPlace = false;
   11110 
   11111     for (int i = 0; i < 4; ++i) {
   11112       // Synthesize a zero mask from the zeroable elements (includes undefs).
   11113       if (Zeroable[i]) {
   11114         ZMask |= 1 << i;
   11115         continue;
   11116       }
   11117 
   11118       // Flag if we use any VA inputs in place.
   11119       if (i == CandidateMask[i]) {
   11120         VAUsedInPlace = true;
   11121         continue;
   11122       }
   11123 
   11124       // We can only insert a single non-zeroable element.
   11125       if (VADstIndex >= 0 || VBDstIndex >= 0)
   11126         return false;
   11127 
   11128       if (CandidateMask[i] < 4) {
   11129         // VA input out of place for insertion.
   11130         VADstIndex = i;
   11131       } else {
   11132         // VB input for insertion.
   11133         VBDstIndex = i;
   11134       }
   11135     }
   11136 
   11137     // Don't bother if we have no (non-zeroable) element for insertion.
   11138     if (VADstIndex < 0 && VBDstIndex < 0)
   11139       return false;
   11140 
   11141     // Determine element insertion src/dst indices. The src index is from the
   11142     // start of the inserted vector, not the start of the concatenated vector.
   11143     unsigned VBSrcIndex = 0;
   11144     if (VADstIndex >= 0) {
   11145       // If we have a VA input out of place, we use VA as the V2 element
   11146       // insertion and don't use the original V2 at all.
   11147       VBSrcIndex = CandidateMask[VADstIndex];
   11148       VBDstIndex = VADstIndex;
   11149       VB = VA;
   11150     } else {
   11151       VBSrcIndex = CandidateMask[VBDstIndex] - 4;
   11152     }
   11153 
   11154     // If no V1 inputs are used in place, then the result is created only from
   11155     // the zero mask and the V2 insertion - so remove V1 dependency.
   11156     if (!VAUsedInPlace)
   11157       VA = DAG.getUNDEF(MVT::v4f32);
   11158 
   11159     // Update V1, V2 and InsertPSMask accordingly.
   11160     V1 = VA;
   11161     V2 = VB;
   11162 
   11163     // Insert the V2 element into the desired position.
   11164     InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
   11165     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
   11166     return true;
   11167   };
   11168 
   11169   if (matchAsInsertPS(V1, V2, Mask))
   11170     return true;
   11171 
   11172   // Commute and try again.
   11173   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
   11174   ShuffleVectorSDNode::commuteMask(CommutedMask);
   11175   if (matchAsInsertPS(V2, V1, CommutedMask))
   11176     return true;
   11177 
   11178   return false;
   11179 }
   11180 
   11181 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
   11182                                             SDValue V2, ArrayRef<int> Mask,
   11183                                             const APInt &Zeroable,
   11184                                             SelectionDAG &DAG) {
   11185   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   11186   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   11187 
   11188   // Attempt to match the insertps pattern.
   11189   unsigned InsertPSMask;
   11190   if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
   11191     return SDValue();
   11192 
   11193   // Insert the V2 element into the desired position.
   11194   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
   11195                      DAG.getConstant(InsertPSMask, DL, MVT::i8));
   11196 }
   11197 
   11198 /// Try to lower a shuffle as a permute of the inputs followed by an
   11199 /// UNPCK instruction.
   11200 ///
    11201 /// This specifically targets cases where we end up alternating between
   11202 /// the two inputs, and so can permute them into something that feeds a single
   11203 /// UNPCK instruction. Note that this routine only targets integer vectors
   11204 /// because for floating point vectors we have a generalized SHUFPS lowering
   11205 /// strategy that handles everything that doesn't *exactly* match an unpack,
   11206 /// making this clever lowering unnecessary.
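          /// For example, the v8i16 mask <0, 8, 2, 10, 4, 12, 6, 14> alternates between
          /// the two inputs; permuting elements {0, 2, 4, 6} of each input into its low
          /// half first lets a single PUNPCKLWD produce the result.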
   11207 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
   11208                                                     SDValue V1, SDValue V2,
   11209                                                     ArrayRef<int> Mask,
   11210                                                     SelectionDAG &DAG) {
   11211   assert(!VT.isFloatingPoint() &&
   11212          "This routine only supports integer vectors.");
   11213   assert(VT.is128BitVector() &&
   11214          "This routine only works on 128-bit vectors.");
   11215   assert(!V2.isUndef() &&
   11216          "This routine should only be used when blending two inputs.");
   11217   assert(Mask.size() >= 2 && "Single element masks are invalid.");
   11218 
   11219   int Size = Mask.size();
   11220 
   11221   int NumLoInputs =
   11222       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
   11223   int NumHiInputs =
   11224       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
   11225 
   11226   bool UnpackLo = NumLoInputs >= NumHiInputs;
   11227 
   11228   auto TryUnpack = [&](int ScalarSize, int Scale) {
   11229     SmallVector<int, 16> V1Mask((unsigned)Size, -1);
   11230     SmallVector<int, 16> V2Mask((unsigned)Size, -1);
   11231 
   11232     for (int i = 0; i < Size; ++i) {
   11233       if (Mask[i] < 0)
   11234         continue;
   11235 
   11236       // Each element of the unpack contains Scale elements from this mask.
   11237       int UnpackIdx = i / Scale;
   11238 
   11239       // We only handle the case where V1 feeds the first slots of the unpack.
   11240       // We rely on canonicalization to ensure this is the case.
   11241       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
   11242         return SDValue();
   11243 
   11244       // Setup the mask for this input. The indexing is tricky as we have to
   11245       // handle the unpack stride.
   11246       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
   11247       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
   11248           Mask[i] % Size;
   11249     }
   11250 
   11251     // If we will have to shuffle both inputs to use the unpack, check whether
   11252     // we can just unpack first and shuffle the result. If so, skip this unpack.
   11253     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
   11254         !isNoopShuffleMask(V2Mask))
   11255       return SDValue();
   11256 
   11257     // Shuffle the inputs into place.
   11258     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
   11259     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
   11260 
   11261     // Cast the inputs to the type we will use to unpack them.
   11262     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
   11263     V1 = DAG.getBitcast(UnpackVT, V1);
   11264     V2 = DAG.getBitcast(UnpackVT, V2);
   11265 
   11266     // Unpack the inputs and cast the result back to the desired type.
   11267     return DAG.getBitcast(
   11268         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
   11269                         UnpackVT, V1, V2));
   11270   };
   11271 
   11272   // We try each unpack from the largest to the smallest to try and find one
   11273   // that fits this mask.
   11274   int OrigScalarSize = VT.getScalarSizeInBits();
   11275   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
   11276     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
   11277       return Unpack;
   11278 
   11279   // If none of the unpack-rooted lowerings worked (or were profitable) try an
   11280   // initial unpack.
   11281   if (NumLoInputs == 0 || NumHiInputs == 0) {
   11282     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
   11283            "We have to have *some* inputs!");
   11284     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
   11285 
   11286     // FIXME: We could consider the total complexity of the permute of each
   11287     // possible unpacking. Or at the least we should consider how many
   11288     // half-crossings are created.
   11289     // FIXME: We could consider commuting the unpacks.
   11290 
   11291     SmallVector<int, 32> PermMask((unsigned)Size, -1);
   11292     for (int i = 0; i < Size; ++i) {
   11293       if (Mask[i] < 0)
   11294         continue;
   11295 
   11296       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
   11297 
   11298       PermMask[i] =
   11299           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
   11300     }
   11301     return DAG.getVectorShuffle(
   11302         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
   11303                             DL, VT, V1, V2),
   11304         DAG.getUNDEF(VT), PermMask);
   11305   }
   11306 
   11307   return SDValue();
   11308 }
   11309 
   11310 /// Handle lowering of 2-lane 64-bit floating point shuffles.
   11311 ///
   11312 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
   11313 /// support for floating point shuffles but not integer shuffles. These
   11314 /// instructions will incur a domain crossing penalty on some chips though so
   11315 /// it is better to avoid lowering through this for integer vectors where
   11316 /// possible.
   11317 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11318                                        const APInt &Zeroable,
   11319                                        SDValue V1, SDValue V2,
   11320                                        const X86Subtarget &Subtarget,
   11321                                        SelectionDAG &DAG) {
   11322   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   11323   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
   11324   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   11325 
   11326   if (V2.isUndef()) {
   11327     // Check for being able to broadcast a single element.
   11328     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   11329             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
   11330       return Broadcast;
   11331 
   11332     // Straight shuffle of a single input vector. Simulate this by using the
    11333     // single input as both of the "inputs" to this instruction.
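              // For example, the mask <1, 0> yields a SHUFPD immediate of 0b01, which
              // swaps the two doubles.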
   11334     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
   11335 
   11336     if (Subtarget.hasAVX()) {
   11337       // If we have AVX, we can use VPERMILPS which will allow folding a load
   11338       // into the shuffle.
   11339       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
   11340                          DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   11341     }
   11342 
   11343     return DAG.getNode(
   11344         X86ISD::SHUFP, DL, MVT::v2f64,
   11345         Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
   11346         Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
   11347         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   11348   }
   11349   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
   11350   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
   11351   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
   11352   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
   11353 
   11354   // When loading a scalar and then shuffling it into a vector we can often do
   11355   // the insertion cheaply.
   11356   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   11357           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
   11358     return Insertion;
   11359   // Try inverting the insertion since for v2 masks it is easy to do and we
   11360   // can't reliably sort the mask one way or the other.
   11361   int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
   11362                         Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
   11363   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   11364           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
   11365     return Insertion;
   11366 
   11367   // Try to use one of the special instruction patterns to handle two common
   11368   // blend patterns if a zero-blend above didn't work.
   11369   if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
   11370       isShuffleEquivalent(V1, V2, Mask, {1, 3}))
   11371     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
   11372       // We can either use a special instruction to load over the low double or
   11373       // to move just the low double.
   11374       return DAG.getNode(
   11375           X86ISD::MOVSD, DL, MVT::v2f64, V2,
   11376           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
   11377 
   11378   if (Subtarget.hasSSE41())
   11379     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
   11380                                                   Zeroable, Subtarget, DAG))
   11381       return Blend;
   11382 
   11383   // Use dedicated unpack instructions for masks that match their pattern.
   11384   if (SDValue V =
   11385           lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
   11386     return V;
   11387 
   11388   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   11389   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
   11390                      DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   11391 }
   11392 
   11393 /// Handle lowering of 2-lane 64-bit integer shuffles.
   11394 ///
   11395 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
   11396 /// the integer unit to minimize domain crossing penalties. However, for blends
   11397 /// it falls back to the floating point shuffle operation with appropriate bit
   11398 /// casting.
   11399 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11400                                        const APInt &Zeroable,
   11401                                        SDValue V1, SDValue V2,
   11402                                        const X86Subtarget &Subtarget,
   11403                                        SelectionDAG &DAG) {
   11404   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   11405   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
   11406   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
   11407 
   11408   if (V2.isUndef()) {
   11409     // Check for being able to broadcast a single element.
   11410     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   11411             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
   11412       return Broadcast;
   11413 
   11414     // Straight shuffle of a single input vector. For everything from SSE2
   11415     // onward this has a single fast instruction with no scary immediates.
   11416     // We have to map the mask as it is actually a v4i32 shuffle instruction.
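              // For example, the v2i64 mask <1, 0> becomes the v4i32 PSHUFD mask
              // <2, 3, 0, 1>.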
   11417     V1 = DAG.getBitcast(MVT::v4i32, V1);
   11418     int WidenedMask[4] = {
   11419         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
   11420         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
   11421     return DAG.getBitcast(
   11422         MVT::v2i64,
   11423         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
   11424                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
   11425   }
   11426   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
   11427   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
   11428   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
   11429   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
   11430 
   11431   // Try to use shift instructions.
   11432   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
   11433                                                 Zeroable, Subtarget, DAG))
   11434     return Shift;
   11435 
   11436   // When loading a scalar and then shuffling it into a vector we can often do
   11437   // the insertion cheaply.
   11438   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   11439           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
   11440     return Insertion;
   11441   // Try inverting the insertion since for v2 masks it is easy to do and we
   11442   // can't reliably sort the mask one way or the other.
   11443   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
   11444   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   11445           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
   11446     return Insertion;
   11447 
   11448   // We have different paths for blend lowering, but they all must use the
   11449   // *exact* same predicate.
   11450   bool IsBlendSupported = Subtarget.hasSSE41();
   11451   if (IsBlendSupported)
   11452     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
   11453                                                   Zeroable, Subtarget, DAG))
   11454       return Blend;
   11455 
   11456   // Use dedicated unpack instructions for masks that match their pattern.
   11457   if (SDValue V =
   11458           lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
   11459     return V;
   11460 
   11461   // Try to use byte rotation instructions.
    11462   // It's more profitable for pre-SSSE3 targets to use shuffles/unpacks.
   11463   if (Subtarget.hasSSSE3()) {
   11464     if (Subtarget.hasVLX())
   11465       if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
   11466                                                       Mask, Subtarget, DAG))
   11467         return Rotate;
   11468 
   11469     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   11470             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
   11471       return Rotate;
   11472   }
   11473 
   11474   // If we have direct support for blends, we should lower by decomposing into
   11475   // a permute. That will be faster than the domain cross.
   11476   if (IsBlendSupported)
   11477     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
   11478                                                       Mask, DAG);
   11479 
   11480   // We implement this with SHUFPD which is pretty lame because it will likely
   11481   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   11482   // However, all the alternatives are still more cycles and newer chips don't
   11483   // have this problem. It would be really nice if x86 had better shuffles here.
   11484   V1 = DAG.getBitcast(MVT::v2f64, V1);
   11485   V2 = DAG.getBitcast(MVT::v2f64, V2);
   11486   return DAG.getBitcast(MVT::v2i64,
   11487                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
   11488 }
   11489 
   11490 /// Test whether this can be lowered with a single SHUFPS instruction.
   11491 ///
   11492 /// This is used to disable more specialized lowerings when the shufps lowering
   11493 /// will happen to be efficient.
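          /// For example, <0, 1, 4, 5> can be done with one SHUFPS (each half reads a
          /// single input), while <0, 4, 1, 5> cannot.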
   11494 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
   11495   // This routine only handles 128-bit shufps.
   11496   assert(Mask.size() == 4 && "Unsupported mask size!");
   11497   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
   11498   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
   11499   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
   11500   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
   11501 
   11502   // To lower with a single SHUFPS we need to have the low half and high half
   11503   // each requiring a single input.
   11504   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
   11505     return false;
   11506   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
   11507     return false;
   11508 
   11509   return true;
   11510 }
   11511 
   11512 /// Lower a vector shuffle using the SHUFPS instruction.
   11513 ///
   11514 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
   11515 /// It makes no assumptions about whether this is the *best* lowering, it simply
   11516 /// uses it.
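          /// For example, the mask <0, 4, 2, 6> mixes both inputs in each half, so it
          /// is lowered as one SHUFPS that gathers <V1[0], V1[2], V2[0], V2[2]> and a
          /// second SHUFPS that permutes those elements into place.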
   11517 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
   11518                                             ArrayRef<int> Mask, SDValue V1,
   11519                                             SDValue V2, SelectionDAG &DAG) {
   11520   SDValue LowV = V1, HighV = V2;
   11521   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
   11522 
   11523   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
   11524 
   11525   if (NumV2Elements == 1) {
   11526     int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
   11527 
   11528     // Compute the index adjacent to V2Index and in the same half by toggling
   11529     // the low bit.
   11530     int V2AdjIndex = V2Index ^ 1;
   11531 
   11532     if (Mask[V2AdjIndex] < 0) {
   11533       // Handles all the cases where we have a single V2 element and an undef.
   11534       // This will only ever happen in the high lanes because we commute the
   11535       // vector otherwise.
   11536       if (V2Index < 2)
   11537         std::swap(LowV, HighV);
   11538       NewMask[V2Index] -= 4;
   11539     } else {
   11540       // Handle the case where the V2 element ends up adjacent to a V1 element.
   11541       // To make this work, blend them together as the first step.
   11542       int V1Index = V2AdjIndex;
   11543       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
   11544       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
   11545                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
   11546 
   11547       // Now proceed to reconstruct the final blend as we have the necessary
   11548       // high or low half formed.
   11549       if (V2Index < 2) {
   11550         LowV = V2;
   11551         HighV = V1;
   11552       } else {
   11553         HighV = V2;
   11554       }
   11555       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
   11556       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
   11557     }
   11558   } else if (NumV2Elements == 2) {
   11559     if (Mask[0] < 4 && Mask[1] < 4) {
   11560       // Handle the easy case where we have V1 in the low lanes and V2 in the
   11561       // high lanes.
   11562       NewMask[2] -= 4;
   11563       NewMask[3] -= 4;
   11564     } else if (Mask[2] < 4 && Mask[3] < 4) {
   11565       // We also handle the reversed case because this utility may get called
   11566       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
   11567       // arrange things in the right direction.
   11568       NewMask[0] -= 4;
   11569       NewMask[1] -= 4;
   11570       HighV = V1;
   11571       LowV = V2;
   11572     } else {
   11573       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
   11574       // trying to place elements directly, just blend them and set up the final
   11575       // shuffle to place them.
   11576 
   11577       // The first two blend mask elements are for V1, the second two are for
   11578       // V2.
   11579       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
   11580                           Mask[2] < 4 ? Mask[2] : Mask[3],
   11581                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
   11582                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
   11583       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
   11584                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
   11585 
   11586       // Now we do a normal shuffle of V1 by giving V1 as both operands to
   11587       // a blend.
   11588       LowV = HighV = V1;
   11589       NewMask[0] = Mask[0] < 4 ? 0 : 2;
   11590       NewMask[1] = Mask[0] < 4 ? 2 : 0;
   11591       NewMask[2] = Mask[2] < 4 ? 1 : 3;
   11592       NewMask[3] = Mask[2] < 4 ? 3 : 1;
   11593     }
   11594   }
   11595   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
   11596                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
   11597 }
   11598 
   11599 /// Lower 4-lane 32-bit floating point shuffles.
   11600 ///
   11601 /// Uses instructions exclusively from the floating point unit to minimize
   11602 /// domain crossing penalties, as these are sufficient to implement all v4f32
   11603 /// shuffles.
   11604 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11605                                        const APInt &Zeroable,
   11606                                        SDValue V1, SDValue V2,
   11607                                        const X86Subtarget &Subtarget,
   11608                                        SelectionDAG &DAG) {
   11609   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   11610   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   11611   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   11612 
   11613   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
   11614 
   11615   if (NumV2Elements == 0) {
   11616     // Check for being able to broadcast a single element.
   11617     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   11618             DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
   11619       return Broadcast;
   11620 
   11621     // Use even/odd duplicate instructions for masks that match their pattern.
   11622     if (Subtarget.hasSSE3()) {
   11623       if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
   11624         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
   11625       if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
   11626         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
   11627     }
   11628 
   11629     if (Subtarget.hasAVX()) {
   11630       // If we have AVX, we can use VPERMILPS which will allow folding a load
   11631       // into the shuffle.
   11632       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
   11633                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   11634     }
   11635 
   11636     // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
   11637     // in SSE1 because otherwise they are widened to v2f64 and never get here.
   11638     if (!Subtarget.hasSSE2()) {
   11639       if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
   11640         return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
   11641       if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
   11642         return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
   11643     }
   11644 
   11645     // Otherwise, use a straight shuffle of a single input vector. We pass the
   11646     // input vector to both operands to simulate this with a SHUFPS.
   11647     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
   11648                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   11649   }
   11650 
   11651   // There are special ways we can lower some single-element blends. However, we
   11652   // have custom ways we can lower more complex single-element blends below that
   11653   // we defer to if both this and BLENDPS fail to match, so restrict this to
   11654   // when the V2 input is targeting element 0 of the mask -- that is the fast
   11655   // case here.
   11656   if (NumV2Elements == 1 && Mask[0] >= 4)
   11657     if (SDValue V = lowerVectorShuffleAsElementInsertion(
   11658             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
   11659       return V;
   11660 
   11661   if (Subtarget.hasSSE41()) {
   11662     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
   11663                                                   Zeroable, Subtarget, DAG))
   11664       return Blend;
   11665 
   11666     // Use INSERTPS if we can complete the shuffle efficiently.
   11667     if (SDValue V =
   11668             lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
   11669       return V;
   11670 
   11671     if (!isSingleSHUFPSMask(Mask))
   11672       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
   11673               DL, MVT::v4f32, V1, V2, Mask, DAG))
   11674         return BlendPerm;
   11675   }
   11676 
   11677   // Use low/high mov instructions. These are only valid in SSE1 because
   11678   // otherwise they are widened to v2f64 and never get here.
   11679   if (!Subtarget.hasSSE2()) {
   11680     if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
   11681       return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
   11682     if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
   11683       return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
   11684   }
   11685 
   11686   // Use dedicated unpack instructions for masks that match their pattern.
   11687   if (SDValue V =
   11688           lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
   11689     return V;
   11690 
   11691   // Otherwise fall back to a SHUFPS lowering strategy.
   11692   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
   11693 }
   11694 
   11695 /// Lower 4-lane i32 vector shuffles.
   11696 ///
   11697 /// We try to handle these with integer-domain shuffles where we can, but for
   11698 /// blends we use the floating point domain blend instructions.
   11699 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   11700                                        const APInt &Zeroable,
   11701                                        SDValue V1, SDValue V2,
   11702                                        const X86Subtarget &Subtarget,
   11703                                        SelectionDAG &DAG) {
   11704   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   11705   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
   11706   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   11707 
   11708   // Whenever we can lower this as a zext, that instruction is strictly faster
   11709   // than any alternative. It also allows us to fold memory operands into the
   11710   // shuffle in many cases.
   11711   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   11712           DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
   11713     return ZExt;
   11714 
   11715   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
   11716 
   11717   if (NumV2Elements == 0) {
   11718     // Check for being able to broadcast a single element.
   11719     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   11720             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
   11721       return Broadcast;
   11722 
   11723     // Straight shuffle of a single input vector. For everything from SSE2
   11724     // onward this has a single fast instruction with no scary immediates.
   11725     // We coerce the shuffle pattern to be compatible with UNPCK instructions
   11726     // but we aren't actually going to use the UNPCK instruction because doing
   11727     // so prevents folding a load into this instruction or making a copy.
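             // For example (illustrative): {0, 0, 1, 1} matches UNPCKLDQ's pattern,
             // but we still emit PSHUFD (immediate 0x50 here) so that a load of V1
             // can be folded into it.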
   11728     const int UnpackLoMask[] = {0, 0, 1, 1};
   11729     const int UnpackHiMask[] = {2, 2, 3, 3};
   11730     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
   11731       Mask = UnpackLoMask;
   11732     else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
   11733       Mask = UnpackHiMask;
   11734 
   11735     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
   11736                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   11737   }
   11738 
   11739   // Try to use shift instructions.
   11740   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
   11741                                                 Zeroable, Subtarget, DAG))
   11742     return Shift;
   11743 
   11744   // There are special ways we can lower some single-element blends.
   11745   if (NumV2Elements == 1)
   11746     if (SDValue V = lowerVectorShuffleAsElementInsertion(
   11747             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
   11748       return V;
   11749 
   11750   // We have different paths for blend lowering, but they all must use the
   11751   // *exact* same predicate.
   11752   bool IsBlendSupported = Subtarget.hasSSE41();
   11753   if (IsBlendSupported)
   11754     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
   11755                                                   Zeroable, Subtarget, DAG))
   11756       return Blend;
   11757 
   11758   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
   11759                                                    Zeroable, DAG))
   11760     return Masked;
   11761 
   11762   // Use dedicated unpack instructions for masks that match their pattern.
   11763   if (SDValue V =
   11764           lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
   11765     return V;
   11766 
   11767   // Try to use byte rotation instructions.
   11768   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
   11769   if (Subtarget.hasSSSE3()) {
   11770     if (Subtarget.hasVLX())
   11771       if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
   11772                                                       Mask, Subtarget, DAG))
   11773         return Rotate;
   11774 
   11775     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   11776             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
   11777       return Rotate;
   11778   }
   11779 
   11780   // Assume that a single SHUFPS is faster than an alternative sequence of
   11781   // multiple instructions (even if the CPU has a domain penalty).
   11782   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
   11783   if (!isSingleSHUFPSMask(Mask)) {
   11784     // If we have direct support for blends, we should lower by decomposing into
   11785     // a permute. That will be faster than the domain cross.
   11786     if (IsBlendSupported)
   11787       return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
   11788                                                         Mask, DAG);
   11789 
   11790     // Try to lower by permuting the inputs into an unpack instruction.
   11791     if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
   11792             DL, MVT::v4i32, V1, V2, Mask, DAG))
   11793       return Unpack;
   11794   }
   11795 
   11796   // We implement this with SHUFPS because it can blend from two vectors.
   11797   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
   11798   // up the inputs, bypassing domain shift penalties that we would incur if we
   11799   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
   11800   // relevant.
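           // For example (illustrative): Mask = {0, 2, 4, 6} is a single-SHUFPS mask
           // and would be expected to come back out of the lowering below as one
           // SHUFPS of the two bitcast inputs.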
   11801   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
   11802   SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
   11803   SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
   11804   return DAG.getBitcast(MVT::v4i32, ShufPS);
   11805 }
   11806 
   11807 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
   11808 /// shuffle lowering, and the most complex part.
   11809 ///
   11810 /// The lowering strategy is to try to form pairs of input lanes which are
   11811 /// targeted at the same half of the final vector, and then use a dword shuffle
   11812 /// to place them onto the right half, and finally unpack the paired lanes into
   11813 /// their final position.
   11814 ///
   11815 /// The exact breakdown of how to form these dword pairs and align them on the
   11816 /// correct sides is really tricky. See the comments within the function for
   11817 /// more of the details.
   11818 ///
   11819 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
   11820 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
   11821 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
   11822 /// vector, form the analogous 128-bit 8-element Mask.
   11823 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
   11824     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
   11825     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   11826   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
   11827   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
   11828 
   11829   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
   11830   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
   11831   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
   11832 
   11833   // Attempt to directly match PSHUFLW or PSHUFHW.
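           // For example (illustrative): {2, 1, 0, 3, 4, 5, 6, 7} leaves the high
           // half in place and matches PSHUFLW, while {0, 1, 2, 3, 7, 6, 5, 4} leaves
           // the low half in place and matches PSHUFHW (with the high indices rebased
           // to 0..3).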
   11834   if (isUndefOrInRange(LoMask, 0, 4) &&
   11835       isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
   11836     return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
   11837                        getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
   11838   }
   11839   if (isUndefOrInRange(HiMask, 4, 8) &&
   11840       isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
   11841     for (int i = 0; i != 4; ++i)
   11842       HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
   11843     return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
   11844                        getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
   11845   }
   11846 
   11847   SmallVector<int, 4> LoInputs;
   11848   copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
   11849   array_pod_sort(LoInputs.begin(), LoInputs.end());
   11850   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
   11851   SmallVector<int, 4> HiInputs;
   11852   copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
   11853   array_pod_sort(HiInputs.begin(), HiInputs.end());
   11854   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
   11855   int NumLToL =
   11856       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
   11857   int NumHToL = LoInputs.size() - NumLToL;
   11858   int NumLToH =
   11859       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
   11860   int NumHToH = HiInputs.size() - NumLToH;
   11861   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
   11862   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
   11863   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   11864   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
   11865 
   11866   // If we are shuffling values from one half, check how many different DWORD
   11867   // pairs we need to create. If only 1 or 2 then we can perform this as a
   11868   // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
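           // For example (illustrative): Mask = {0, 1, 0, 3, 0, 1, 0, 3} needs only
           // the dword pairs (0,1) and (0,3), so PSHUFLW {0, 1, 0, 3} followed by
           // PSHUFD {0, 1, 0, 1} suffices.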
   11869   auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
   11870                                ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
   11871     V = DAG.getNode(ShufWOp, DL, VT, V,
   11872                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
   11873     V = DAG.getBitcast(PSHUFDVT, V);
   11874     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
   11875                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
   11876     return DAG.getBitcast(VT, V);
   11877   };
   11878 
   11879   if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
   11880     int PSHUFDMask[4] = { -1, -1, -1, -1 };
   11881     SmallVector<std::pair<int, int>, 4> DWordPairs;
   11882     int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
   11883 
   11884     // Collect the different DWORD pairs.
   11885     for (int DWord = 0; DWord != 4; ++DWord) {
   11886       int M0 = Mask[2 * DWord + 0];
   11887       int M1 = Mask[2 * DWord + 1];
   11888       M0 = (M0 >= 0 ? M0 % 4 : M0);
   11889       M1 = (M1 >= 0 ? M1 % 4 : M1);
   11890       if (M0 < 0 && M1 < 0)
   11891         continue;
   11892 
   11893       bool Match = false;
   11894       for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
   11895         auto &DWordPair = DWordPairs[j];
   11896         if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
   11897             (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
   11898           DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
   11899           DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
   11900           PSHUFDMask[DWord] = DOffset + j;
   11901           Match = true;
   11902           break;
   11903         }
   11904       }
   11905       if (!Match) {
   11906         PSHUFDMask[DWord] = DOffset + DWordPairs.size();
   11907         DWordPairs.push_back(std::make_pair(M0, M1));
   11908       }
   11909     }
   11910 
   11911     if (DWordPairs.size() <= 2) {
   11912       DWordPairs.resize(2, std::make_pair(-1, -1));
   11913       int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
   11914                               DWordPairs[1].first, DWordPairs[1].second};
   11915       if ((NumHToL + NumHToH) == 0)
   11916         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
   11917       if ((NumLToL + NumLToH) == 0)
   11918         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
   11919     }
   11920   }
   11921 
   11922   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
   11923   // such inputs we can swap two of the dwords across the half mark and end up
   11924   // with <=2 inputs to each half from each half. Once there, we can fall
   11925   // through to the generic code below. For example:
   11926   //
   11927   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   11928   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
   11929   //
   11930   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
   11931   // and an existing 2-into-2 on the other half. In this case we may have to
   11932   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
   11933   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
   11934   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
   11935   // because any other situation (including a 3-into-1 or 1-into-3 in the other
   11936   // half than the one we target for fixing) will be fixed when we re-enter this
   11937   // path. We will also combine away any sequence of PSHUFD instructions that
   11938   // result into a single instruction. Here is an example of the tricky case:
   11939   //
   11940   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   11941   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
   11942   //
   11943   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
   11944   //
   11945   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
   11946   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
   11947   //
   11948   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
   11949   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
   11950   //
   11951   // The result is fine to be handled by the generic logic.
   11952   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
   11953                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
   11954                           int AOffset, int BOffset) {
   11955     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
   11956            "Must call this with A having 3 or 1 inputs from the A half.");
   11957     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
   11958            "Must call this with B having 1 or 3 inputs from the B half.");
   11959     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
   11960            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
   11961 
   11962     bool ThreeAInputs = AToAInputs.size() == 3;
   11963 
   11964     // Compute the index of the dword that holds only one word among the
   11965     // three inputs in its half by taking the sum of the half with three
   11966     // inputs and subtracting the sum of the actual three inputs. The
   11967     // difference is the remaining slot.
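             // For example (illustrative): with AOffset == 0 and
             // AToAInputs == {0, 1, 3}, TripleInputSum is 6 and the inputs sum to 4,
             // so the remaining slot is word 2, i.e. dword 1.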
   11968     int ADWord, BDWord;
   11969     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
   11970     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
   11971     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
   11972     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
   11973     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
   11974     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
   11975     int TripleNonInputIdx =
   11976         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
   11977     TripleDWord = TripleNonInputIdx / 2;
   11978 
   11979     // We use xor with one to compute the adjacent DWord to whichever one the
   11980     // OneInput is in.
   11981     OneInputDWord = (OneInput / 2) ^ 1;
   11982 
   11983     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
   11984     // and BToA inputs. If there is also such a problem with the BToB and AToB
   11985     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
   11986     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
   11987     // is essential that we don't *create* a 3<-1 as then we might oscillate.
   11988     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
   11989       // Compute how many inputs will be flipped by swapping these DWords. We
   11990       // need to balance this to ensure we don't form a 3-1 shuffle in the
   11991       // other half.
   11993       int NumFlippedAToBInputs =
   11994           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
   11995           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
   11996       int NumFlippedBToBInputs =
   11997           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
   11998           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
   11999       if ((NumFlippedAToBInputs == 1 &&
   12000            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
   12001           (NumFlippedBToBInputs == 1 &&
   12002            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
   12003         // We choose whether to fix the A half or B half based on whether that
   12004         // half has zero flipped inputs. At zero, we may not be able to fix it
   12005         // with that half. We also bias towards fixing the B half because that
   12006         // will more commonly be the high half, and we have to bias one way.
   12007         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
   12008                                                        ArrayRef<int> Inputs) {
   12009           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
   12010           bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
   12011           // Determine whether the free index is in the flipped dword or the
   12012           // unflipped dword based on where the pinned index is. We use this bit
   12013           // in an xor to conditionally select the adjacent dword.
   12014           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
   12015           bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
   12016           if (IsFixIdxInput == IsFixFreeIdxInput)
   12017             FixFreeIdx += 1;
   12018           IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
   12019           assert(IsFixIdxInput != IsFixFreeIdxInput &&
   12020                  "We need to be changing the number of flipped inputs!");
   12021           int PSHUFHalfMask[] = {0, 1, 2, 3};
   12022           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
   12023           V = DAG.getNode(
   12024               FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
   12025               MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
   12026               getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
   12027 
   12028           for (int &M : Mask)
   12029             if (M >= 0 && M == FixIdx)
   12030               M = FixFreeIdx;
   12031             else if (M >= 0 && M == FixFreeIdx)
   12032               M = FixIdx;
   12033         };
   12034         if (NumFlippedBToBInputs != 0) {
   12035           int BPinnedIdx =
   12036               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
   12037           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
   12038         } else {
   12039           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
   12040           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
   12041           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
   12042         }
   12043       }
   12044     }
   12045 
   12046     int PSHUFDMask[] = {0, 1, 2, 3};
   12047     PSHUFDMask[ADWord] = BDWord;
   12048     PSHUFDMask[BDWord] = ADWord;
   12049     V = DAG.getBitcast(
   12050         VT,
   12051         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
   12052                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   12053 
   12054     // Adjust the mask to match the new locations of A and B.
   12055     for (int &M : Mask)
   12056       if (M >= 0 && M/2 == ADWord)
   12057         M = 2 * BDWord + M % 2;
   12058       else if (M >= 0 && M/2 == BDWord)
   12059         M = 2 * ADWord + M % 2;
   12060 
   12061     // Recurse back into this routine to re-compute state now that this isn't
   12062     // a 3 and 1 problem.
   12063     return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
   12064                                                      DAG);
   12065   };
   12066   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
   12067     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
   12068   if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
   12069     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
   12070 
   12071   // At this point there are at most two inputs to the low and high halves from
   12072   // each half. That means the inputs can always be grouped into dwords and
   12073   // those dwords can then be moved to the correct half with a dword shuffle.
   12074   // We use at most one low and one high word shuffle to collect these paired
   12075   // inputs into dwords, and finally a dword shuffle to place them.
   12076   int PSHUFLMask[4] = {-1, -1, -1, -1};
   12077   int PSHUFHMask[4] = {-1, -1, -1, -1};
   12078   int PSHUFDMask[4] = {-1, -1, -1, -1};
   12079 
   12080   // First fix the masks for all the inputs that are staying in their
   12081   // original halves. This will then dictate the targets of the cross-half
   12082   // shuffles.
   12083   auto fixInPlaceInputs =
   12084       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
   12085                     MutableArrayRef<int> SourceHalfMask,
   12086                     MutableArrayRef<int> HalfMask, int HalfOffset) {
   12087     if (InPlaceInputs.empty())
   12088       return;
   12089     if (InPlaceInputs.size() == 1) {
   12090       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   12091           InPlaceInputs[0] - HalfOffset;
   12092       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
   12093       return;
   12094     }
   12095     if (IncomingInputs.empty()) {
   12096       // Just fix all of the in-place inputs.
   12097       for (int Input : InPlaceInputs) {
   12098         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
   12099         PSHUFDMask[Input / 2] = Input / 2;
   12100       }
   12101       return;
   12102     }
   12103 
   12104     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
   12105     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
   12106         InPlaceInputs[0] - HalfOffset;
   12107     // Put the second input next to the first so that they are packed into
   12108     // a dword. We find the adjacent index by toggling the low bit.
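             // For example (illustrative): if InPlaceInputs == {1, 3} and
             // HalfOffset == 0, AdjIndex is 0, so input 3 is moved into slot 0 next
             // to input 1 and the pair occupies dword 0.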
   12109     int AdjIndex = InPlaceInputs[0] ^ 1;
   12110     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
   12111     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
   12112     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
   12113   };
   12114   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
   12115   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
   12116 
   12117   // Now gather the cross-half inputs and place them into a free dword of
   12118   // their target half.
   12119   // FIXME: This operation could almost certainly be simplified dramatically to
   12120   // look more like the 3-1 fixing operation.
   12121   auto moveInputsToRightHalf = [&PSHUFDMask](
   12122       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
   12123       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
   12124       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
   12125       int DestOffset) {
   12126     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
   12127       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
   12128     };
   12129     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
   12130                                                int Word) {
   12131       int LowWord = Word & ~1;
   12132       int HighWord = Word | 1;
   12133       return isWordClobbered(SourceHalfMask, LowWord) ||
   12134              isWordClobbered(SourceHalfMask, HighWord);
   12135     };
   12136 
   12137     if (IncomingInputs.empty())
   12138       return;
   12139 
   12140     if (ExistingInputs.empty()) {
   12141       // Map any dwords with inputs from them into the right half.
   12142       for (int Input : IncomingInputs) {
   12143         // If the source half mask maps over the inputs, turn those into
   12144         // swaps and use the swapped lane.
   12145         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
   12146           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
   12147             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
   12148                 Input - SourceOffset;
   12149             // We have to swap the uses in our half mask in one sweep.
   12150             for (int &M : HalfMask)
   12151               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
   12152                 M = Input;
   12153               else if (M == Input)
   12154                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   12155           } else {
   12156             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
   12157                        Input - SourceOffset &&
   12158                    "Previous placement doesn't match!");
   12159           }
   12160           // Note that this correctly re-maps both when we do a swap and when
   12161           // we observe the other side of the swap above. We rely on that to
   12162           // avoid swapping the members of the input list directly.
   12163           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
   12164         }
   12165 
   12166         // Map the input's dword into the correct half.
   12167         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
   12168           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
   12169         else
   12170           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
   12171                      Input / 2 &&
   12172                  "Previous placement doesn't match!");
   12173       }
   12174 
   12175       // And just directly shift any other-half mask elements to be same-half
   12176       // as we will have mirrored the dword containing the element into the
   12177       // same position within that half.
   12178       for (int &M : HalfMask)
   12179         if (M >= SourceOffset && M < SourceOffset + 4) {
   12180           M = M - SourceOffset + DestOffset;
   12181           assert(M >= 0 && "This should never wrap below zero!");
   12182         }
   12183       return;
   12184     }
   12185 
   12186     // Ensure we have the input in a viable dword of its current half. This
   12187     // is particularly tricky because the original position may be clobbered
   12188     // by inputs being moved and *staying* in that half.
   12189     if (IncomingInputs.size() == 1) {
   12190       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   12191         int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
   12192                          SourceOffset;
   12193         SourceHalfMask[InputFixed - SourceOffset] =
   12194             IncomingInputs[0] - SourceOffset;
   12195         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
   12196                      InputFixed);
   12197         IncomingInputs[0] = InputFixed;
   12198       }
   12199     } else if (IncomingInputs.size() == 2) {
   12200       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
   12201           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
   12202         // We have two non-adjacent or clobbered inputs we need to extract from
   12203         // the source half. To do this, we need to map them into some adjacent
   12204         // dword slot in the source mask.
   12205         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
   12206                               IncomingInputs[1] - SourceOffset};
   12207 
   12208         // If there is a free slot in the source half mask adjacent to one of
   12209         // the inputs, place the other input in it. We use (Index XOR 1) to
   12210         // compute an adjacent index.
   12211         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
   12212             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
   12213           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
   12214           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
   12215           InputsFixed[1] = InputsFixed[0] ^ 1;
   12216         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
   12217                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
   12218           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
   12219           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
   12220           InputsFixed[0] = InputsFixed[1] ^ 1;
   12221         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
   12222                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
   12223           // The two inputs are in the same DWord but it is clobbered and the
   12224           // adjacent DWord isn't used at all. Move both inputs to the free
   12225           // slot.
   12226           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
   12227           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
   12228           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
   12229           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
   12230         } else {
   12231           // The only way we hit this point is if there is no clobbering
   12232           // (because there are no off-half inputs to this half) and there is no
   12233           // free slot adjacent to one of the inputs. In this case, we have to
   12234           // swap an input with a non-input.
   12235           for (int i = 0; i < 4; ++i)
   12236             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
   12237                    "We can't handle any clobbers here!");
   12238           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
   12239                  "Cannot have adjacent inputs here!");
   12240 
   12241           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
   12242           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
   12243 
   12244           // We also have to update the final source mask in this case because
   12245           // it may need to undo the above swap.
   12246           for (int &M : FinalSourceHalfMask)
   12247             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
   12248               M = InputsFixed[1] + SourceOffset;
   12249             else if (M == InputsFixed[1] + SourceOffset)
   12250               M = (InputsFixed[0] ^ 1) + SourceOffset;
   12251 
   12252           InputsFixed[1] = InputsFixed[0] ^ 1;
   12253         }
   12254 
   12255         // Point everything at the fixed inputs.
   12256         for (int &M : HalfMask)
   12257           if (M == IncomingInputs[0])
   12258             M = InputsFixed[0] + SourceOffset;
   12259           else if (M == IncomingInputs[1])
   12260             M = InputsFixed[1] + SourceOffset;
   12261 
   12262         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
   12263         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
   12264       }
   12265     } else {
   12266       llvm_unreachable("Unhandled input size!");
   12267     }
   12268 
   12269     // Now hoist the DWord down to the right half.
   12270     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
   12271     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
   12272     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
   12273     for (int &M : HalfMask)
   12274       for (int Input : IncomingInputs)
   12275         if (M == Input)
   12276           M = FreeDWord * 2 + Input % 2;
   12277   };
   12278   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
   12279                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
   12280   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
   12281                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
   12282 
   12283   // Now enact all the shuffles we've computed to move the inputs into their
   12284   // target half.
   12285   if (!isNoopShuffleMask(PSHUFLMask))
   12286     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
   12287                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
   12288   if (!isNoopShuffleMask(PSHUFHMask))
   12289     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
   12290                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
   12291   if (!isNoopShuffleMask(PSHUFDMask))
   12292     V = DAG.getBitcast(
   12293         VT,
   12294         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
   12295                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   12296 
   12297   // At this point, each half should contain all its inputs, and we can then
   12298   // just shuffle them into their final position.
   12299   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
   12300          "Failed to lift all the high half inputs to the low mask!");
   12301   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
   12302          "Failed to lift all the low half inputs to the high mask!");
   12303 
   12304   // Do a half shuffle for the low mask.
   12305   if (!isNoopShuffleMask(LoMask))
   12306     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
   12307                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
   12308 
   12309   // Do a half shuffle with the high mask after shifting its values down.
   12310   for (int &M : HiMask)
   12311     if (M >= 0)
   12312       M -= 4;
   12313   if (!isNoopShuffleMask(HiMask))
   12314     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
   12315                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
   12316 
   12317   return V;
   12318 }
   12319 
   12320 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
   12321 /// blend if only one input is used.
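         /// For example (illustrative): for a v8i16 mask {0, 9, 2, 11, 4, 13, 6, 15},
         /// V1 supplies the even words and V2 the odd words; each input is shuffled
         /// with PSHUFB (zeroing the other's lanes) and the two results are ORed.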
   12322 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
   12323     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   12324     const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
   12325     bool &V2InUse) {
   12326   SDValue V1Mask[16];
   12327   SDValue V2Mask[16];
   12328   V1InUse = false;
   12329   V2InUse = false;
   12330 
   12331   int Size = Mask.size();
   12332   int Scale = 16 / Size;
   12333   for (int i = 0; i < 16; ++i) {
   12334     if (Mask[i / Scale] < 0) {
   12335       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
   12336     } else {
   12337       const int ZeroMask = 0x80;
   12338       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
   12339                                           : ZeroMask;
   12340       int V2Idx = Mask[i / Scale] < Size
   12341                       ? ZeroMask
   12342                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
   12343       if (Zeroable[i / Scale])
   12344         V1Idx = V2Idx = ZeroMask;
   12345       V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
   12346       V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
   12347       V1InUse |= (ZeroMask != V1Idx);
   12348       V2InUse |= (ZeroMask != V2Idx);
   12349     }
   12350   }
   12351 
   12352   if (V1InUse)
   12353     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
   12354                      DAG.getBitcast(MVT::v16i8, V1),
   12355                      DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
   12356   if (V2InUse)
   12357     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
   12358                      DAG.getBitcast(MVT::v16i8, V2),
   12359                      DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
   12360 
   12361   // If we need shuffled inputs from both, blend the two.
   12362   SDValue V;
   12363   if (V1InUse && V2InUse)
   12364     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
   12365   else
   12366     V = V1InUse ? V1 : V2;
   12367 
   12368   // Cast the result back to the correct type.
   12369   return DAG.getBitcast(VT, V);
   12370 }
   12371 
   12372 /// Generic lowering of 8-lane i16 shuffles.
   12373 ///
   12374 /// This handles both single-input shuffles and combined shuffle/blends with
   12375 /// two inputs. The single input shuffles are immediately delegated to
   12376 /// a dedicated lowering routine.
   12377 ///
   12378 /// The blends are lowered in one of three fundamental ways. If there are few
   12379 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
   12380 /// of the input is significantly cheaper when lowered as an interleaving of
   12381 /// the two inputs, try to interleave them. Otherwise, blend the low and high
   12382 /// halves of the inputs separately (making them have relatively few inputs)
   12383 /// and then concatenate them.
   12384 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   12385                                        const APInt &Zeroable,
   12386                                        SDValue V1, SDValue V2,
   12387                                        const X86Subtarget &Subtarget,
   12388                                        SelectionDAG &DAG) {
   12389   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   12390   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
   12391   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   12392 
   12393   // Whenever we can lower this as a zext, that instruction is strictly faster
   12394   // than any alternative.
   12395   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   12396           DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
   12397     return ZExt;
   12398 
   12399   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
   12400 
   12401   if (NumV2Inputs == 0) {
   12402     // Check for being able to broadcast a single element.
   12403     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   12404             DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
   12405       return Broadcast;
   12406 
   12407     // Try to use shift instructions.
   12408     if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
   12409                                                   Zeroable, Subtarget, DAG))
   12410       return Shift;
   12411 
   12412     // Use dedicated unpack instructions for masks that match their pattern.
   12413     if (SDValue V =
   12414             lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
   12415       return V;
   12416 
   12417     // Use dedicated pack instructions for masks that match their pattern.
   12418     if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
   12419                                                DAG, Subtarget))
   12420       return V;
   12421 
   12422     // Try to use byte rotation instructions.
   12423     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
   12424                                                         Mask, Subtarget, DAG))
   12425       return Rotate;
   12426 
   12427     // Make a copy of the mask so it can be modified.
   12428     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
   12429     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
   12430                                                      MutableMask, Subtarget,
   12431                                                      DAG);
   12432   }
   12433 
   12434   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
   12435          "All single-input shuffles should be canonicalized to be V1-input "
   12436          "shuffles.");
   12437 
   12438   // Try to use shift instructions.
   12439   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
   12440                                                 Zeroable, Subtarget, DAG))
   12441     return Shift;
   12442 
   12443   // See if we can use SSE4A Extraction / Insertion.
   12444   if (Subtarget.hasSSE4A())
   12445     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
   12446                                                 Zeroable, DAG))
   12447       return V;
   12448 
   12449   // There are special ways we can lower some single-element blends.
   12450   if (NumV2Inputs == 1)
   12451     if (SDValue V = lowerVectorShuffleAsElementInsertion(
   12452             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
   12453       return V;
   12454 
   12455   // We have different paths for blend lowering, but they all must use the
   12456   // *exact* same predicate.
   12457   bool IsBlendSupported = Subtarget.hasSSE41();
   12458   if (IsBlendSupported)
   12459     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
   12460                                                   Zeroable, Subtarget, DAG))
   12461       return Blend;
   12462 
   12463   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
   12464                                                    Zeroable, DAG))
   12465     return Masked;
   12466 
   12467   // Use dedicated unpack instructions for masks that match their pattern.
   12468   if (SDValue V =
   12469           lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
   12470     return V;
   12471 
   12472   // Use dedicated pack instructions for masks that match their pattern.
   12473   if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
   12474                                              Subtarget))
   12475     return V;
   12476 
   12477   // Try to use byte rotation instructions.
   12478   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   12479           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
   12480     return Rotate;
   12481 
   12482   if (SDValue BitBlend =
   12483           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
   12484     return BitBlend;
   12485 
   12486   // Try to lower by permuting the inputs into an unpack instruction.
   12487   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
   12488                                                             V2, Mask, DAG))
   12489     return Unpack;
   12490 
   12491   // If we can't directly blend but can use PSHUFB, that will be better as it
   12492   // can both shuffle and set up the inefficient blend.
   12493   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
   12494     bool V1InUse, V2InUse;
   12495     return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
   12496                                               Zeroable, DAG, V1InUse, V2InUse);
   12497   }
   12498 
   12499   // We can always bit-blend if we have to, so the fallback strategy is to
   12500   // decompose into single-input permutes and blends.
   12501   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
   12502                                                     Mask, DAG);
   12503 }
   12504 
   12505 /// Check whether a compaction lowering can be done by dropping even
   12506 /// elements and compute how many times even elements must be dropped.
   12507 ///
   12508 /// This handles shuffles which take every Nth element where N is a power of
   12509 /// two. Example shuffle masks:
   12510 ///
   12511 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
   12512 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
   12513 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
   12514 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
   12515 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
   12516 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
   12517 ///
   12518 /// Any of these lanes can of course be undef.
   12519 ///
   12520 /// This routine only supports N <= 3.
   12521 /// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
   12522 /// for larger N.
   12523 ///
   12524 /// \returns N above, or the number of times even elements must be dropped if
   12525 /// there is such a number. Otherwise returns zero.
   12526 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
   12527                                           bool IsSingleInput) {
   12528   // The modulus for the shuffle vector entries is based on whether this is
   12529   // a single input or not.
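           // For example (illustrative): a two-input v16i8 shuffle has a modulus of
           // 32, so the check below becomes Mask[i] == ((i << N) & 31).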
   12530   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
   12531   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
   12532          "We should only be called with masks with a power-of-2 size!");
   12533 
   12534   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
   12535 
   12536   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
   12537   // and 2^3 simultaneously. This is because we may have ambiguity with
   12538   // partially undef inputs.
   12539   bool ViableForN[3] = {true, true, true};
   12540 
   12541   for (int i = 0, e = Mask.size(); i < e; ++i) {
   12542     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
   12543     // want.
   12544     if (Mask[i] < 0)
   12545       continue;
   12546 
   12547     bool IsAnyViable = false;
   12548     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
   12549       if (ViableForN[j]) {
   12550         uint64_t N = j + 1;
   12551 
   12552         // The shuffle mask must be equal to (i * 2^N) % M.
   12553         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
   12554           IsAnyViable = true;
   12555         else
   12556           ViableForN[j] = false;
   12557       }
   12558     // Early exit if we exhaust the possible powers of two.
   12559     if (!IsAnyViable)
   12560       break;
   12561   }
   12562 
   12563   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
   12564     if (ViableForN[j])
   12565       return j + 1;
   12566 
   12567   // Return 0 as there is no viable power of two.
   12568   return 0;
   12569 }
   12570 
   12571 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
   12572                                            ArrayRef<int> Mask, SDValue V1,
   12573                                            SDValue V2, SelectionDAG &DAG) {
   12574   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
   12575   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
   12576 
   12577   SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
   12578   if (V2.isUndef())
   12579     return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
   12580 
   12581   return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
   12582 }
   12583 
   12584 /// Generic lowering of v16i8 shuffles.
   12585 ///
   12586 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
   12587 /// detect any complexity reducing interleaving. If that doesn't help, it uses
   12588 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
   12589 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
   12590 /// back together.
   12591 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   12592                                        const APInt &Zeroable,
   12593                                        SDValue V1, SDValue V2,
   12594                                        const X86Subtarget &Subtarget,
   12595                                        SelectionDAG &DAG) {
   12596   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   12597   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
   12598   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   12599 
   12600   // Try to use shift instructions.
   12601   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
   12602                                                 Zeroable, Subtarget, DAG))
   12603     return Shift;
   12604 
   12605   // Try to use byte rotation instructions.
   12606   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   12607           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
   12608     return Rotate;
   12609 
   12610   // Use dedicated pack instructions for masks that match their pattern.
   12611   if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
   12612                                              Subtarget))
   12613     return V;
   12614 
   12615   // Try to use a zext lowering.
   12616   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   12617           DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
   12618     return ZExt;
   12619 
   12620   // See if we can use SSE4A Extraction / Insertion.
   12621   if (Subtarget.hasSSE4A())
   12622     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
   12623                                                 Zeroable, DAG))
   12624       return V;
   12625 
   12626   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
   12627 
   12628   // For single-input shuffles, there are some nicer lowering tricks we can use.
   12629   if (NumV2Elements == 0) {
   12630     // Check for being able to broadcast a single element.
   12631     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   12632             DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
   12633       return Broadcast;
   12634 
   12635     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
   12636     // Notably, this handles splat and partial-splat shuffles more efficiently.
   12637     // However, it only makes sense if the pre-duplication shuffle simplifies
   12638     // things significantly. Currently, this means we need to be able to
   12639     // express the pre-duplication shuffle as an i16 shuffle.
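             // For example (illustrative): for a full splat of byte 3, the
             // pre-duplication i16 shuffle just keeps word 1 (which holds byte 3),
             // the unpack duplicates that byte into both bytes of a word, and the
             // rest is a v8i16 splat of that word.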
   12640     //
   12641     // FIXME: We should check for other patterns which can be widened into an
   12642     // i16 shuffle as well.
   12643     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
   12644       for (int i = 0; i < 16; i += 2)
   12645         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
   12646           return false;
   12647 
   12648       return true;
   12649     };
   12650     auto tryToWidenViaDuplication = [&]() -> SDValue {
   12651       if (!canWidenViaDuplication(Mask))
   12652         return SDValue();
   12653       SmallVector<int, 4> LoInputs;
   12654       copy_if(Mask, std::back_inserter(LoInputs),
   12655               [](int M) { return M >= 0 && M < 8; });
   12656       array_pod_sort(LoInputs.begin(), LoInputs.end());
   12657       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
   12658                      LoInputs.end());
   12659       SmallVector<int, 4> HiInputs;
   12660       copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
   12661       array_pod_sort(HiInputs.begin(), HiInputs.end());
   12662       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
   12663                      HiInputs.end());
   12664 
   12665       bool TargetLo = LoInputs.size() >= HiInputs.size();
   12666       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
   12667       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
   12668 
   12669       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
   12670       SmallDenseMap<int, int, 8> LaneMap;
   12671       for (int I : InPlaceInputs) {
   12672         PreDupI16Shuffle[I/2] = I/2;
   12673         LaneMap[I] = I;
   12674       }
   12675       int j = TargetLo ? 0 : 4, je = j + 4;
   12676       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
   12677         // Check whether slot j already maps this input. This happens when
   12678         // there are two adjacent bytes after we move the low one.
   12679         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
   12680           // If we haven't yet mapped the input, search for a slot into which
   12681           // we can map it.
   12682           while (j < je && PreDupI16Shuffle[j] >= 0)
   12683             ++j;
   12684 
   12685           if (j == je)
   12686             // We can't place the inputs into a single half with a simple
   12687             // i16 shuffle, so bail.
   12687             return SDValue();
   12688 
   12689           // Map this input with the i16 shuffle.
   12690           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
   12691         }
   12692 
   12693         // Update the lane map based on the mapping we ended up with.
   12694         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
   12695       }
   12696       V1 = DAG.getBitcast(
   12697           MVT::v16i8,
   12698           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
   12699                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
   12700 
   12701       // Unpack the bytes to form the i16s that will be shuffled into place.
   12702       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
   12703                        MVT::v16i8, V1, V1);
   12704 
   12705       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
   12706       for (int i = 0; i < 16; ++i)
   12707         if (Mask[i] >= 0) {
   12708           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
   12709           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
   12710           if (PostDupI16Shuffle[i / 2] < 0)
   12711             PostDupI16Shuffle[i / 2] = MappedMask;
   12712           else
   12713             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
   12714                    "Conflicting entries in the original shuffle!");
   12715         }
   12716       return DAG.getBitcast(
   12717           MVT::v16i8,
   12718           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
   12719                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
   12720     };
   12721     if (SDValue V = tryToWidenViaDuplication())
   12722       return V;
   12723   }
   12724 
   12725   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
   12726                                                    Zeroable, DAG))
   12727     return Masked;
   12728 
   12729   // Use dedicated unpack instructions for masks that match their pattern.
   12730   if (SDValue V =
   12731           lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
   12732     return V;
   12733 
   12734   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
   12735   // with PSHUFB. It is important to do this before we attempt to generate any
   12736   // blends but after all of the single-input lowerings. If the single input
   12737   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
   12738   // want to preserve that and we can DAG combine any longer sequences into
   12739   // a PSHUFB in the end. But once we start blending from multiple inputs,
   12740   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
   12741   // and there are *very* few patterns that would actually be faster than the
   12742   // PSHUFB approach because of its ability to zero lanes.
   12743   //
   12744   // FIXME: The only exceptions to the above are blends which are exact
   12745   // interleavings with direct instructions supporting them. We currently don't
   12746   // handle those well here.
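            // For a sense of the cost tradeoff above: a fully general two-input
            // v16i8 shuffle lowered here is typically two PSHUFBs (one per input,
            // with unwanted lanes zeroed via the control byte's sign bit) plus a
            // single OR/blend, whereas a shuffle that only mixes one input with
            // zeros can be a single PSHUFB.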
   12747   if (Subtarget.hasSSSE3()) {
   12748     bool V1InUse = false;
   12749     bool V2InUse = false;
   12750 
   12751     SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
   12752         DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
   12753 
   12754     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
   12755     // do so. This avoids using them to handle blends-with-zero which is
   12756     // important as a single pshufb is significantly faster for that.
   12757     if (V1InUse && V2InUse) {
   12758       if (Subtarget.hasSSE41())
   12759         if (SDValue Blend = lowerVectorShuffleAsBlend(
   12760                 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
   12761           return Blend;
   12762 
   12763       // We can use an unpack to do the blending rather than an or in some
    12764       // cases. Even though the or may be (marginally) more efficient, we
    12765       // prefer this lowering because there are common cases where part of
   12766       // the complexity of the shuffles goes away when we do the final blend as
   12767       // an unpack.
   12768       // FIXME: It might be worth trying to detect if the unpack-feeding
   12769       // shuffles will both be pshufb, in which case we shouldn't bother with
   12770       // this.
   12771       if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
   12772               DL, MVT::v16i8, V1, V2, Mask, DAG))
   12773         return Unpack;
   12774 
   12775       // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
   12776       if (Subtarget.hasVBMI() && Subtarget.hasVLX())
   12777         return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
   12778     }
   12779 
   12780     return PSHUFB;
   12781   }
   12782 
   12783   // There are special ways we can lower some single-element blends.
   12784   if (NumV2Elements == 1)
   12785     if (SDValue V = lowerVectorShuffleAsElementInsertion(
   12786             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
   12787       return V;
   12788 
   12789   if (SDValue BitBlend =
   12790           lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
   12791     return BitBlend;
   12792 
   12793   // Check whether a compaction lowering can be done. This handles shuffles
   12794   // which take every Nth element for some even N. See the helper function for
   12795   // details.
   12796   //
   12797   // We special case these as they can be particularly efficiently handled with
    12798   // the PACKUSWB instruction on x86 and they show up in common patterns of
   12799   // rearranging bytes to truncate wide elements.
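            // e.g. the two-input mask <0, 2, 4, ..., 14, 16, 18, ..., 30> (every
            // second byte of V1 followed by every second byte of V2) has
            // NumEvenDrops == 1 below: mask each i16 with 0x00FF and emit one
            // PACKUS. Larger power-of-two strides use a wider clear mask and
            // simply repeat the PACKUS step.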
   12800   bool IsSingleInput = V2.isUndef();
   12801   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
   12802     // NumEvenDrops is the power of two stride of the elements. Another way of
   12803     // thinking about it is that we need to drop the even elements this many
   12804     // times to get the original input.
   12805 
   12806     // First we need to zero all the dropped bytes.
   12807     assert(NumEvenDrops <= 3 &&
   12808            "No support for dropping even elements more than 3 times.");
   12809     // We use the mask type to pick which bytes are preserved based on how many
   12810     // elements are dropped.
   12811     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
   12812     SDValue ByteClearMask = DAG.getBitcast(
   12813         MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
   12814     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
   12815     if (!IsSingleInput)
   12816       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
   12817 
   12818     // Now pack things back together.
   12819     V1 = DAG.getBitcast(MVT::v8i16, V1);
   12820     V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
   12821     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
   12822     for (int i = 1; i < NumEvenDrops; ++i) {
   12823       Result = DAG.getBitcast(MVT::v8i16, Result);
   12824       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
   12825     }
   12826 
   12827     return Result;
   12828   }
   12829 
   12830   // Handle multi-input cases by blending single-input shuffles.
   12831   if (NumV2Elements > 0)
   12832     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
   12833                                                       Mask, DAG);
   12834 
   12835   // The fallback path for single-input shuffles widens this into two v8i16
   12836   // vectors with unpacks, shuffles those, and then pulls them back together
   12837   // with a pack.
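            // e.g. for a single-input mask that reverses the bytes within each
            // half (<7,6,...,0, 15,14,...,8>), the bytes are zero-extended to
            // i16s, each v8i16 half is reversed with one shuffle, and a single
            // PACKUS narrows the two shuffled halves back to v16i8.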
   12838   SDValue V = V1;
   12839 
   12840   std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
   12841   std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
   12842   for (int i = 0; i < 16; ++i)
   12843     if (Mask[i] >= 0)
   12844       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
   12845 
   12846   SDValue VLoHalf, VHiHalf;
   12847   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
   12848   // them out and avoid using UNPCK{L,H} to extract the elements of V as
   12849   // i16s.
   12850   if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
   12851       none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
   12852     // Use a mask to drop the high bytes.
   12853     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
   12854     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
   12855                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
   12856 
   12857     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
   12858     VHiHalf = DAG.getUNDEF(MVT::v8i16);
   12859 
   12860     // Squash the masks to point directly into VLoHalf.
   12861     for (int &M : LoBlendMask)
   12862       if (M >= 0)
   12863         M /= 2;
   12864     for (int &M : HiBlendMask)
   12865       if (M >= 0)
   12866         M /= 2;
   12867   } else {
   12868     // Otherwise just unpack the low half of V into VLoHalf and the high half into
   12869     // VHiHalf so that we can blend them as i16s.
   12870     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
   12871 
   12872     VLoHalf = DAG.getBitcast(
   12873         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
   12874     VHiHalf = DAG.getBitcast(
   12875         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
   12876   }
   12877 
   12878   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
   12879   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
   12880 
   12881   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
   12882 }
   12883 
   12884 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
   12885 ///
   12886 /// This routine breaks down the specific type of 128-bit shuffle and
   12887 /// dispatches to the lowering routines accordingly.
   12888 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   12889                                         MVT VT, SDValue V1, SDValue V2,
   12890                                         const APInt &Zeroable,
   12891                                         const X86Subtarget &Subtarget,
   12892                                         SelectionDAG &DAG) {
   12893   switch (VT.SimpleTy) {
   12894   case MVT::v2i64:
   12895     return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   12896   case MVT::v2f64:
   12897     return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   12898   case MVT::v4i32:
   12899     return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   12900   case MVT::v4f32:
   12901     return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   12902   case MVT::v8i16:
   12903     return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   12904   case MVT::v16i8:
   12905     return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   12906 
   12907   default:
   12908     llvm_unreachable("Unimplemented!");
   12909   }
   12910 }
   12911 
   12912 /// Generic routine to split vector shuffle into half-sized shuffles.
   12913 ///
   12914 /// This routine just extracts two subvectors, shuffles them independently, and
   12915 /// then concatenates them back together. This should work effectively with all
   12916 /// AVX vector shuffle types.
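          /// e.g. splitting a v4i64 shuffle with mask <0, 4, 3, 7> yields a low-half
          /// v2i64 shuffle producing <V1[0], V2[0]> and a high-half shuffle producing
          /// <V1[3], V2[3]>, which are then rejoined with a CONCAT_VECTORS.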
   12917 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
   12918                                           SDValue V2, ArrayRef<int> Mask,
   12919                                           SelectionDAG &DAG) {
   12920   assert(VT.getSizeInBits() >= 256 &&
   12921          "Only for 256-bit or wider vector shuffles!");
   12922   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
   12923   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
   12924 
   12925   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
   12926   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
   12927 
   12928   int NumElements = VT.getVectorNumElements();
   12929   int SplitNumElements = NumElements / 2;
   12930   MVT ScalarVT = VT.getVectorElementType();
   12931   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
   12932 
   12933   // Rather than splitting build-vectors, just build two narrower build
   12934   // vectors. This helps shuffling with splats and zeros.
   12935   auto SplitVector = [&](SDValue V) {
   12936     V = peekThroughBitcasts(V);
   12937 
   12938     MVT OrigVT = V.getSimpleValueType();
   12939     int OrigNumElements = OrigVT.getVectorNumElements();
   12940     int OrigSplitNumElements = OrigNumElements / 2;
   12941     MVT OrigScalarVT = OrigVT.getVectorElementType();
   12942     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
   12943 
   12944     SDValue LoV, HiV;
   12945 
   12946     auto *BV = dyn_cast<BuildVectorSDNode>(V);
   12947     if (!BV) {
   12948       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
   12949                         DAG.getIntPtrConstant(0, DL));
   12950       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
   12951                         DAG.getIntPtrConstant(OrigSplitNumElements, DL));
   12952     } else {
   12954       SmallVector<SDValue, 16> LoOps, HiOps;
   12955       for (int i = 0; i < OrigSplitNumElements; ++i) {
   12956         LoOps.push_back(BV->getOperand(i));
   12957         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
   12958       }
   12959       LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
   12960       HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
   12961     }
   12962     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
   12963                           DAG.getBitcast(SplitVT, HiV));
   12964   };
   12965 
   12966   SDValue LoV1, HiV1, LoV2, HiV2;
   12967   std::tie(LoV1, HiV1) = SplitVector(V1);
   12968   std::tie(LoV2, HiV2) = SplitVector(V2);
   12969 
   12970   // Now create two 4-way blends of these half-width vectors.
   12971   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
   12972     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
   12973     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
   12974     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
   12975     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
   12976     for (int i = 0; i < SplitNumElements; ++i) {
   12977       int M = HalfMask[i];
   12978       if (M >= NumElements) {
   12979         if (M >= NumElements + SplitNumElements)
   12980           UseHiV2 = true;
   12981         else
   12982           UseLoV2 = true;
   12983         V2BlendMask[i] = M - NumElements;
   12984         BlendMask[i] = SplitNumElements + i;
   12985       } else if (M >= 0) {
   12986         if (M >= SplitNumElements)
   12987           UseHiV1 = true;
   12988         else
   12989           UseLoV1 = true;
   12990         V1BlendMask[i] = M;
   12991         BlendMask[i] = i;
   12992       }
   12993     }
   12994 
   12995     // Because the lowering happens after all combining takes place, we need to
   12996     // manually combine these blend masks as much as possible so that we create
   12997     // a minimal number of high-level vector shuffle nodes.
   12998 
   12999     // First try just blending the halves of V1 or V2.
   13000     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
   13001       return DAG.getUNDEF(SplitVT);
   13002     if (!UseLoV2 && !UseHiV2)
   13003       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
   13004     if (!UseLoV1 && !UseHiV1)
   13005       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
   13006 
   13007     SDValue V1Blend, V2Blend;
   13008     if (UseLoV1 && UseHiV1) {
   13009       V1Blend =
   13010         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
   13011     } else {
   13012       // We only use half of V1 so map the usage down into the final blend mask.
   13013       V1Blend = UseLoV1 ? LoV1 : HiV1;
   13014       for (int i = 0; i < SplitNumElements; ++i)
   13015         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
   13016           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
   13017     }
   13018     if (UseLoV2 && UseHiV2) {
   13019       V2Blend =
   13020         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
   13021     } else {
   13022       // We only use half of V2 so map the usage down into the final blend mask.
   13023       V2Blend = UseLoV2 ? LoV2 : HiV2;
   13024       for (int i = 0; i < SplitNumElements; ++i)
   13025         if (BlendMask[i] >= SplitNumElements)
   13026           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
   13027     }
   13028     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
   13029   };
   13030   SDValue Lo = HalfBlend(LoMask);
   13031   SDValue Hi = HalfBlend(HiMask);
   13032   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
   13033 }
   13034 
   13035 /// Either split a vector in halves or decompose the shuffles and the
   13036 /// blend.
   13037 ///
   13038 /// This is provided as a good fallback for many lowerings of non-single-input
   13039 /// shuffles with more than one 128-bit lane. In those cases, we want to select
   13040 /// between splitting the shuffle into 128-bit components and stitching those
   13041 /// back together vs. extracting the single-input shuffles and blending those
   13042 /// results.
   13043 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
   13044                                                 SDValue V1, SDValue V2,
   13045                                                 ArrayRef<int> Mask,
   13046                                                 SelectionDAG &DAG) {
   13047   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
   13048          "shuffles as it could then recurse on itself.");
   13049   int Size = Mask.size();
   13050 
   13051   // If this can be modeled as a broadcast of two elements followed by a blend,
   13052   // prefer that lowering. This is especially important because broadcasts can
   13053   // often fold with memory operands.
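            // e.g. a v8f32 mask <0, 8, 0, 8, 0, 8, 0, 8> only ever references
            // V1[0] and V2[0], so it decomposes into two broadcasts and a blend.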
   13054   auto DoBothBroadcast = [&] {
   13055     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
   13056     for (int M : Mask)
   13057       if (M >= Size) {
   13058         if (V2BroadcastIdx < 0)
   13059           V2BroadcastIdx = M - Size;
   13060         else if (M - Size != V2BroadcastIdx)
   13061           return false;
   13062       } else if (M >= 0) {
   13063         if (V1BroadcastIdx < 0)
   13064           V1BroadcastIdx = M;
   13065         else if (M != V1BroadcastIdx)
   13066           return false;
   13067       }
   13068     return true;
   13069   };
   13070   if (DoBothBroadcast())
   13071     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
   13072                                                       DAG);
   13073 
   13074   // If the inputs all stem from a single 128-bit lane of each input, then we
   13075   // split them rather than blending because the split will decompose to
   13076   // unusually few instructions.
   13077   int LaneCount = VT.getSizeInBits() / 128;
   13078   int LaneSize = Size / LaneCount;
   13079   SmallBitVector LaneInputs[2];
   13080   LaneInputs[0].resize(LaneCount, false);
   13081   LaneInputs[1].resize(LaneCount, false);
   13082   for (int i = 0; i < Size; ++i)
   13083     if (Mask[i] >= 0)
   13084       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
   13085   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
   13086     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   13087 
   13088   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
   13089   // that the decomposed single-input shuffles don't end up here.
   13090   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
   13091 }
   13092 
   13093 /// Lower a vector shuffle crossing multiple 128-bit lanes as
   13094 /// a permutation and blend of those lanes.
   13095 ///
   13096 /// This essentially blends the out-of-lane inputs to each lane into the lane
   13097 /// from a permuted copy of the vector. This lowering strategy results in four
   13098 /// instructions in the worst case for a single-input cross lane shuffle which
   13099 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
   13100 /// of. Special cases for each particular shuffle pattern should be handled
   13101 /// prior to trying this lowering.
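          /// e.g. for a single-input v4f64 shuffle with mask <2, 1, 0, 3>, the vector
          /// is flipped with a 128-bit lane swap to <V[2], V[3], V[0], V[1]> and the
          /// out-of-lane elements are then blended back in-lane using the mask
          /// <4, 1, 6, 3> over the original and flipped vectors.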
   13102 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
   13103                                                        SDValue V1, SDValue V2,
   13104                                                        ArrayRef<int> Mask,
   13105                                                        SelectionDAG &DAG,
   13106                                                        const X86Subtarget &Subtarget) {
   13107   // FIXME: This should probably be generalized for 512-bit vectors as well.
   13108   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
   13109   int Size = Mask.size();
   13110   int LaneSize = Size / 2;
   13111 
   13112   // If there are only inputs from one 128-bit lane, splitting will in fact be
   13113   // less expensive. The flags track whether the given lane contains an element
   13114   // that crosses to another lane.
   13115   if (!Subtarget.hasAVX2()) {
   13116     bool LaneCrossing[2] = {false, false};
   13117     for (int i = 0; i < Size; ++i)
   13118       if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
   13119         LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
   13120     if (!LaneCrossing[0] || !LaneCrossing[1])
   13121       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   13122   } else {
   13123     bool LaneUsed[2] = {false, false};
   13124     for (int i = 0; i < Size; ++i)
   13125       if (Mask[i] >= 0)
   13126         LaneUsed[(Mask[i] / LaneSize)] = true;
   13127     if (!LaneUsed[0] || !LaneUsed[1])
   13128       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   13129   }
   13130 
   13131   assert(V2.isUndef() &&
    13132          "The last part of this routine only works on single-input shuffles");
   13133 
   13134   SmallVector<int, 32> FlippedBlendMask(Size);
   13135   for (int i = 0; i < Size; ++i)
   13136     FlippedBlendMask[i] =
   13137         Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
   13138                                 ? Mask[i]
   13139                                 : Mask[i] % LaneSize +
   13140                                       (i / LaneSize) * LaneSize + Size);
   13141 
   13142   // Flip the vector, and blend the results which should now be in-lane.
   13143   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
   13144   SDValue Flipped = DAG.getBitcast(PVT, V1);
   13145   Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
   13146                                  { 2, 3, 0, 1 });
   13147   Flipped = DAG.getBitcast(VT, Flipped);
   13148   return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
   13149 }
   13150 
   13151 /// Handle lowering 2-lane 128-bit shuffles.
   13152 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
   13153                                         SDValue V2, ArrayRef<int> Mask,
   13154                                         const APInt &Zeroable,
   13155                                         const X86Subtarget &Subtarget,
   13156                                         SelectionDAG &DAG) {
   13157   // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
   13158   if (Subtarget.hasAVX2() && V2.isUndef())
   13159     return SDValue();
   13160 
   13161   SmallVector<int, 4> WidenedMask;
   13162   if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
   13163     return SDValue();
   13164 
   13165   bool IsLowZero = (Zeroable & 0x3) == 0x3;
   13166   bool IsHighZero = (Zeroable & 0xc) == 0xc;
   13167 
   13168   // Try to use an insert into a zero vector.
   13169   if (WidenedMask[0] == 0 && IsHighZero) {
   13170     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
   13171     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
   13172                               DAG.getIntPtrConstant(0, DL));
   13173     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
   13174                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
   13175                        DAG.getIntPtrConstant(0, DL));
   13176   }
   13177 
   13178   // TODO: If minimizing size and one of the inputs is a zero vector and the
    13179   // zero vector has only one use, we could use a VPERM2X128 to save the
   13180   // instruction bytes needed to explicitly generate the zero vector.
   13181 
   13182   // Blends are faster and handle all the non-lane-crossing cases.
   13183   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
   13184                                                 Zeroable, Subtarget, DAG))
   13185     return Blend;
   13186 
   13187   // If either input operand is a zero vector, use VPERM2X128 because its mask
   13188   // allows us to replace the zero input with an implicit zero.
   13189   if (!IsLowZero && !IsHighZero) {
   13190     // Check for patterns which can be matched with a single insert of a 128-bit
   13191     // subvector.
   13192     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
   13193     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
   13195       // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
   13196       // this will likely become vinsertf128 which can't fold a 256-bit memop.
   13197       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
   13198         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
   13199         SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
   13200                                      OnlyUsesV1 ? V1 : V2,
   13201                                      DAG.getIntPtrConstant(0, DL));
   13202         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
   13203                            DAG.getIntPtrConstant(2, DL));
   13204       }
   13205     }
   13206 
   13207     // Try to use SHUF128 if possible.
   13208     if (Subtarget.hasVLX()) {
   13209       if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
   13210         unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
   13211                             ((WidenedMask[1] % 2) << 1);
    13212         return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
    13213                            DAG.getConstant(PermMask, DL, MVT::i8));
   13214       }
   13215     }
   13216   }
   13217 
   13218   // Otherwise form a 128-bit permutation. After accounting for undefs,
   13219   // convert the 64-bit shuffle mask selection values into 128-bit
   13220   // selection bits by dividing the indexes by 2 and shifting into positions
   13221   // defined by a vperm2*128 instruction's immediate control byte.
   13222 
   13223   // The immediate permute control byte looks like this:
   13224   //    [1:0] - select 128 bits from sources for low half of destination
   13225   //    [2]   - ignore
   13226   //    [3]   - zero low half of destination
   13227   //    [5:4] - select 128 bits from sources for high half of destination
   13228   //    [6]   - ignore
   13229   //    [7]   - zero high half of destination
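            // e.g. a widened mask of <1, 2> (high half of V1, then low half of
            // V2) yields PermMask == 0x21: bits [1:0] == 1 select V1's high 128
            // bits and bits [5:4] == 2 select V2's low 128 bits.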
   13230 
   13231   assert((WidenedMask[0] >= 0 || IsLowZero) &&
   13232          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
   13233 
   13234   unsigned PermMask = 0;
   13235   PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
   13236   PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
   13237 
   13238   // Check the immediate mask and replace unused sources with undef.
   13239   if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
   13240     V1 = DAG.getUNDEF(VT);
   13241   if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
   13242     V2 = DAG.getUNDEF(VT);
   13243 
   13244   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
   13245                      DAG.getConstant(PermMask, DL, MVT::i8));
   13246 }
   13247 
   13248 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
   13249 /// shuffling each lane.
   13250 ///
   13251 /// This will only succeed when the result of fixing the 128-bit lanes results
   13252 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
   13253 /// each 128-bit lanes. This handles many cases where we can quickly blend away
   13254 /// the lane crosses early and then use simpler shuffles within each lane.
   13255 ///
   13256 /// FIXME: It might be worthwhile at some point to support this without
   13257 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
   13258 /// in x86 only floating point has interesting non-repeating shuffles, and even
   13259 /// those are still *marginally* more expensive.
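          /// e.g. a v8f32 shuffle with mask <1, 0, 3, 2, 9, 8, 11, 10> is first
          /// lane-fixed by shuffling the operands as v4f64 with mask <0, 1, 4, 5>
          /// (lane 0 from V1, lane 1 from V2), after which the repeated in-lane mask
          /// <1, 0, 3, 2> is applied to that result.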
   13260 static SDValue lowerVectorShuffleByMerging128BitLanes(
   13261     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   13262     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   13263   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
   13264 
   13265   int Size = Mask.size();
   13266   int LaneSize = 128 / VT.getScalarSizeInBits();
   13267   int NumLanes = Size / LaneSize;
   13268   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
   13269 
   13270   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
   13271   // check whether the in-128-bit lane shuffles share a repeating pattern.
   13272   SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
   13273   SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
   13274   for (int i = 0; i < Size; ++i) {
   13275     if (Mask[i] < 0)
   13276       continue;
   13277 
   13278     int j = i / LaneSize;
   13279 
   13280     if (Lanes[j] < 0) {
   13281       // First entry we've seen for this lane.
   13282       Lanes[j] = Mask[i] / LaneSize;
   13283     } else if (Lanes[j] != Mask[i] / LaneSize) {
   13284       // This doesn't match the lane selected previously!
   13285       return SDValue();
   13286     }
   13287 
   13288     // Check that within each lane we have a consistent shuffle mask.
   13289     int k = i % LaneSize;
   13290     if (InLaneMask[k] < 0) {
   13291       InLaneMask[k] = Mask[i] % LaneSize;
   13292     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
   13293       // This doesn't fit a repeating in-lane mask.
   13294       return SDValue();
   13295     }
   13296   }
   13297 
   13298   // First shuffle the lanes into place.
   13299   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
   13300                                 VT.getSizeInBits() / 64);
   13301   SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
   13302   for (int i = 0; i < NumLanes; ++i)
   13303     if (Lanes[i] >= 0) {
   13304       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
   13305       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
   13306     }
   13307 
   13308   V1 = DAG.getBitcast(LaneVT, V1);
   13309   V2 = DAG.getBitcast(LaneVT, V2);
   13310   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
   13311 
   13312   // Cast it back to the type we actually want.
   13313   LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
   13314 
   13315   // Now do a simple shuffle that isn't lane crossing.
   13316   SmallVector<int, 8> NewMask((unsigned)Size, -1);
   13317   for (int i = 0; i < Size; ++i)
   13318     if (Mask[i] >= 0)
   13319       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
   13320   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
   13321          "Must not introduce lane crosses at this point!");
   13322 
   13323   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
   13324 }
   13325 
   13326 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
   13327 /// This allows for fast cases such as subvector extraction/insertion
   13328 /// or shuffling smaller vector types which can lower more efficiently.
   13329 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
   13330                                                SDValue V1, SDValue V2,
   13331                                                ArrayRef<int> Mask,
   13332                                                const X86Subtarget &Subtarget,
   13333                                                SelectionDAG &DAG) {
   13334   assert((VT.is256BitVector() || VT.is512BitVector()) &&
   13335          "Expected 256-bit or 512-bit vector");
   13336 
   13337   unsigned NumElts = VT.getVectorNumElements();
   13338   unsigned HalfNumElts = NumElts / 2;
   13339   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
   13340 
   13341   bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
   13342   bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
   13343   if (!UndefLower && !UndefUpper)
   13344     return SDValue();
   13345 
   13346   // Upper half is undef and lower half is whole upper subvector.
   13347   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   13348   if (UndefUpper &&
   13349       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
   13350     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
   13351                              DAG.getIntPtrConstant(HalfNumElts, DL));
   13352     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
   13353                        DAG.getIntPtrConstant(0, DL));
   13354   }
   13355 
   13356   // Lower half is undef and upper half is whole lower subvector.
   13357   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   13358   if (UndefLower &&
   13359       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
   13360     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
   13361                              DAG.getIntPtrConstant(0, DL));
   13362     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
   13363                        DAG.getIntPtrConstant(HalfNumElts, DL));
   13364   }
   13365 
   13366   // If the shuffle only uses two of the four halves of the input operands,
   13367   // then extract them and perform the 'half' shuffle at half width.
   13368   // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
   13369   int HalfIdx1 = -1, HalfIdx2 = -1;
   13370   SmallVector<int, 8> HalfMask(HalfNumElts);
   13371   unsigned Offset = UndefLower ? HalfNumElts : 0;
   13372   for (unsigned i = 0; i != HalfNumElts; ++i) {
   13373     int M = Mask[i + Offset];
   13374     if (M < 0) {
   13375       HalfMask[i] = M;
   13376       continue;
   13377     }
   13378 
   13379     // Determine which of the 4 half vectors this element is from.
   13380     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
   13381     int HalfIdx = M / HalfNumElts;
   13382 
   13383     // Determine the element index into its half vector source.
   13384     int HalfElt = M % HalfNumElts;
   13385 
   13386     // We can shuffle with up to 2 half vectors, set the new 'half'
   13387     // shuffle mask accordingly.
   13388     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
   13389       HalfMask[i] = HalfElt;
   13390       HalfIdx1 = HalfIdx;
   13391       continue;
   13392     }
   13393     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
   13394       HalfMask[i] = HalfElt + HalfNumElts;
   13395       HalfIdx2 = HalfIdx;
   13396       continue;
   13397     }
   13398 
   13399     // Too many half vectors referenced.
   13400     return SDValue();
   13401   }
   13402   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
   13403 
   13404   // Only shuffle the halves of the inputs when useful.
   13405   int NumLowerHalves =
   13406       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
   13407   int NumUpperHalves =
   13408       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
   13409 
   13410   // uuuuXXXX - don't extract uppers just to insert again.
   13411   if (UndefLower && NumUpperHalves != 0)
   13412     return SDValue();
   13413 
   13414   // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
   13415   if (UndefUpper && NumUpperHalves == 2)
   13416     return SDValue();
   13417 
   13418   // AVX2 - XXXXuuuu - always extract lowers.
   13419   if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
   13420     // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
   13421     if (VT == MVT::v4f64 || VT == MVT::v4i64)
   13422       return SDValue();
   13423     // AVX2 supports variable 32-bit element cross-lane shuffles.
   13424     if (VT == MVT::v8f32 || VT == MVT::v8i32) {
   13425       // XXXXuuuu - don't extract lowers and uppers.
   13426       if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
   13427         return SDValue();
   13428     }
   13429   }
   13430 
   13431   // AVX512 - XXXXuuuu - always extract lowers.
   13432   if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
   13433     return SDValue();
   13434 
   13435   auto GetHalfVector = [&](int HalfIdx) {
   13436     if (HalfIdx < 0)
   13437       return DAG.getUNDEF(HalfVT);
   13438     SDValue V = (HalfIdx < 2 ? V1 : V2);
   13439     HalfIdx = (HalfIdx % 2) * HalfNumElts;
   13440     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
   13441                        DAG.getIntPtrConstant(HalfIdx, DL));
   13442   };
   13443 
   13444   SDValue Half1 = GetHalfVector(HalfIdx1);
   13445   SDValue Half2 = GetHalfVector(HalfIdx2);
   13446   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
   13447   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
   13448                      DAG.getIntPtrConstant(Offset, DL));
   13449 }
   13450 
   13451 /// Test whether the specified input (0 or 1) is in-place blended by the
   13452 /// given mask.
   13453 ///
   13454 /// This returns true if the elements from a particular input are already in the
   13455 /// slot required by the given mask and require no permutation.
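          /// e.g. for a 4-element mask <0, 1, 4, 5>, input 0 is in place (elements 0
          /// and 1 already sit in result slots 0 and 1) but input 1 is not, since
          /// element 4 (V2[0]) would have to move into result slot 2.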
   13456 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
   13457   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
   13458   int Size = Mask.size();
   13459   for (int i = 0; i < Size; ++i)
   13460     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
   13461       return false;
   13462 
   13463   return true;
   13464 }
   13465 
   13466 /// Handle case where shuffle sources are coming from the same 128-bit lane and
   13467 /// every lane can be represented as the same repeating mask - allowing us to
   13468 /// shuffle the sources with the repeating shuffle and then permute the result
   13469 /// to the destination lanes.
   13470 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
   13471     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
   13472     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   13473   int NumElts = VT.getVectorNumElements();
   13474   int NumLanes = VT.getSizeInBits() / 128;
   13475   int NumLaneElts = NumElts / NumLanes;
   13476 
   13477   // On AVX2 we may be able to just shuffle the lowest elements and then
   13478   // broadcast the result.
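            // e.g. a v8i32 mask <1, 0, 1, 0, 1, 0, 1, 0> can be done by swapping
            // the two lowest elements and then broadcasting that 64-bit pair,
            // rather than emitting a general cross-lane shuffle.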
   13479   if (Subtarget.hasAVX2()) {
   13480     for (unsigned BroadcastSize : {16, 32, 64}) {
   13481       if (BroadcastSize <= VT.getScalarSizeInBits())
   13482         continue;
   13483       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
   13484 
   13485       // Attempt to match a repeating pattern every NumBroadcastElts,
    13486       // accounting for UNDEFs, but only referencing the lowest 128-bit
   13487       // lane of the inputs.
   13488       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
   13489         for (int i = 0; i != NumElts; i += NumBroadcastElts)
   13490           for (int j = 0; j != NumBroadcastElts; ++j) {
   13491             int M = Mask[i + j];
   13492             if (M < 0)
   13493               continue;
   13494             int &R = RepeatMask[j];
   13495             if (0 != ((M % NumElts) / NumLaneElts))
   13496               return false;
   13497             if (0 <= R && R != M)
   13498               return false;
   13499             R = M;
   13500           }
   13501         return true;
   13502       };
   13503 
   13504       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
   13505       if (!FindRepeatingBroadcastMask(RepeatMask))
   13506         continue;
   13507 
   13508       // Shuffle the (lowest) repeated elements in place for broadcast.
   13509       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
   13510 
   13511       // Shuffle the actual broadcast.
   13512       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
   13513       for (int i = 0; i != NumElts; i += NumBroadcastElts)
   13514         for (int j = 0; j != NumBroadcastElts; ++j)
   13515           BroadcastMask[i + j] = j;
   13516       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
   13517                                   BroadcastMask);
   13518     }
   13519   }
   13520 
   13521   // Bail if the shuffle mask doesn't cross 128-bit lanes.
   13522   if (!is128BitLaneCrossingShuffleMask(VT, Mask))
   13523     return SDValue();
   13524 
   13525   // Bail if we already have a repeated lane shuffle mask.
   13526   SmallVector<int, 8> RepeatedShuffleMask;
   13527   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
   13528     return SDValue();
   13529 
   13530   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
   13531   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
   13532   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
   13533   int NumSubLanes = NumLanes * SubLaneScale;
   13534   int NumSubLaneElts = NumLaneElts / SubLaneScale;
   13535 
   13536   // Check that all the sources are coming from the same lane and see if we can
   13537   // form a repeating shuffle mask (local to each sub-lane). At the same time,
   13538   // determine the source sub-lane for each destination sub-lane.
   13539   int TopSrcSubLane = -1;
   13540   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
   13541   SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
   13542       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
   13543       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
   13544 
   13545   for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
   13546     // Extract the sub-lane mask, check that it all comes from the same lane
   13547     // and normalize the mask entries to come from the first lane.
   13548     int SrcLane = -1;
   13549     SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
   13550     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
   13551       int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
   13552       if (M < 0)
   13553         continue;
   13554       int Lane = (M % NumElts) / NumLaneElts;
   13555       if ((0 <= SrcLane) && (SrcLane != Lane))
   13556         return SDValue();
   13557       SrcLane = Lane;
   13558       int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
   13559       SubLaneMask[Elt] = LocalM;
   13560     }
   13561 
   13562     // Whole sub-lane is UNDEF.
   13563     if (SrcLane < 0)
   13564       continue;
   13565 
   13566     // Attempt to match against the candidate repeated sub-lane masks.
   13567     for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
   13568       auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
   13569         for (int i = 0; i != NumSubLaneElts; ++i) {
   13570           if (M1[i] < 0 || M2[i] < 0)
   13571             continue;
   13572           if (M1[i] != M2[i])
   13573             return false;
   13574         }
   13575         return true;
   13576       };
   13577 
   13578       auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
   13579       if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
   13580         continue;
   13581 
   13582       // Merge the sub-lane mask into the matching repeated sub-lane mask.
   13583       for (int i = 0; i != NumSubLaneElts; ++i) {
   13584         int M = SubLaneMask[i];
   13585         if (M < 0)
   13586           continue;
   13587         assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
   13588                "Unexpected mask element");
   13589         RepeatedSubLaneMask[i] = M;
   13590       }
   13591 
    13592       // Track the topmost source sub-lane - by setting the remaining to UNDEF
   13593       // we can greatly simplify shuffle matching.
   13594       int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
   13595       TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
   13596       Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
   13597       break;
   13598     }
   13599 
   13600     // Bail if we failed to find a matching repeated sub-lane mask.
   13601     if (Dst2SrcSubLanes[DstSubLane] < 0)
   13602       return SDValue();
   13603   }
   13604   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
   13605          "Unexpected source lane");
   13606 
   13607   // Create a repeating shuffle mask for the entire vector.
   13608   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
   13609   for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
   13610     int Lane = SubLane / SubLaneScale;
   13611     auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
   13612     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
   13613       int M = RepeatedSubLaneMask[Elt];
   13614       if (M < 0)
   13615         continue;
   13616       int Idx = (SubLane * NumSubLaneElts) + Elt;
   13617       RepeatedMask[Idx] = M + (Lane * NumLaneElts);
   13618     }
   13619   }
   13620   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
   13621 
   13622   // Shuffle each source sub-lane to its destination.
   13623   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
   13624   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
   13625     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
   13626     if (SrcSubLane < 0)
   13627       continue;
   13628     for (int j = 0; j != NumSubLaneElts; ++j)
   13629       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
   13630   }
   13631 
   13632   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
   13633                               SubLaneMask);
   13634 }
   13635 
   13636 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
   13637                                          unsigned &ShuffleImm,
   13638                                          ArrayRef<int> Mask) {
   13639   int NumElts = VT.getVectorNumElements();
   13640   assert(VT.getScalarSizeInBits() == 64 &&
   13641          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
   13642          "Unexpected data type for VSHUFPD");
   13643 
   13644   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
    13645   // Mask for V4F64: 0/1,  4/5,  2/3,  6/7, ..
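            // e.g. the v4f64 mask <0, 5, 2, 7> fits this pattern: each element
            // alternates V1/V2 within its 128-bit lane, giving ShuffleImm 0b1010
            // (0xA) and the result <V1[0], V2[1], V1[2], V2[3]>.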
   13646   ShuffleImm = 0;
   13647   bool ShufpdMask = true;
   13648   bool CommutableMask = true;
   13649   for (int i = 0; i < NumElts; ++i) {
   13650     if (Mask[i] == SM_SentinelUndef)
   13651       continue;
   13652     if (Mask[i] < 0)
   13653       return false;
   13654     int Val = (i & 6) + NumElts * (i & 1);
   13655     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
   13656     if (Mask[i] < Val || Mask[i] > Val + 1)
   13657       ShufpdMask = false;
   13658     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
   13659       CommutableMask = false;
   13660     ShuffleImm |= (Mask[i] % 2) << i;
   13661   }
   13662 
   13663   if (ShufpdMask)
   13664     return true;
   13665   if (CommutableMask) {
   13666     std::swap(V1, V2);
   13667     return true;
   13668   }
   13669 
   13670   return false;
   13671 }
   13672 
   13673 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
   13674                                             ArrayRef<int> Mask, SDValue V1,
   13675                                             SDValue V2, SelectionDAG &DAG) {
    13676   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
   13677          "Unexpected data type for VSHUFPD");
   13678 
   13679   unsigned Immediate = 0;
   13680   if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
   13681     return SDValue();
   13682 
   13683   return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
   13684                      DAG.getConstant(Immediate, DL, MVT::i8));
   13685 }
   13686 
   13687 /// Handle lowering of 4-lane 64-bit floating point shuffles.
   13688 ///
   13689 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
   13690 /// isn't available.
   13691 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   13692                                        const APInt &Zeroable,
   13693                                        SDValue V1, SDValue V2,
   13694                                        const X86Subtarget &Subtarget,
   13695                                        SelectionDAG &DAG) {
   13696   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
   13697   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
   13698   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   13699 
   13700   if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
   13701                                            Zeroable, Subtarget, DAG))
   13702     return V;
   13703 
   13704   if (V2.isUndef()) {
   13705     // Check for being able to broadcast a single element.
   13706     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
   13707             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
   13708       return Broadcast;
   13709 
   13710     // Use low duplicate instructions for masks that match their pattern.
   13711     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
   13712       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
   13713 
   13714     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
   13715       // Non-half-crossing single input shuffles can be lowered with an
   13716       // interleaved permutation.
   13717       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
   13718                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
   13719       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
   13720                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
   13721     }
   13722 
   13723     // With AVX2 we have direct support for this permutation.
   13724     if (Subtarget.hasAVX2())
   13725       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
   13726                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   13727 
   13728     // Try to create an in-lane repeating shuffle mask and then shuffle the
   13729     // results into the target lanes.
   13730     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   13731             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
   13732       return V;
   13733 
   13734     // Otherwise, fall back.
   13735     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
   13736                                                    DAG, Subtarget);
   13737   }
   13738 
   13739   // Use dedicated unpack instructions for masks that match their pattern.
   13740   if (SDValue V =
   13741           lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
   13742     return V;
   13743 
   13744   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
   13745                                                 Zeroable, Subtarget, DAG))
   13746     return Blend;
   13747 
   13748   // Check if the blend happens to exactly fit that of SHUFPD.
   13749   if (SDValue Op =
   13750       lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
   13751     return Op;
   13752 
   13753   // Try to create an in-lane repeating shuffle mask and then shuffle the
   13754   // results into the target lanes.
   13755   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   13756           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
   13757     return V;
   13758 
   13759   // Try to simplify this by merging 128-bit lanes to enable a lane-based
    13760   // shuffle. However, if we have AVX2 and either input is already in place,
    13761   // we will be able to shuffle the other input even across lanes in a single
    13762   // instruction, so skip this pattern.
   13763   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
   13764                                 isShuffleMaskInputInPlace(1, Mask))))
   13765     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   13766             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
   13767       return Result;
   13768   // If we have VLX support, we can use VEXPAND.
   13769   if (Subtarget.hasVLX())
   13770     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
   13771                                                V1, V2, DAG, Subtarget))
   13772       return V;
   13773 
    13774   // If we have AVX2 then we always want to lower with a blend because AVX2
    13775   // lets us fully permute the elements of a v4 vector.
   13776   if (Subtarget.hasAVX2())
   13777     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
   13778                                                       Mask, DAG);
   13779 
   13780   // Otherwise fall back on generic lowering.
   13781   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
   13782 }
   13783 
   13784 /// Handle lowering of 4-lane 64-bit integer shuffles.
   13785 ///
   13786 /// This routine is only called when we have AVX2 and thus a reasonable
    13787 /// instruction set for v4i64 shuffling.
   13788 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   13789                                        const APInt &Zeroable,
   13790                                        SDValue V1, SDValue V2,
   13791                                        const X86Subtarget &Subtarget,
   13792                                        SelectionDAG &DAG) {
   13793   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
   13794   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
   13795   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
   13796   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
   13797 
   13798   if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
   13799                                            Zeroable, Subtarget, DAG))
   13800     return V;
   13801 
   13802   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
   13803                                                 Zeroable, Subtarget, DAG))
   13804     return Blend;
   13805 
   13806   // Check for being able to broadcast a single element.
   13807   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
   13808                                                         Mask, Subtarget, DAG))
   13809     return Broadcast;
   13810 
   13811   if (V2.isUndef()) {
    13812     // When the shuffle is mirrored between the 128-bit lanes of the vector, we
    13813     // can use lower-latency instructions that will operate on both lanes.
   13814     SmallVector<int, 2> RepeatedMask;
   13815     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
   13816       SmallVector<int, 4> PSHUFDMask;
   13817       scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
   13818       return DAG.getBitcast(
   13819           MVT::v4i64,
   13820           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
   13821                       DAG.getBitcast(MVT::v8i32, V1),
   13822                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   13823     }
   13824 
   13825     // AVX2 provides a direct instruction for permuting a single input across
   13826     // lanes.
   13827     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
   13828                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   13829   }
   13830 
   13831   // Try to use shift instructions.
   13832   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
   13833                                                 Zeroable, Subtarget, DAG))
   13834     return Shift;
   13835 
   13836   // If we have VLX support, we can use VALIGN or VEXPAND.
   13837   if (Subtarget.hasVLX()) {
   13838     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
   13839                                                     Mask, Subtarget, DAG))
   13840       return Rotate;
   13841 
   13842     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
   13843                                                V1, V2, DAG, Subtarget))
   13844       return V;
   13845   }
   13846 
   13847   // Try to use PALIGNR.
   13848   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
   13849                                                       Mask, Subtarget, DAG))
   13850     return Rotate;
   13851 
   13852   // Use dedicated unpack instructions for masks that match their pattern.
   13853   if (SDValue V =
   13854           lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
   13855     return V;
   13856 
   13857   // Try to create an in-lane repeating shuffle mask and then shuffle the
   13858   // results into the target lanes.
   13859   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   13860           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
   13861     return V;
   13862 
   13863   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   13864   // shuffle. However, if we have AVX2 and either input is already in place, we
   13865   // will be able to shuffle the other input across lanes with a single
   13866   // instruction, so skip this pattern.
   13867   if (!isShuffleMaskInputInPlace(0, Mask) &&
   13868       !isShuffleMaskInputInPlace(1, Mask))
   13869     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   13870             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
   13871       return Result;
   13872 
   13873   // Otherwise fall back on generic blend lowering.
   13874   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
   13875                                                     Mask, DAG);
   13876 }
   13877 
   13878 /// Handle lowering of 8-lane 32-bit floating point shuffles.
   13879 ///
   13880 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
   13881 /// isn't available.
   13882 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   13883                                        const APInt &Zeroable,
   13884                                        SDValue V1, SDValue V2,
   13885                                        const X86Subtarget &Subtarget,
   13886                                        SelectionDAG &DAG) {
   13887   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
   13888   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
   13889   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   13890 
   13891   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
   13892                                                 Zeroable, Subtarget, DAG))
   13893     return Blend;
   13894 
   13895   // Check for being able to broadcast a single element.
   13896   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
   13897                                                         Mask, Subtarget, DAG))
   13898     return Broadcast;
   13899 
   13900   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   13901   // options to efficiently lower the shuffle.
   13902   SmallVector<int, 4> RepeatedMask;
   13903   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
   13904     assert(RepeatedMask.size() == 4 &&
   13905            "Repeated masks must be half the mask width!");
   13906 
   13907     // Use even/odd duplicate instructions for masks that match their pattern.
   13908     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
   13909       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
   13910     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
   13911       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
   13912 
   13913     if (V2.isUndef())
   13914       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
   13915                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   13916 
   13917     // Use dedicated unpack instructions for masks that match their pattern.
   13918     if (SDValue V =
   13919             lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
   13920       return V;
   13921 
   13922     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
   13923     // have already handled any direct blends.
   13924     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
   13925   }
   13926 
   13927   // Try to create an in-lane repeating shuffle mask and then shuffle the
   13928   // results into the target lanes.
   13929   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   13930           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
   13931     return V;
   13932 
   13933   // If we have a single input shuffle with different shuffle patterns in the
   13934   // two 128-bit lanes, use a variable mask with VPERMILPS.
   13935   if (V2.isUndef()) {
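             // Materialize the shuffle mask as a constant vector so it can be used
             // as the variable-mask operand of VPERMILPS / VPERMPS below.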
   13936     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
   13937     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
   13938       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
   13939 
   13940     if (Subtarget.hasAVX2())
   13941       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
   13942 
   13943     // Otherwise, fall back.
   13944     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
   13945                                                    DAG, Subtarget);
   13946   }
   13947 
   13948   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   13949   // shuffle.
   13950   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   13951           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
   13952     return Result;
   13953   // If we have VLX support, we can use VEXPAND.
   13954   if (Subtarget.hasVLX())
   13955     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
   13956                                                V1, V2, DAG, Subtarget))
   13957       return V;
   13958 
   13959   // For non-AVX512 targets, if the mask matches a 16-bit unpack pattern, try to
   13960   // split the shuffle, since after the split we get more efficient code using
   13961   // vpunpcklwd and vpunpckhwd than with vblend.
   13962   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
   13963     if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
   13964                                                      Mask, DAG))
   13965       return V;
   13966 
   13967   // If we have AVX2 then we always want to lower with a blend because at v8 we
   13968   // can fully permute the elements.
   13969   if (Subtarget.hasAVX2())
   13970     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
   13971                                                       Mask, DAG);
   13972 
   13973   // Otherwise fall back on generic lowering.
   13974   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
   13975 }
   13976 
   13977 /// Handle lowering of 8-lane 32-bit integer shuffles.
   13978 ///
   13979 /// This routine is only called when we have AVX2 and thus a reasonable
   13980 /// instruction set for v8i32 shuffling.
   13981 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   13982                                        const APInt &Zeroable,
   13983                                        SDValue V1, SDValue V2,
   13984                                        const X86Subtarget &Subtarget,
   13985                                        SelectionDAG &DAG) {
   13986   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
   13987   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
   13988   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   13989   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
   13990 
   13991   // Whenever we can lower this as a zext, that instruction is strictly faster
   13992   // than any alternative. It also allows us to fold memory operands into the
   13993   // shuffle in many cases.
   13994   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   13995           DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
   13996     return ZExt;
   13997 
   13998   // For non-AVX512 targets, if the mask matches a 16-bit unpack pattern, try to
   13999   // split the shuffle, since after the split we get more efficient code than
   14000   // with vblend by using vpunpcklwd and vpunpckhwd.
   14001   if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
   14002       !Subtarget.hasAVX512())
   14003     if (SDValue V =
   14004             lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
   14005       return V;
   14006 
   14007   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
   14008                                                 Zeroable, Subtarget, DAG))
   14009     return Blend;
   14010 
   14011   // Check for being able to broadcast a single element.
   14012   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
   14013                                                         Mask, Subtarget, DAG))
   14014     return Broadcast;
   14015 
   14016   // If the shuffle mask is repeated in each 128-bit lane we can use more
   14017   // efficient instructions that mirror the shuffles across the two 128-bit
   14018   // lanes.
   14019   SmallVector<int, 4> RepeatedMask;
   14020   bool Is128BitLaneRepeatedShuffle =
   14021       is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
   14022   if (Is128BitLaneRepeatedShuffle) {
   14023     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
   14024     if (V2.isUndef())
   14025       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
   14026                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   14027 
   14028     // Use dedicated unpack instructions for masks that match their pattern.
   14029     if (SDValue V =
   14030             lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
   14031       return V;
   14032   }
   14033 
   14034   // Try to use shift instructions.
   14035   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
   14036                                                 Zeroable, Subtarget, DAG))
   14037     return Shift;
   14038 
   14039   // If we have VLX support, we can use VALIGN or VEXPAND.
   14040   if (Subtarget.hasVLX()) {
   14041     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
   14042                                                     Mask, Subtarget, DAG))
   14043       return Rotate;
   14044 
   14045     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
   14046                                                V1, V2, DAG, Subtarget))
   14047       return V;
   14048   }
   14049 
   14050   // Try to use byte rotation instructions.
   14051   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   14052           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   14053     return Rotate;
   14054 
   14055   // Try to create an in-lane repeating shuffle mask and then shuffle the
   14056   // results into the target lanes.
   14057   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   14058           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   14059     return V;
   14060 
   14061   // If the shuffle patterns aren't repeated but it is a single input, directly
   14062   // generate a cross-lane VPERMD instruction.
   14063   if (V2.isUndef()) {
   14064     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
   14065     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
   14066   }
   14067 
   14068   // Assume that a single SHUFPS is faster than an alternative sequence of
   14069   // multiple instructions (even if the CPU has a domain penalty).
   14070   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
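           // A single SHUFPS takes its low two result elements from one source
           // register and its high two from one source register;
           // isSingleSHUFPSMask checks that the repeated mask has that shape.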
   14071   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
   14072     SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
   14073     SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
   14074     SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
   14075                                                   CastV1, CastV2, DAG);
   14076     return DAG.getBitcast(MVT::v8i32, ShufPS);
   14077   }
   14078 
   14079   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   14080   // shuffle.
   14081   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   14082           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
   14083     return Result;
   14084 
   14085   // Otherwise fall back on generic blend lowering.
   14086   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
   14087                                                     Mask, DAG);
   14088 }
   14089 
   14090 /// Handle lowering of 16-lane 16-bit integer shuffles.
   14091 ///
   14092 /// This routine is only called when we have AVX2 and thus a reasonable
   14093 /// instruction set for v16i16 shuffling.
   14094 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14095                                         const APInt &Zeroable,
   14096                                         SDValue V1, SDValue V2,
   14097                                         const X86Subtarget &Subtarget,
   14098                                         SelectionDAG &DAG) {
   14099   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
   14100   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
   14101   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   14102   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
   14103 
   14104   // Whenever we can lower this as a zext, that instruction is strictly faster
   14105   // than any alternative. It also allows us to fold memory operands into the
   14106   // shuffle in many cases.
   14107   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   14108           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
   14109     return ZExt;
   14110 
   14111   // Check for being able to broadcast a single element.
   14112   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
   14113                                                         Mask, Subtarget, DAG))
   14114     return Broadcast;
   14115 
   14116   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
   14117                                                 Zeroable, Subtarget, DAG))
   14118     return Blend;
   14119 
   14120   // Use dedicated unpack instructions for masks that match their pattern.
   14121   if (SDValue V =
   14122           lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
   14123     return V;
   14124 
   14125   // Use dedicated pack instructions for masks that match their pattern.
   14126   if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
   14127                                              Subtarget))
   14128     return V;
   14129 
   14130   // Try to use shift instructions.
   14131   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
   14132                                                 Zeroable, Subtarget, DAG))
   14133     return Shift;
   14134 
   14135   // Try to use byte rotation instructions.
   14136   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   14137           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   14138     return Rotate;
   14139 
   14140   // Try to create an in-lane repeating shuffle mask and then shuffle the
   14141   // results into the target lanes.
   14142   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   14143           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   14144     return V;
   14145 
   14146   if (V2.isUndef()) {
   14147     // There are no generalized cross-lane shuffle operations available on i16
   14148     // element types.
   14149     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
   14150       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
   14151                                                      Mask, DAG, Subtarget);
   14152 
   14153     SmallVector<int, 8> RepeatedMask;
   14154     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
   14155       // As this is a single-input shuffle, the repeated mask should be
   14156       // a strictly valid v8i16 mask that we can pass through to the v8i16
   14157       // lowering to handle even the v16 case.
   14158       return lowerV8I16GeneralSingleInputVectorShuffle(
   14159           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
   14160     }
   14161   }
   14162 
   14163   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
   14164           DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
   14165     return PSHUFB;
   14166 
   14167   // AVX512BWVL can lower to VPERMW.
   14168   if (Subtarget.hasBWI() && Subtarget.hasVLX())
   14169     return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
   14170 
   14171   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   14172   // shuffle.
   14173   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   14174           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
   14175     return Result;
   14176 
   14177   // Otherwise fall back on generic lowering.
   14178   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
   14179 }
   14180 
   14181 /// Handle lowering of 32-lane 8-bit integer shuffles.
   14182 ///
   14183 /// This routine is only called when we have AVX2 and thus a reasonable
   14184 /// instruction set for v32i8 shuffling.
   14185 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14186                                        const APInt &Zeroable,
   14187                                        SDValue V1, SDValue V2,
   14188                                        const X86Subtarget &Subtarget,
   14189                                        SelectionDAG &DAG) {
   14190   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
   14191   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
   14192   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   14193   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
   14194 
   14195   // Whenever we can lower this as a zext, that instruction is strictly faster
   14196   // than any alternative. It also allows us to fold memory operands into the
   14197   // shuffle in many cases.
   14198   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   14199           DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
   14200     return ZExt;
   14201 
   14202   // Check for being able to broadcast a single element.
   14203   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
   14204                                                         Mask, Subtarget, DAG))
   14205     return Broadcast;
   14206 
   14207   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
   14208                                                 Zeroable, Subtarget, DAG))
   14209     return Blend;
   14210 
   14211   // Use dedicated unpack instructions for masks that match their pattern.
   14212   if (SDValue V =
   14213           lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
   14214     return V;
   14215 
   14216   // Use dedicated pack instructions for masks that match their pattern.
   14217   if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
   14218                                              Subtarget))
   14219     return V;
   14220 
   14221   // Try to use shift instructions.
   14222   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
   14223                                                 Zeroable, Subtarget, DAG))
   14224     return Shift;
   14225 
   14226   // Try to use byte rotation instructions.
   14227   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   14228           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   14229     return Rotate;
   14230 
   14231   // Try to create an in-lane repeating shuffle mask and then shuffle the
   14232   // results into the target lanes.
   14233   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   14234           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   14235     return V;
   14236 
   14237   // There are no generalized cross-lane shuffle operations available on i8
   14238   // element types.
   14239   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
   14240     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
   14241                                                    DAG, Subtarget);
   14242 
   14243   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
   14244           DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
   14245     return PSHUFB;
   14246 
   14247   // AVX512VBMIVL can lower to VPERMB.
   14248   if (Subtarget.hasVBMI() && Subtarget.hasVLX())
   14249     return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
   14250 
   14251   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   14252   // shuffle.
   14253   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
   14254           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
   14255     return Result;
   14256 
   14257   // Otherwise fall back on generic lowering.
   14258   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
   14259 }
   14260 
   14261 /// High-level routine to lower various 256-bit x86 vector shuffles.
   14262 ///
   14263 /// This routine either breaks down the specific type of a 256-bit x86 vector
   14264 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
   14265 /// together based on the available instructions.
   14266 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14267                                         MVT VT, SDValue V1, SDValue V2,
   14268                                         const APInt &Zeroable,
   14269                                         const X86Subtarget &Subtarget,
   14270                                         SelectionDAG &DAG) {
   14271   // If only one element comes from V2 and it feeds the zero element, insert it
   14272   // into V1 if we can do so cheaply.
   14273   int NumElts = VT.getVectorNumElements();
   14274   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
   14275 
   14276   if (NumV2Elements == 1 && Mask[0] >= NumElts)
   14277     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   14278             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
   14279       return Insertion;
   14280 
   14281   // Handle special cases where the lower or upper half is UNDEF.
   14282   if (SDValue V =
   14283           lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
   14284     return V;
   14285 
   14286   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
   14287   // can check for those subtargets here and avoid much of the subtarget
   14288   // querying in the per-vector-type lowering routines. With AVX1 we have
   14289   // essentially *zero* ability to manipulate a 256-bit vector with integer
   14290   // types. Since we'll use floating point types there eventually, just
   14291   // immediately cast everything to a float and operate entirely in that domain.
   14292   if (VT.isInteger() && !Subtarget.hasAVX2()) {
   14293     int ElementBits = VT.getScalarSizeInBits();
   14294     if (ElementBits < 32) {
   14295       // No floating point type available, if we can't use the bit operations
   14296       // for masking/blending then decompose into 128-bit vectors.
   14297       if (SDValue V =
   14298               lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
   14299         return V;
   14300       if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
   14301         return V;
   14302       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
   14303     }
   14304 
   14305     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
   14306                                 VT.getVectorNumElements());
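             // e.g. v8i32 becomes v8f32 and v4i64 becomes v4f64.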
   14307     V1 = DAG.getBitcast(FpVT, V1);
   14308     V2 = DAG.getBitcast(FpVT, V2);
   14309     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
   14310   }
   14311 
   14312   switch (VT.SimpleTy) {
   14313   case MVT::v4f64:
   14314     return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14315   case MVT::v4i64:
   14316     return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14317   case MVT::v8f32:
   14318     return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14319   case MVT::v8i32:
   14320     return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14321   case MVT::v16i16:
   14322     return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14323   case MVT::v32i8:
   14324     return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14325 
   14326   default:
   14327     llvm_unreachable("Not a valid 256-bit x86 vector type!");
   14328   }
   14329 }
   14330 
   14331 /// Try to lower a vector shuffle as a 128-bit shuffles.
   14332 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
   14333                                         ArrayRef<int> Mask,
   14334                                         const APInt &Zeroable,
   14335                                         SDValue V1, SDValue V2,
   14336                                         const X86Subtarget &Subtarget,
   14337                                         SelectionDAG &DAG) {
   14338   assert(VT.getScalarSizeInBits() == 64 &&
   14339          "Unexpected element type size for 128bit shuffle.");
   14340 
   14341   // Handling a 256-bit vector here would require VLX, and most probably
   14342   // lowerV2X128VectorShuffle() is the better solution for that case.
   14343   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
   14344 
   14345   // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
   14346   SmallVector<int, 4> WidenedMask;
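           // Collapse the mask to one index per 128-bit chunk; this only succeeds
           // when each pair of 64-bit elements is undef or addresses one aligned
           // 128-bit chunk.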
   14347   if (!canWidenShuffleElements(Mask, WidenedMask))
   14348     return SDValue();
   14349 
   14350   // Try to use an insert into a zero vector.
   14351   if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
   14352       (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
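             // Zeroable has one bit per 64-bit element: 0xf0 means the upper four
             // elements are zero and 0x0c means elements 2 and 3 are zero, so we
             // keep either the low 256 or the low 128 bits of V1 and pad with zeros.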
   14353     unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
   14354     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
   14355     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
   14356                               DAG.getIntPtrConstant(0, DL));
   14357     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
   14358                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
   14359                        DAG.getIntPtrConstant(0, DL));
   14360   }
   14361 
   14362   // Check for patterns which can be matched with a single insert of a 256-bit
   14363   // subvector.
   14364   bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
   14365                                         {0, 1, 2, 3, 0, 1, 2, 3});
   14366   if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
   14367                                         {0, 1, 2, 3, 8, 9, 10, 11})) {
   14368     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
   14369     SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
   14370                                  OnlyUsesV1 ? V1 : V2,
   14371                                  DAG.getIntPtrConstant(0, DL));
   14372     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
   14373                        DAG.getIntPtrConstant(4, DL));
   14374   }
   14375 
   14376   assert(WidenedMask.size() == 4);
   14377 
   14378   // See if this is an insertion of the lower 128-bits of V2 into V1.
   14379   bool IsInsert = true;
   14380   int V2Index = -1;
   14381   for (int i = 0; i < 4; ++i) {
   14382     assert(WidenedMask[i] >= -1);
   14383     if (WidenedMask[i] < 0)
   14384       continue;
   14385 
   14386     // Make sure all V1 subvectors are in place.
   14387     if (WidenedMask[i] < 4) {
   14388       if (WidenedMask[i] != i) {
   14389         IsInsert = false;
   14390         break;
   14391       }
   14392     } else {
   14393       // Make sure we only have a single V2 index and it's the lowest 128 bits.
   14394       if (V2Index >= 0 || WidenedMask[i] != 4) {
   14395         IsInsert = false;
   14396         break;
   14397       }
   14398       V2Index = i;
   14399     }
   14400   }
   14401   if (IsInsert && V2Index >= 0) {
   14402     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
   14403     SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
   14404                                  DAG.getIntPtrConstant(0, DL));
   14405     return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
   14406   }
   14407 
   14408   // Try to lower to vshuf64x2/vshuf32x4.
   14409   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
   14410   unsigned PermMask = 0;
   14411   // Ensure elements came from the same Op.
   14412   for (int i = 0; i < 4; ++i) {
   14413     assert(WidenedMask[i] >= -1);
   14414     if (WidenedMask[i] < 0)
   14415       continue;
   14416 
   14417     SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
   14418     unsigned OpIndex = i / 2;
   14419     if (Ops[OpIndex].isUndef())
   14420       Ops[OpIndex] = Op;
   14421     else if (Ops[OpIndex] != Op)
   14422       return SDValue();
   14423 
   14424     // Convert the 128-bit shuffle mask selection values into 128-bit selection
   14425     // bits defined by a vshuf64x2 instruction's immediate control byte.
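             // Bits [2*i+1:2*i] of the immediate select which 128-bit chunk of
             // Ops[i/2] supplies result chunk i.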
   14426     PermMask |= (WidenedMask[i] % 4) << (i * 2);
   14427   }
   14428 
   14429   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
   14430                      DAG.getConstant(PermMask, DL, MVT::i8));
   14431 }
   14432 
   14433 /// Handle lowering of 8-lane 64-bit floating point shuffles.
   14434 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14435                                        const APInt &Zeroable,
   14436                                        SDValue V1, SDValue V2,
   14437                                        const X86Subtarget &Subtarget,
   14438                                        SelectionDAG &DAG) {
   14439   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
   14440   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
   14441   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   14442 
   14443   if (V2.isUndef()) {
   14444     // Use low duplicate instructions for masks that match their pattern.
   14445     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
   14446       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
   14447 
   14448     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
   14449       // Non-half-crossing single input shuffles can be lowered with an
   14450       // interleaved permutation.
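               // Bit i of the VPERMILPD immediate selects the high (1) or low (0)
               // element of the 128-bit lane feeding result element i.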
   14451       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
   14452                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
   14453                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
   14454                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
   14455       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
   14456                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
   14457     }
   14458 
   14459     SmallVector<int, 4> RepeatedMask;
   14460     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
   14461       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
   14462                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   14463   }
   14464 
   14465   if (SDValue Shuf128 =
   14466           lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
   14467                                    Subtarget, DAG))
   14468     return Shuf128;
   14469 
   14470   if (SDValue Unpck =
   14471           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
   14472     return Unpck;
   14473 
   14474   // Check if the blend happens to exactly fit that of SHUFPD.
   14475   if (SDValue Op =
   14476       lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
   14477     return Op;
   14478 
   14479   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
   14480                                              V2, DAG, Subtarget))
   14481     return V;
   14482 
   14483   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
   14484                                                 Zeroable, Subtarget, DAG))
   14485     return Blend;
   14486 
   14487   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
   14488 }
   14489 
   14490 /// Handle lowering of 16-lane 32-bit floating point shuffles.
   14491 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14492                                         const APInt &Zeroable,
   14493                                         SDValue V1, SDValue V2,
   14494                                         const X86Subtarget &Subtarget,
   14495                                         SelectionDAG &DAG) {
   14496   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
   14497   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
   14498   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   14499 
   14500   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   14501   // options to efficiently lower the shuffle.
   14502   SmallVector<int, 4> RepeatedMask;
   14503   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
   14504     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
   14505 
   14506     // Use even/odd duplicate instructions for masks that match their pattern.
   14507     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
   14508       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
   14509     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
   14510       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
   14511 
   14512     if (V2.isUndef())
   14513       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
   14514                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   14515 
   14516     // Use dedicated unpack instructions for masks that match their pattern.
   14517     if (SDValue Unpck =
   14518             lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
   14519       return Unpck;
   14520 
   14521     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
   14522                                                   Zeroable, Subtarget, DAG))
   14523       return Blend;
   14524 
   14525     // Otherwise, fall back to a SHUFPS sequence.
   14526     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
   14527   }
   14528 
   14529   // If we have a single-input shuffle that doesn't cross 128-bit lanes but has
   14530   // different patterns in each lane, use a variable-mask VPERMILPS.
   14531   if (V2.isUndef() &&
   14532       !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
   14533     SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
   14534     return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
   14535   }
   14536 
   14537   // If we have AVX512F support, we can use VEXPAND.
   14538   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
   14539                                              V1, V2, DAG, Subtarget))
   14540     return V;
   14541 
   14542   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
   14543 }
   14544 
   14545 /// Handle lowering of 8-lane 64-bit integer shuffles.
   14546 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14547                                        const APInt &Zeroable,
   14548                                        SDValue V1, SDValue V2,
   14549                                        const X86Subtarget &Subtarget,
   14550                                        SelectionDAG &DAG) {
   14551   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   14552   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   14553   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
   14554 
   14555   if (V2.isUndef()) {
   14556     // When the shuffle mask is mirrored between the 128-bit lanes, we can use
   14557     // lower-latency instructions that operate on all four 128-bit lanes at
   14558     // once.
   14559     SmallVector<int, 2> Repeated128Mask;
   14560     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
   14561       SmallVector<int, 4> PSHUFDMask;
   14562       scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
   14563       return DAG.getBitcast(
   14564           MVT::v8i64,
   14565           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
   14566                       DAG.getBitcast(MVT::v16i32, V1),
   14567                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   14568     }
   14569 
   14570     SmallVector<int, 4> Repeated256Mask;
   14571     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
   14572       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
   14573                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
   14574   }
   14575 
   14576   if (SDValue Shuf128 =
   14577           lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
   14578                                    V1, V2, Subtarget, DAG))
   14579     return Shuf128;
   14580 
   14581   // Try to use shift instructions.
   14582   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
   14583                                                 Zeroable, Subtarget, DAG))
   14584     return Shift;
   14585 
   14586   // Try to use VALIGN.
   14587   if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
   14588                                                   Mask, Subtarget, DAG))
   14589     return Rotate;
   14590 
   14591   // Try to use PALIGNR.
   14592   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
   14593                                                       Mask, Subtarget, DAG))
   14594     return Rotate;
   14595 
   14596   if (SDValue Unpck =
   14597           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
   14598     return Unpck;
   14599   // If we have AVX512F support, we can use VEXPAND.
   14600   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
   14601                                              V2, DAG, Subtarget))
   14602     return V;
   14603 
   14604   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
   14605                                                 Zeroable, Subtarget, DAG))
   14606     return Blend;
   14607 
   14608   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
   14609 }
   14610 
   14611 /// Handle lowering of 16-lane 32-bit integer shuffles.
   14612 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14613                                         const APInt &Zeroable,
   14614                                         SDValue V1, SDValue V2,
   14615                                         const X86Subtarget &Subtarget,
   14616                                         SelectionDAG &DAG) {
   14617   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
   14618   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
   14619   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   14620 
   14621   // Whenever we can lower this as a zext, that instruction is strictly faster
   14622   // than any alternative. It also allows us to fold memory operands into the
   14623   // shuffle in many cases.
   14624   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   14625           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
   14626     return ZExt;
   14627 
   14628   // If the shuffle mask is repeated in each 128-bit lane we can use more
   14629   // efficient instructions that mirror the shuffles across the four 128-bit
   14630   // lanes.
   14631   SmallVector<int, 4> RepeatedMask;
   14632   bool Is128BitLaneRepeatedShuffle =
   14633       is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
   14634   if (Is128BitLaneRepeatedShuffle) {
   14635     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
   14636     if (V2.isUndef())
   14637       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
   14638                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
   14639 
   14640     // Use dedicated unpack instructions for masks that match their pattern.
   14641     if (SDValue V =
   14642             lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
   14643       return V;
   14644   }
   14645 
   14646   // Try to use shift instructions.
   14647   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
   14648                                                 Zeroable, Subtarget, DAG))
   14649     return Shift;
   14650 
   14651   // Try to use VALIGN.
   14652   if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
   14653                                                   Mask, Subtarget, DAG))
   14654     return Rotate;
   14655 
   14656   // Try to use byte rotation instructions.
   14657   if (Subtarget.hasBWI())
   14658     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   14659             DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
   14660       return Rotate;
   14661 
   14662   // Assume that a single SHUFPS is faster than using a permv shuffle.
   14663   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
   14664   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
   14665     SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
   14666     SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
   14667     SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
   14668                                                   CastV1, CastV2, DAG);
   14669     return DAG.getBitcast(MVT::v16i32, ShufPS);
   14670   }
   14671   // If we have AVX512F support, we can use VEXPAND.
   14672   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
   14673                                              V1, V2, DAG, Subtarget))
   14674     return V;
   14675 
   14676   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
   14677                                                 Zeroable, Subtarget, DAG))
   14678     return Blend;
   14679   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
   14680 }
   14681 
   14682 /// Handle lowering of 32-lane 16-bit integer shuffles.
   14683 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14684                                         const APInt &Zeroable,
   14685                                         SDValue V1, SDValue V2,
   14686                                         const X86Subtarget &Subtarget,
   14687                                         SelectionDAG &DAG) {
   14688   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
   14689   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
   14690   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   14691   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
   14692 
   14693   // Whenever we can lower this as a zext, that instruction is strictly faster
   14694   // than any alternative. It also allows us to fold memory operands into the
   14695   // shuffle in many cases.
   14696   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   14697           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
   14698     return ZExt;
   14699 
   14700   // Use dedicated unpack instructions for masks that match their pattern.
   14701   if (SDValue V =
   14702           lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
   14703     return V;
   14704 
   14705   // Try to use shift instructions.
   14706   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
   14707                                                 Zeroable, Subtarget, DAG))
   14708     return Shift;
   14709 
   14710   // Try to use byte rotation instructions.
   14711   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   14712           DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
   14713     return Rotate;
   14714 
   14715   if (V2.isUndef()) {
   14716     SmallVector<int, 8> RepeatedMask;
   14717     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
   14718       // As this is a single-input shuffle, the repeated mask should be
   14719       // a strictly valid v8i16 mask that we can pass through to the v8i16
   14720       // lowering to handle even the v32 case.
   14721       return lowerV8I16GeneralSingleInputVectorShuffle(
   14722           DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
   14723     }
   14724   }
   14725 
   14726   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
   14727                                                 Zeroable, Subtarget, DAG))
   14728     return Blend;
   14729 
   14730   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
   14731           DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
   14732     return PSHUFB;
   14733 
   14734   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
   14735 }
   14736 
   14737 /// Handle lowering of 64-lane 8-bit integer shuffles.
   14738 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14739                                        const APInt &Zeroable,
   14740                                        SDValue V1, SDValue V2,
   14741                                        const X86Subtarget &Subtarget,
   14742                                        SelectionDAG &DAG) {
   14743   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
   14744   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
   14745   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
   14746   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
   14747 
   14748   // Whenever we can lower this as a zext, that instruction is strictly faster
   14749   // than any alternative. It also allows us to fold memory operands into the
   14750   // shuffle in many cases.
   14751   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
   14752           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
   14753     return ZExt;
   14754 
   14755   // Use dedicated unpack instructions for masks that match their pattern.
   14756   if (SDValue V =
   14757           lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
   14758     return V;
   14759 
   14760   // Try to use shift instructions.
   14761   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
   14762                                                 Zeroable, Subtarget, DAG))
   14763     return Shift;
   14764 
   14765   // Try to use byte rotation instructions.
   14766   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
   14767           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
   14768     return Rotate;
   14769 
   14770   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
   14771           DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
   14772     return PSHUFB;
   14773 
   14774   // VBMI can use VPERMV/VPERMV3 byte shuffles.
   14775   if (Subtarget.hasVBMI())
   14776     return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
   14777 
   14778   // Try to create an in-lane repeating shuffle mask and then shuffle the
   14779   // results into the target lanes.
   14780   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
   14781           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
   14782     return V;
   14783 
   14784   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
   14785                                                 Zeroable, Subtarget, DAG))
   14786     return Blend;
   14787 
   14788   // FIXME: Implement direct support for this type!
   14789   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
   14790 }
   14791 
   14792 /// High-level routine to lower various 512-bit x86 vector shuffles.
   14793 ///
   14794 /// This routine either breaks down the specific type of a 512-bit x86 vector
   14795 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
   14796 /// together based on the available instructions.
   14797 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14798                                         MVT VT, SDValue V1, SDValue V2,
   14799                                         const APInt &Zeroable,
   14800                                         const X86Subtarget &Subtarget,
   14801                                         SelectionDAG &DAG) {
   14802   assert(Subtarget.hasAVX512() &&
   14803          "Cannot lower 512-bit vectors w/ basic ISA!");
   14804 
   14805   // If only one element comes from V2 and it feeds the zero element, insert it
   14806   // into V1 if we can do so cheaply.
   14807   int NumElts = Mask.size();
   14808   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
   14809 
   14810   if (NumV2Elements == 1 && Mask[0] >= NumElts)
   14811     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
   14812             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
   14813       return Insertion;
   14814 
   14815   // Handle special cases where the lower or upper half is UNDEF.
   14816   if (SDValue V =
   14817         lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
   14818     return V;
   14819 
   14820   // Check for being able to broadcast a single element.
   14821   if (SDValue Broadcast =
   14822           lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
   14823     return Broadcast;
   14824 
   14825   // Dispatch to each element type for lowering. If we don't have support for
   14826   // specific element type shuffles at 512 bits, immediately split them and
   14827   // lower them. Each lowering routine of a given type is allowed to assume that
   14828   // the requisite ISA extensions for that element type are available.
   14829   switch (VT.SimpleTy) {
   14830   case MVT::v8f64:
   14831     return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14832   case MVT::v16f32:
   14833     return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14834   case MVT::v8i64:
   14835     return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14836   case MVT::v16i32:
   14837     return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14838   case MVT::v32i16:
   14839     return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14840   case MVT::v64i8:
   14841     return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   14842 
   14843   default:
   14844     llvm_unreachable("Not a valid 512-bit x86 vector type!");
   14845   }
   14846 }
   14847 
   14848 // Lower vXi1 vector shuffles.
   14849 // There is no dedicated instruction on AVX-512 that shuffles the masks.
   14850 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
   14851 // vector, shuffle it, and then truncate back.
   14852 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   14853                                       MVT VT, SDValue V1, SDValue V2,
   14854                                       const APInt &Zeroable,
   14855                                       const X86Subtarget &Subtarget,
   14856                                       SelectionDAG &DAG) {
   14857   unsigned NumElts = Mask.size();
   14858 
   14859   // Try to recognize shuffles that are just padding a subvector with zeros.
   14860   unsigned SubvecElts = 0;
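           // Count how many leading elements are undef or an identity mapping; those
           // form the subvector we can extract from V1.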
   14861   for (int i = 0; i != (int)NumElts; ++i) {
   14862     if (Mask[i] >= 0 && Mask[i] != i)
   14863       break;
   14864 
   14865     ++SubvecElts;
   14866   }
   14867   assert(SubvecElts != NumElts && "Identity shuffle?");
   14868 
   14869   // Clip to a power of 2.
   14870   SubvecElts = PowerOf2Floor(SubvecElts);
   14871 
   14872   // Make sure the number of zeroable bits in the top at least covers the bits
   14873   // not covered by the subvector.
   14874   if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
   14875     MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
   14876     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
   14877                                   V1, DAG.getIntPtrConstant(0, DL));
   14878     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
   14879                        getZeroVector(VT, Subtarget, DAG, DL),
   14880                        Extract, DAG.getIntPtrConstant(0, DL));
   14881   }
   14882 
   14884   assert(Subtarget.hasAVX512() &&
   14885          "Cannot lower 512-bit vectors w/o basic ISA!");
   14886   MVT ExtVT;
   14887   switch (VT.SimpleTy) {
   14888   default:
   14889     llvm_unreachable("Expected a vector of i1 elements");
   14890   case MVT::v2i1:
   14891     ExtVT = MVT::v2i64;
   14892     break;
   14893   case MVT::v4i1:
   14894     ExtVT = MVT::v4i32;
   14895     break;
   14896   case MVT::v8i1:
   14897     // Take a 512-bit type to get more shuffle options on KNL. If we have VLX,
   14898     // use a 256-bit shuffle.
   14899     ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
   14900     break;
   14901   case MVT::v16i1:
   14902     // Take 512-bit type, unless we are avoiding 512-bit types and have the
   14903     // 256-bit operation available.
   14904     ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
   14905     break;
   14906   case MVT::v32i1:
   14907     // Take 512-bit type, unless we are avoiding 512-bit types and have the
   14908     // 256-bit operation available.
   14909     assert(Subtarget.hasBWI() && "Expected AVX512BW support");
   14910     ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
   14911     break;
   14912   case MVT::v64i1:
   14913     ExtVT = MVT::v64i8;
   14914     break;
   14915   }
   14916 
   14917   V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
   14918   V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
   14919 
   14920   SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
   14921   // i1 was sign-extended, so compare against zero to convert back to a mask.
   14922   int NumElems = VT.getVectorNumElements();
   14923   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
   14924       (Subtarget.hasDQI() && (NumElems < 32)))
   14925     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
   14926                        Shuffle, ISD::SETGT);
   14927 
   14928   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
   14929 }
   14930 
   14931 /// Helper function that returns true if the shuffle mask should be
   14932 /// commuted to improve canonicalization.
   14933 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
   14934   int NumElements = Mask.size();
   14935 
   14936   int NumV1Elements = 0, NumV2Elements = 0;
   14937   for (int M : Mask)
   14938     if (M < 0)
   14939       continue;
   14940     else if (M < NumElements)
   14941       ++NumV1Elements;
   14942     else
   14943       ++NumV2Elements;
   14944 
   14945   // Commute the shuffle as needed such that more elements come from V1 than
   14946   // V2. This allows us to match the shuffle pattern strictly on how many
   14947   // elements come from V1 without handling the symmetric cases.
   14948   if (NumV2Elements > NumV1Elements)
   14949     return true;
   14950 
   14951   assert(NumV1Elements > 0 && "No V1 indices");
   14952 
   14953   if (NumV2Elements == 0)
   14954     return false;
   14955 
   14956   // When the number of V1 and V2 elements are the same, try to minimize the
   14957   // number of uses of V2 in the low half of the vector. When that is tied,
   14958   // ensure that the sum of indices for V1 is equal to or lower than the sum
    14959   // of indices for V2. When those are equal, try to ensure that the number of odd
   14960   // indices for V1 is lower than the number of odd indices for V2.
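            // For example (illustrative): with Mask = {4,1,6,3} on 4-element vectors,
            // both inputs contribute two elements and the low half uses one element from
            // each, but the V2 indices sum to 0+2 = 2 versus 1+3 = 4 for V1, so the
            // shuffle is commuted.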
   14961   if (NumV1Elements == NumV2Elements) {
   14962     int LowV1Elements = 0, LowV2Elements = 0;
   14963     for (int M : Mask.slice(0, NumElements / 2))
   14964       if (M >= NumElements)
   14965         ++LowV2Elements;
   14966       else if (M >= 0)
   14967         ++LowV1Elements;
   14968     if (LowV2Elements > LowV1Elements)
   14969       return true;
   14970     if (LowV2Elements == LowV1Elements) {
   14971       int SumV1Indices = 0, SumV2Indices = 0;
   14972       for (int i = 0, Size = Mask.size(); i < Size; ++i)
   14973         if (Mask[i] >= NumElements)
   14974           SumV2Indices += i;
   14975         else if (Mask[i] >= 0)
   14976           SumV1Indices += i;
   14977       if (SumV2Indices < SumV1Indices)
   14978         return true;
   14979       if (SumV2Indices == SumV1Indices) {
   14980         int NumV1OddIndices = 0, NumV2OddIndices = 0;
   14981         for (int i = 0, Size = Mask.size(); i < Size; ++i)
   14982           if (Mask[i] >= NumElements)
   14983             NumV2OddIndices += i % 2;
   14984           else if (Mask[i] >= 0)
   14985             NumV1OddIndices += i % 2;
   14986         if (NumV2OddIndices < NumV1OddIndices)
   14987           return true;
   14988       }
   14989     }
   14990   }
   14991 
   14992   return false;
   14993 }
   14994 
   14995 /// Top-level lowering for x86 vector shuffles.
   14996 ///
   14997 /// This handles decomposition, canonicalization, and lowering of all x86
   14998 /// vector shuffles. Most of the specific lowering strategies are encapsulated
   14999 /// above in helper routines. The canonicalization attempts to widen shuffles
   15000 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
   15001 /// s.t. only one of the two inputs needs to be tested, etc.
   15002 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
   15003                                   SelectionDAG &DAG) {
   15004   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   15005   ArrayRef<int> Mask = SVOp->getMask();
   15006   SDValue V1 = Op.getOperand(0);
   15007   SDValue V2 = Op.getOperand(1);
   15008   MVT VT = Op.getSimpleValueType();
   15009   int NumElements = VT.getVectorNumElements();
   15010   SDLoc DL(Op);
   15011   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
   15012 
   15013   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
   15014          "Can't lower MMX shuffles");
   15015 
   15016   bool V1IsUndef = V1.isUndef();
   15017   bool V2IsUndef = V2.isUndef();
   15018   if (V1IsUndef && V2IsUndef)
   15019     return DAG.getUNDEF(VT);
   15020 
    15021   // When we create a shuffle node we put the UNDEF node in the second operand,
   15022   // but in some cases the first operand may be transformed to UNDEF.
   15023   // In this case we should just commute the node.
   15024   if (V1IsUndef)
   15025     return DAG.getCommutedVectorShuffle(*SVOp);
   15026 
   15027   // Check for non-undef masks pointing at an undef vector and make the masks
   15028   // undef as well. This makes it easier to match the shuffle based solely on
   15029   // the mask.
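            // For example (illustrative): if V2 is undef, a mask of {0,5,2,7} on
            // 4-element vectors is rewritten as {0,-1,2,-1}.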
   15030   if (V2IsUndef)
   15031     for (int M : Mask)
   15032       if (M >= NumElements) {
   15033         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
   15034         for (int &M : NewMask)
   15035           if (M >= NumElements)
   15036             M = -1;
   15037         return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
   15038       }
   15039 
   15040   // Check for illegal shuffle mask element index values.
   15041   int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
   15042   assert(llvm::all_of(Mask,
   15043                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
   15044          "Out of bounds shuffle index");
   15045 
   15046   // We actually see shuffles that are entirely re-arrangements of a set of
   15047   // zero inputs. This mostly happens while decomposing complex shuffles into
   15048   // simple ones. Directly lower these as a buildvector of zeros.
   15049   APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
   15050   if (Zeroable.isAllOnesValue())
   15051     return getZeroVector(VT, Subtarget, DAG, DL);
   15052 
   15053   bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
   15054 
   15055   // Create an alternative mask with info about zeroable elements.
   15056   // Here we do not set undef elements as zeroable.
   15057   SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
   15058   if (V2IsZero) {
   15059     assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
   15060     for (int i = 0; i != NumElements; ++i)
   15061       if (Mask[i] != SM_SentinelUndef && Zeroable[i])
   15062         ZeroableMask[i] = SM_SentinelZero;
   15063   }
   15064 
   15065   // Try to collapse shuffles into using a vector type with fewer elements but
   15066   // wider element types. We cap this to not form integers or floating point
   15067   // elements wider than 64 bits, but it might be interesting to form i128
   15068   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
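            // For example (illustrative): a v4i32 shuffle with mask {0,1,4,5} can be
            // widened to a v2i64 shuffle with mask {0,2}.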
   15069   SmallVector<int, 16> WidenedMask;
   15070   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
   15071       canWidenShuffleElements(ZeroableMask, WidenedMask)) {
   15072     MVT NewEltVT = VT.isFloatingPoint()
   15073                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
   15074                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
   15075     int NewNumElts = NumElements / 2;
   15076     MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
   15077     // Make sure that the new vector type is legal. For example, v2f64 isn't
   15078     // legal on SSE1.
   15079     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
   15080       if (V2IsZero) {
   15081         // Modify the new Mask to take all zeros from the all-zero vector.
   15082         // Choose indices that are blend-friendly.
   15083         bool UsedZeroVector = false;
   15084         assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
   15085                "V2's non-undef elements are used?!");
   15086         for (int i = 0; i != NewNumElts; ++i)
   15087           if (WidenedMask[i] == SM_SentinelZero) {
   15088             WidenedMask[i] = i + NewNumElts;
   15089             UsedZeroVector = true;
   15090           }
   15091         // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
   15092         // some elements to be undef.
   15093         if (UsedZeroVector)
   15094           V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
   15095       }
   15096       V1 = DAG.getBitcast(NewVT, V1);
   15097       V2 = DAG.getBitcast(NewVT, V2);
   15098       return DAG.getBitcast(
   15099           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
   15100     }
   15101   }
   15102 
   15103   // Commute the shuffle if it will improve canonicalization.
   15104   if (canonicalizeShuffleMaskWithCommute(Mask))
   15105     return DAG.getCommutedVectorShuffle(*SVOp);
   15106 
   15107   if (SDValue V =
   15108           lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
   15109     return V;
   15110 
   15111   // For each vector width, delegate to a specialized lowering routine.
   15112   if (VT.is128BitVector())
   15113     return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
   15114                                     DAG);
   15115 
   15116   if (VT.is256BitVector())
   15117     return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
   15118                                     DAG);
   15119 
   15120   if (VT.is512BitVector())
   15121     return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
   15122                                     DAG);
   15123 
   15124   if (Is1BitVector)
   15125     return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
   15126                                   DAG);
   15127 
   15128   llvm_unreachable("Unimplemented!");
   15129 }
   15130 
   15131 /// Try to lower a VSELECT instruction to a vector shuffle.
   15132 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
   15133                                            const X86Subtarget &Subtarget,
   15134                                            SelectionDAG &DAG) {
   15135   SDValue Cond = Op.getOperand(0);
   15136   SDValue LHS = Op.getOperand(1);
   15137   SDValue RHS = Op.getOperand(2);
   15138   SDLoc dl(Op);
   15139   MVT VT = Op.getSimpleValueType();
   15140 
   15141   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
   15142     return SDValue();
   15143   auto *CondBV = cast<BuildVectorSDNode>(Cond);
   15144 
    15145   // Only non-legal VSELECTs reach this lowering; convert those into generic
   15146   // shuffles and re-use the shuffle lowering path for blends.
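            // For example (illustrative): a constant condition <1,0,1,0> on v4i32
            // operands produces the shuffle mask {0,5,2,7} below.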
   15147   SmallVector<int, 32> Mask;
   15148   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
   15149     SDValue CondElt = CondBV->getOperand(i);
   15150     int M = i;
   15151     // We can't map undef to undef here. They have different meanings. Treat
    15152     // it the same as zero.
   15153     if (CondElt.isUndef() || isNullConstant(CondElt))
   15154       M += Size;
   15155     Mask.push_back(M);
   15156   }
   15157   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
   15158 }
   15159 
   15160 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   15161   // A vselect where all conditions and data are constants can be optimized into
   15162   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
   15163   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
   15164       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
   15165       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
   15166     return SDValue();
   15167 
   15168   // Try to lower this to a blend-style vector shuffle. This can handle all
   15169   // constant condition cases.
   15170   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
   15171     return BlendOp;
   15172 
    15173   // If this VSELECT has a vector of i1 as a mask, it will be directly matched
   15174   // with patterns on the mask registers on AVX-512.
   15175   if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
   15176     return Op;
   15177 
   15178   // Variable blends are only legal from SSE4.1 onward.
   15179   if (!Subtarget.hasSSE41())
   15180     return SDValue();
   15181 
   15182   SDLoc dl(Op);
   15183   MVT VT = Op.getSimpleValueType();
   15184 
   15185   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
   15186   // into an i1 condition so that we can use the mask-based 512-bit blend
   15187   // instructions.
   15188   if (VT.getSizeInBits() == 512) {
   15189     SDValue Cond = Op.getOperand(0);
   15190     // The vNi1 condition case should be handled above as it can be trivially
   15191     // lowered.
   15192     assert(Cond.getValueType().getScalarSizeInBits() ==
   15193                VT.getScalarSizeInBits() &&
   15194            "Should have a size-matched integer condition!");
   15195     // Build a mask by testing the condition against zero.
   15196     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   15197     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
   15198                                 getZeroVector(VT, Subtarget, DAG, dl),
   15199                                 ISD::SETNE);
   15200     // Now return a new VSELECT using the mask.
   15201     return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
   15202   }
   15203 
   15204   // Only some types will be legal on some subtargets. If we can emit a legal
    15205   // VSELECT-matching blend, return Op; but if we need to expand, return
   15206   // a null value.
   15207   switch (VT.SimpleTy) {
   15208   default:
   15209     // Most of the vector types have blends past SSE4.1.
   15210     return Op;
   15211 
   15212   case MVT::v32i8:
   15213     // The byte blends for AVX vectors were introduced only in AVX2.
   15214     if (Subtarget.hasAVX2())
   15215       return Op;
   15216 
   15217     return SDValue();
   15218 
   15219   case MVT::v8i16:
   15220   case MVT::v16i16: {
   15221     // Bitcast everything to the vXi8 type and use a vXi8 vselect.
   15222     MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
   15223     SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
   15224     SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
   15225     SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
   15226     SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
   15227     return DAG.getBitcast(VT, Select);
   15228   }
   15229   }
   15230 }
   15231 
   15232 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   15233   MVT VT = Op.getSimpleValueType();
   15234   SDLoc dl(Op);
   15235 
   15236   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
   15237     return SDValue();
   15238 
   15239   if (VT.getSizeInBits() == 8) {
   15240     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
   15241                                   Op.getOperand(0), Op.getOperand(1));
   15242     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
   15243   }
   15244 
   15245   if (VT == MVT::f32) {
   15246     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
   15247     // the result back to FR32 register. It's only worth matching if the
   15248     // result has a single use which is a store or a bitcast to i32.  And in
   15249     // the case of a store, it's not worth it if the index is a constant 0,
   15250     // because a MOVSSmr can be used instead, which is smaller and faster.
   15251     if (!Op.hasOneUse())
   15252       return SDValue();
   15253     SDNode *User = *Op.getNode()->use_begin();
   15254     if ((User->getOpcode() != ISD::STORE ||
   15255          isNullConstant(Op.getOperand(1))) &&
   15256         (User->getOpcode() != ISD::BITCAST ||
   15257          User->getValueType(0) != MVT::i32))
   15258       return SDValue();
   15259     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   15260                                   DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
   15261                                   Op.getOperand(1));
   15262     return DAG.getBitcast(MVT::f32, Extract);
   15263   }
   15264 
   15265   if (VT == MVT::i32 || VT == MVT::i64) {
   15266     // ExtractPS/pextrq works with constant index.
   15267     if (isa<ConstantSDNode>(Op.getOperand(1)))
   15268       return Op;
   15269   }
   15270 
   15271   return SDValue();
   15272 }
   15273 
   15274 /// Extract one bit from mask vector, like v16i1 or v8i1.
   15275 /// AVX-512 feature.
   15276 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
   15277                                         const X86Subtarget &Subtarget) {
   15278   SDValue Vec = Op.getOperand(0);
   15279   SDLoc dl(Vec);
   15280   MVT VecVT = Vec.getSimpleValueType();
   15281   SDValue Idx = Op.getOperand(1);
   15282   MVT EltVT = Op.getSimpleValueType();
   15283 
   15284   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
   15285          "Unexpected vector type in ExtractBitFromMaskVector");
   15286 
    15287   // A variable index can't be handled in mask registers,
    15288   // so extend the vector to VR512/VR128.
   15289   if (!isa<ConstantSDNode>(Idx)) {
   15290     unsigned NumElts = VecVT.getVectorNumElements();
    15291     // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
    15292     // than extending to 128/256-bit.
   15293     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
   15294     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
   15295     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
   15296     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
   15297     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
   15298   }
   15299 
   15300   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   15301 
   15302   // If the kshift instructions of the correct width aren't natively supported
   15303   // then we need to promote the vector to the native size to get the correct
   15304   // zeroing behavior.
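            // For example (illustrative): extracting element 5 of a v8i1 promotes to
            // v16i1, shifts right by 5 with KSHIFTR, and then reads the result from
            // bit 0 of the bitcast below.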
   15305   if (VecVT.getVectorNumElements() < 16) {
   15306     VecVT = MVT::v16i1;
   15307     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
   15308                       DAG.getUNDEF(VecVT), Vec,
   15309                       DAG.getIntPtrConstant(0, dl));
   15310   }
   15311 
   15312   // Extracts from element 0 are always allowed.
   15313   if (IdxVal != 0) {
   15314     // Use kshiftr instruction to move to the lower element.
   15315     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
   15316                       DAG.getConstant(IdxVal, dl, MVT::i8));
   15317   }
   15318 
   15319   // Shrink to v16i1 since that's always legal.
   15320   if (VecVT.getVectorNumElements() > 16) {
   15321     VecVT = MVT::v16i1;
   15322     Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec,
   15323                       DAG.getIntPtrConstant(0, dl));
   15324   }
   15325 
   15326   // Convert to a bitcast+aext/trunc.
   15327   MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements());
   15328   return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT);
   15329 }
   15330 
   15331 SDValue
   15332 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   15333                                            SelectionDAG &DAG) const {
   15334   SDLoc dl(Op);
   15335   SDValue Vec = Op.getOperand(0);
   15336   MVT VecVT = Vec.getSimpleValueType();
   15337   SDValue Idx = Op.getOperand(1);
   15338 
   15339   if (VecVT.getVectorElementType() == MVT::i1)
   15340     return ExtractBitFromMaskVector(Op, DAG, Subtarget);
   15341 
   15342   if (!isa<ConstantSDNode>(Idx)) {
    15343     // It's more profitable to go through memory (1 cycle throughput)
    15344     // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    15345     // The IACA tool was used to get the performance estimate
   15346     // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
   15347     //
   15348     // example : extractelement <16 x i8> %a, i32 %i
   15349     //
   15350     // Block Throughput: 3.00 Cycles
   15351     // Throughput Bottleneck: Port5
   15352     //
   15353     // | Num Of |   Ports pressure in cycles  |    |
   15354     // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
   15355     // ---------------------------------------------
   15356     // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
   15357     // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
   15358     // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
   15359     // Total Num Of Uops: 4
   15360     //
   15361     //
   15362     // Block Throughput: 1.00 Cycles
   15363     // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
   15364     //
   15365     // |    |  Ports pressure in cycles   |  |
   15366     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
   15367     // ---------------------------------------------------------
   15368     // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
   15369     // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
   15370     // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
   15371     // Total Num Of Uops: 4
   15372 
   15373     return SDValue();
   15374   }
   15375 
   15376   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   15377 
   15378   // If this is a 256-bit vector result, first extract the 128-bit vector and
   15379   // then extract the element from the 128-bit vector.
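            // For example (illustrative): extracting element 9 of a v16i16 extracts the
            // upper 128-bit half and then extracts element 9 & 7 == 1 from it.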
   15380   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
   15381     // Get the 128-bit vector.
   15382     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
   15383     MVT EltVT = VecVT.getVectorElementType();
   15384 
   15385     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
   15386     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
   15387 
   15388     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
   15389     // this can be done with a mask.
   15390     IdxVal &= ElemsPerChunk - 1;
   15391     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
   15392                        DAG.getConstant(IdxVal, dl, MVT::i32));
   15393   }
   15394 
   15395   assert(VecVT.is128BitVector() && "Unexpected vector length");
   15396 
   15397   MVT VT = Op.getSimpleValueType();
   15398 
   15399   if (VT.getSizeInBits() == 16) {
   15400     // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
   15401     // we're going to zero extend the register or fold the store (SSE41 only).
   15402     if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
   15403         !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
   15404       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
   15405                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   15406                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
   15407 
    15408     // Transform it so it matches pextrw, which produces a 32-bit result.
   15409     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
   15410                                   Op.getOperand(0), Op.getOperand(1));
   15411     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
   15412   }
   15413 
   15414   if (Subtarget.hasSSE41())
   15415     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
   15416       return Res;
   15417 
    15418   // TODO: We only extract a single element from v16i8; we can probably afford
    15419   // to be more aggressive here before using the default approach of spilling
    15420   // to the stack.
   15421   if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
   15422     // Extract either the lowest i32 or any i16, and extract the sub-byte.
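              // For example (illustrative): extracting byte 5 of a v16i8 extracts word 2
              // of the v8i16 bitcast and shifts it right by 8 bits.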
   15423     int DWordIdx = IdxVal / 4;
   15424     if (DWordIdx == 0) {
   15425       SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
   15426                                 DAG.getBitcast(MVT::v4i32, Vec),
   15427                                 DAG.getIntPtrConstant(DWordIdx, dl));
   15428       int ShiftVal = (IdxVal % 4) * 8;
   15429       if (ShiftVal != 0)
   15430         Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
   15431                           DAG.getConstant(ShiftVal, dl, MVT::i8));
   15432       return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
   15433     }
   15434 
   15435     int WordIdx = IdxVal / 2;
   15436     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
   15437                               DAG.getBitcast(MVT::v8i16, Vec),
   15438                               DAG.getIntPtrConstant(WordIdx, dl));
   15439     int ShiftVal = (IdxVal % 2) * 8;
   15440     if (ShiftVal != 0)
   15441       Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
   15442                         DAG.getConstant(ShiftVal, dl, MVT::i8));
   15443     return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
   15444   }
   15445 
   15446   if (VT.getSizeInBits() == 32) {
   15447     if (IdxVal == 0)
   15448       return Op;
   15449 
   15450     // SHUFPS the element to the lowest double word, then movss.
   15451     int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
   15452     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
   15453     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   15454                        DAG.getIntPtrConstant(0, dl));
   15455   }
   15456 
   15457   if (VT.getSizeInBits() == 64) {
   15458     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
   15459     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
   15460     //        to match extract_elt for f64.
   15461     if (IdxVal == 0)
   15462       return Op;
   15463 
   15464     // UNPCKHPD the element to the lowest double word, then movsd.
   15465     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
   15466     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
   15467     int Mask[2] = { 1, -1 };
   15468     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
   15469     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
   15470                        DAG.getIntPtrConstant(0, dl));
   15471   }
   15472 
   15473   return SDValue();
   15474 }
   15475 
   15476 /// Insert one bit to mask vector, like v16i1 or v8i1.
   15477 /// AVX-512 feature.
   15478 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
   15479                                      const X86Subtarget &Subtarget) {
   15480   SDLoc dl(Op);
   15481   SDValue Vec = Op.getOperand(0);
   15482   SDValue Elt = Op.getOperand(1);
   15483   SDValue Idx = Op.getOperand(2);
   15484   MVT VecVT = Vec.getSimpleValueType();
   15485 
   15486   if (!isa<ConstantSDNode>(Idx)) {
    15487     // Non-constant index. Extend the source and destination,
    15488     // insert the element, and then truncate the result.
   15489     unsigned NumElts = VecVT.getVectorNumElements();
   15490     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
   15491     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
   15492     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
   15493       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
   15494       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
   15495     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
   15496   }
   15497 
   15498   // Copy into a k-register, extract to v1i1 and insert_subvector.
   15499   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
   15500 
   15501   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
   15502                      Op.getOperand(2));
   15503 }
   15504 
   15505 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   15506                                                   SelectionDAG &DAG) const {
   15507   MVT VT = Op.getSimpleValueType();
   15508   MVT EltVT = VT.getVectorElementType();
   15509   unsigned NumElts = VT.getVectorNumElements();
   15510 
   15511   if (EltVT == MVT::i1)
   15512     return InsertBitToMaskVector(Op, DAG, Subtarget);
   15513 
   15514   SDLoc dl(Op);
   15515   SDValue N0 = Op.getOperand(0);
   15516   SDValue N1 = Op.getOperand(1);
   15517   SDValue N2 = Op.getOperand(2);
   15518   if (!isa<ConstantSDNode>(N2))
   15519     return SDValue();
   15520   auto *N2C = cast<ConstantSDNode>(N2);
   15521   unsigned IdxVal = N2C->getZExtValue();
   15522 
   15523   bool IsZeroElt = X86::isZeroNode(N1);
   15524   bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
   15525 
    15526   // If we are inserting an element, see if we can do this more efficiently
    15527   // with a blend shuffle with a rematerializable vector than with a costly
    15528   // integer insertion.
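            // For example (illustrative): inserting zero into element 2 of a v4i32
            // becomes a shuffle of the input against a zero vector with mask {0,1,6,3}.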
   15529   if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
   15530       16 <= EltVT.getSizeInBits()) {
   15531     SmallVector<int, 8> BlendMask;
   15532     for (unsigned i = 0; i != NumElts; ++i)
   15533       BlendMask.push_back(i == IdxVal ? i + NumElts : i);
   15534     SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
   15535                                   : getOnesVector(VT, DAG, dl);
   15536     return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
   15537   }
   15538 
   15539   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
   15540   // into that, and then insert the subvector back into the result.
   15541   if (VT.is256BitVector() || VT.is512BitVector()) {
   15542     // With a 256-bit vector, we can insert into the zero element efficiently
   15543     // using a blend if we have AVX or AVX2 and the right data type.
   15544     if (VT.is256BitVector() && IdxVal == 0) {
   15545       // TODO: It is worthwhile to cast integer to floating point and back
   15546       // and incur a domain crossing penalty if that's what we'll end up
   15547       // doing anyway after extracting to a 128-bit vector.
   15548       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
   15549           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
   15550         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
   15551         N2 = DAG.getIntPtrConstant(1, dl);
   15552         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
   15553       }
   15554     }
   15555 
   15556     // Get the desired 128-bit vector chunk.
   15557     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
   15558 
   15559     // Insert the element into the desired chunk.
   15560     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
   15561     assert(isPowerOf2_32(NumEltsIn128));
    15562     // Since NumEltsIn128 is a power of 2 we can use a mask instead of a modulo.
   15563     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
   15564 
   15565     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
   15566                     DAG.getConstant(IdxIn128, dl, MVT::i32));
   15567 
   15568     // Insert the changed part back into the bigger vector
   15569     return insert128BitVector(N0, V, IdxVal, DAG, dl);
   15570   }
   15571   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
   15572 
    15573   // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
   15574   // argument. SSE41 required for pinsrb.
   15575   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
   15576     unsigned Opc;
   15577     if (VT == MVT::v8i16) {
   15578       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
   15579       Opc = X86ISD::PINSRW;
   15580     } else {
   15581       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
   15582       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
   15583       Opc = X86ISD::PINSRB;
   15584     }
   15585 
   15586     if (N1.getValueType() != MVT::i32)
   15587       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
   15588     if (N2.getValueType() != MVT::i32)
   15589       N2 = DAG.getIntPtrConstant(IdxVal, dl);
   15590     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
   15591   }
   15592 
   15593   if (Subtarget.hasSSE41()) {
   15594     if (EltVT == MVT::f32) {
   15595       // Bits [7:6] of the constant are the source select. This will always be
   15596       //   zero here. The DAG Combiner may combine an extract_elt index into
   15597       //   these bits. For example (insert (extract, 3), 2) could be matched by
   15598       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
   15599       // Bits [5:4] of the constant are the destination select. This is the
   15600       //   value of the incoming immediate.
   15601       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
   15602       //   combine either bitwise AND or insert of float 0.0 to set these bits.
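                // For example (illustrative): inserting into lane 2 uses the immediate
                // IdxVal << 4 == 0x20, i.e. destination select bits [5:4] == 2.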
   15603 
   15604       bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
   15605       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
   15606         // If this is an insertion of 32-bits into the low 32-bits of
   15607         // a vector, we prefer to generate a blend with immediate rather
   15608         // than an insertps. Blends are simpler operations in hardware and so
   15609         // will always have equal or better performance than insertps.
   15610         // But if optimizing for size and there's a load folding opportunity,
   15611         // generate insertps because blendps does not have a 32-bit memory
   15612         // operand form.
   15613         N2 = DAG.getIntPtrConstant(1, dl);
   15614         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   15615         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
   15616       }
   15617       N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
    15618       // Create this as a scalar-to-vector node.
   15619       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
   15620       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
   15621     }
   15622 
   15623     // PINSR* works with constant index.
   15624     if (EltVT == MVT::i32 || EltVT == MVT::i64)
   15625       return Op;
   15626   }
   15627 
   15628   return SDValue();
   15629 }
   15630 
   15631 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
   15632                                      SelectionDAG &DAG) {
   15633   SDLoc dl(Op);
   15634   MVT OpVT = Op.getSimpleValueType();
   15635 
    15636   // It's always cheaper to replace an xor+movd with xorps, and it simplifies
    15637   // further combines.
   15638   if (X86::isZeroNode(Op.getOperand(0)))
   15639     return getZeroVector(OpVT, Subtarget, DAG, dl);
   15640 
    15641   // If this is a 256-bit or 512-bit vector result, first insert into a 128-bit
    15642   // vector and then insert into the wider vector.
   15643   if (!OpVT.is128BitVector()) {
   15644     // Insert into a 128-bit vector.
   15645     unsigned SizeFactor = OpVT.getSizeInBits() / 128;
   15646     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
   15647                                  OpVT.getVectorNumElements() / SizeFactor);
   15648 
   15649     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
   15650 
   15651     // Insert the 128-bit vector.
   15652     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
   15653   }
   15654   assert(OpVT.is128BitVector() && "Expected an SSE type!");
   15655 
   15656   // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
   15657   if (OpVT == MVT::v4i32)
   15658     return Op;
   15659 
   15660   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
   15661   return DAG.getBitcast(
   15662       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
   15663 }
   15664 
   15665 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
   15666 // simple superregister reference or explicit instructions to insert
   15667 // the upper bits of a vector.
   15668 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
   15669                                      SelectionDAG &DAG) {
   15670   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
   15671 
   15672   return insert1BitVector(Op, DAG, Subtarget);
   15673 }
   15674 
   15675 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
   15676                                       SelectionDAG &DAG) {
   15677   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
   15678          "Only vXi1 extract_subvectors need custom lowering");
   15679 
   15680   SDLoc dl(Op);
   15681   SDValue Vec = Op.getOperand(0);
   15682   SDValue Idx = Op.getOperand(1);
   15683 
   15684   if (!isa<ConstantSDNode>(Idx))
   15685     return SDValue();
   15686 
   15687   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   15688   if (IdxVal == 0) // the operation is legal
   15689     return Op;
   15690 
   15691   MVT VecVT = Vec.getSimpleValueType();
   15692   unsigned NumElems = VecVT.getVectorNumElements();
   15693 
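            // For example (illustrative): extracting a v2i1 at index 4 from a v8i1
            // without DQI widens to v16i1, shifts right by 4 with KSHIFTR, and then
            // extracts the low v2i1.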
   15694   // Extend to natively supported kshift.
   15695   MVT WideVecVT = VecVT;
   15696   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
   15697     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
   15698     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
   15699                       DAG.getUNDEF(WideVecVT), Vec,
   15700                       DAG.getIntPtrConstant(0, dl));
   15701   }
   15702 
   15703   // Shift to the LSB.
   15704   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
   15705                     DAG.getConstant(IdxVal, dl, MVT::i8));
   15706 
   15707   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
   15708                      DAG.getIntPtrConstant(0, dl));
   15709 }
   15710 
   15711 // Returns the appropriate wrapper opcode for a global reference.
   15712 unsigned X86TargetLowering::getGlobalWrapperKind(
   15713     const GlobalValue *GV, const unsigned char OpFlags) const {
   15714   // References to absolute symbols are never PC-relative.
   15715   if (GV && GV->isAbsoluteSymbolRef())
   15716     return X86ISD::Wrapper;
   15717 
   15718   CodeModel::Model M = getTargetMachine().getCodeModel();
   15719   if (Subtarget.isPICStyleRIPRel() &&
   15720       (M == CodeModel::Small || M == CodeModel::Kernel))
   15721     return X86ISD::WrapperRIP;
   15722 
   15723   // GOTPCREL references must always use RIP.
   15724   if (OpFlags == X86II::MO_GOTPCREL)
   15725     return X86ISD::WrapperRIP;
   15726 
   15727   return X86ISD::Wrapper;
   15728 }
   15729 
   15730 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
   15731 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
    15732 // one of the above-mentioned nodes. It has to be wrapped because otherwise
    15733 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
    15734 // be used to form an addressing mode. These wrapped nodes will be selected
   15735 // into MOV32ri.
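          // For example (illustrative): in 32-bit PIC mode the routines below compute a
          // constant-pool or jump-table address as GlobalBaseReg + Wrapper(TargetNode),
          // emitted as an ISD::ADD.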
   15736 SDValue
   15737 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   15738   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   15739 
   15740   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   15741   // global base reg.
   15742   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
   15743 
   15744   auto PtrVT = getPointerTy(DAG.getDataLayout());
   15745   SDValue Result = DAG.getTargetConstantPool(
   15746       CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
   15747   SDLoc DL(CP);
   15748   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
   15749   // With PIC, the address is actually $g + Offset.
   15750   if (OpFlag) {
   15751     Result =
   15752         DAG.getNode(ISD::ADD, DL, PtrVT,
   15753                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
   15754   }
   15755 
   15756   return Result;
   15757 }
   15758 
   15759 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   15760   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   15761 
   15762   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   15763   // global base reg.
   15764   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
   15765 
   15766   auto PtrVT = getPointerTy(DAG.getDataLayout());
   15767   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
   15768   SDLoc DL(JT);
   15769   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
   15770 
   15771   // With PIC, the address is actually $g + Offset.
   15772   if (OpFlag)
   15773     Result =
   15774         DAG.getNode(ISD::ADD, DL, PtrVT,
   15775                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
   15776 
   15777   return Result;
   15778 }
   15779 
   15780 SDValue
   15781 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   15782   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
   15783 
   15784   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   15785   // global base reg.
   15786   const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
   15787   unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
   15788 
   15789   auto PtrVT = getPointerTy(DAG.getDataLayout());
   15790   SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
   15791 
   15792   SDLoc DL(Op);
   15793   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
   15794 
   15795   // With PIC, the address is actually $g + Offset.
   15796   if (isPositionIndependent() && !Subtarget.is64Bit()) {
   15797     Result =
   15798         DAG.getNode(ISD::ADD, DL, PtrVT,
   15799                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
   15800   }
   15801 
   15802   // For symbols that require a load from a stub to get the address, emit the
   15803   // load.
   15804   if (isGlobalStubReference(OpFlag))
   15805     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
   15806                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
   15807 
   15808   return Result;
   15809 }
   15810 
   15811 SDValue
   15812 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
    15813   // Create the TargetBlockAddress node.
   15814   unsigned char OpFlags =
   15815     Subtarget.classifyBlockAddressReference();
   15816   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   15817   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   15818   SDLoc dl(Op);
   15819   auto PtrVT = getPointerTy(DAG.getDataLayout());
   15820   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
   15821   Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
   15822 
   15823   // With PIC, the address is actually $g + Offset.
   15824   if (isGlobalRelativeToPICBase(OpFlags)) {
   15825     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
   15826                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
   15827   }
   15828 
   15829   return Result;
   15830 }
   15831 
   15832 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
   15833                                               const SDLoc &dl, int64_t Offset,
   15834                                               SelectionDAG &DAG) const {
   15835   // Create the TargetGlobalAddress node, folding in the constant
   15836   // offset if it is legal.
   15837   unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
   15838   CodeModel::Model M = DAG.getTarget().getCodeModel();
   15839   auto PtrVT = getPointerTy(DAG.getDataLayout());
   15840   SDValue Result;
   15841   if (OpFlags == X86II::MO_NO_FLAG &&
   15842       X86::isOffsetSuitableForCodeModel(Offset, M)) {
   15843     // A direct static reference to a global.
   15844     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
   15845     Offset = 0;
   15846   } else {
   15847     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
   15848   }
   15849 
   15850   Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
   15851 
   15852   // With PIC, the address is actually $g + Offset.
   15853   if (isGlobalRelativeToPICBase(OpFlags)) {
   15854     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
   15855                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
   15856   }
   15857 
   15858   // For globals that require a load from a stub to get the address, emit the
   15859   // load.
   15860   if (isGlobalStubReference(OpFlags))
   15861     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
   15862                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
   15863 
   15864   // If there was a non-zero offset that we didn't fold, create an explicit
   15865   // addition for it.
   15866   if (Offset != 0)
   15867     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
   15868                          DAG.getConstant(Offset, dl, PtrVT));
   15869 
   15870   return Result;
   15871 }
   15872 
   15873 SDValue
   15874 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   15875   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   15876   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
   15877   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
   15878 }
   15879 
   15880 static SDValue
   15881 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   15882            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
   15883            unsigned char OperandFlags, bool LocalDynamic = false) {
   15884   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   15885   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   15886   SDLoc dl(GA);
   15887   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   15888                                            GA->getValueType(0),
   15889                                            GA->getOffset(),
   15890                                            OperandFlags);
   15891 
   15892   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
   15893                                            : X86ISD::TLSADDR;
   15894 
   15895   if (InFlag) {
   15896     SDValue Ops[] = { Chain,  TGA, *InFlag };
   15897     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   15898   } else {
   15899     SDValue Ops[]  = { Chain, TGA };
   15900     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   15901   }
   15902 
    15903   // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
   15904   MFI.setAdjustsStack(true);
   15905   MFI.setHasCalls(true);
   15906 
   15907   SDValue Flag = Chain.getValue(1);
   15908   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
   15909 }
   15910 
   15911 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
   15912 static SDValue
   15913 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   15914                                 const EVT PtrVT) {
   15915   SDValue InFlag;
   15916   SDLoc dl(GA);  // ? function entry point might be better
   15917   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   15918                                    DAG.getNode(X86ISD::GlobalBaseReg,
   15919                                                SDLoc(), PtrVT), InFlag);
   15920   InFlag = Chain.getValue(1);
   15921 
   15922   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
   15923 }
   15924 
   15925 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
   15926 static SDValue
   15927 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   15928                                 const EVT PtrVT) {
   15929   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
   15930                     X86::RAX, X86II::MO_TLSGD);
   15931 }
   15932 
   15933 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
   15934                                            SelectionDAG &DAG,
   15935                                            const EVT PtrVT,
   15936                                            bool is64Bit) {
   15937   SDLoc dl(GA);
   15938 
   15939   // Get the start address of the TLS block for this module.
   15940   X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
   15941       .getInfo<X86MachineFunctionInfo>();
   15942   MFI->incNumLocalDynamicTLSAccesses();
   15943 
   15944   SDValue Base;
   15945   if (is64Bit) {
   15946     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
   15947                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
   15948   } else {
   15949     SDValue InFlag;
   15950     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
   15951         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
   15952     InFlag = Chain.getValue(1);
   15953     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
   15954                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
   15955   }
   15956 
   15957   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
   15958   // of Base.
   15959 
   15960   // Build x@dtpoff.
   15961   unsigned char OperandFlags = X86II::MO_DTPOFF;
   15962   unsigned WrapperKind = X86ISD::Wrapper;
   15963   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   15964                                            GA->getValueType(0),
   15965                                            GA->getOffset(), OperandFlags);
   15966   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   15967 
   15968   // Add x@dtpoff with the base.
   15969   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
   15970 }
   15971 
   15972 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
   15973 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   15974                                    const EVT PtrVT, TLSModel::Model model,
   15975                                    bool is64Bit, bool isPIC) {
   15976   SDLoc dl(GA);
   15977 
   15978   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
   15979   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
   15980                                                          is64Bit ? 257 : 256));
   15981 
   15982   SDValue ThreadPointer =
   15983       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
   15984                   MachinePointerInfo(Ptr));
   15985 
   15986   unsigned char OperandFlags = 0;
   15987   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
   15988   // initialexec.
   15989   unsigned WrapperKind = X86ISD::Wrapper;
   15990   if (model == TLSModel::LocalExec) {
   15991     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
   15992   } else if (model == TLSModel::InitialExec) {
   15993     if (is64Bit) {
   15994       OperandFlags = X86II::MO_GOTTPOFF;
   15995       WrapperKind = X86ISD::WrapperRIP;
   15996     } else {
   15997       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
   15998     }
   15999   } else {
   16000     llvm_unreachable("Unexpected model");
   16001   }
   16002 
   16003   // emit "addl x@ntpoff,%eax" (local exec)
   16004   // or "addl x@indntpoff,%eax" (initial exec)
   16005   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
   16006   SDValue TGA =
   16007       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
   16008                                  GA->getOffset(), OperandFlags);
   16009   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
   16010 
   16011   if (model == TLSModel::InitialExec) {
   16012     if (isPIC && !is64Bit) {
   16013       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
   16014                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
   16015                            Offset);
   16016     }
   16017 
   16018     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
   16019                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
   16020   }
   16021 
    16022   // The address of the thread local variable is the sum of the thread
    16023   // pointer and the offset of the variable.
   16024   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
   16025 }
   16026 
   16027 SDValue
   16028 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   16029 
   16030   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   16031 
   16032   if (DAG.getTarget().useEmulatedTLS())
   16033     return LowerToTLSEmulatedModel(GA, DAG);
   16034 
   16035   const GlobalValue *GV = GA->getGlobal();
   16036   auto PtrVT = getPointerTy(DAG.getDataLayout());
   16037   bool PositionIndependent = isPositionIndependent();
   16038 
   16039   if (Subtarget.isTargetELF()) {
   16040     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
   16041     switch (model) {
   16042       case TLSModel::GeneralDynamic:
   16043         if (Subtarget.is64Bit())
   16044           return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
   16045         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
   16046       case TLSModel::LocalDynamic:
   16047         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
   16048                                            Subtarget.is64Bit());
   16049       case TLSModel::InitialExec:
   16050       case TLSModel::LocalExec:
   16051         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
   16052                                    PositionIndependent);
   16053     }
   16054     llvm_unreachable("Unknown TLS model.");
   16055   }
   16056 
   16057   if (Subtarget.isTargetDarwin()) {
   16058     // Darwin only has one model of TLS.  Lower to that.
   16059     unsigned char OpFlag = 0;
   16060     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
   16061                            X86ISD::WrapperRIP : X86ISD::Wrapper;
   16062 
   16063     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
   16064     // global base reg.
   16065     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
   16066     if (PIC32)
   16067       OpFlag = X86II::MO_TLVP_PIC_BASE;
   16068     else
   16069       OpFlag = X86II::MO_TLVP;
   16070     SDLoc DL(Op);
   16071     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
   16072                                                 GA->getValueType(0),
   16073                                                 GA->getOffset(), OpFlag);
   16074     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
   16075 
   16076     // With PIC32, the address is actually $g + Offset.
   16077     if (PIC32)
   16078       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
   16079                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
   16080                            Offset);
   16081 
    16082     // Lowering the machine ISD node will make sure everything is in the right
   16083     // location.
   16084     SDValue Chain = DAG.getEntryNode();
   16085     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   16086     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
   16087     SDValue Args[] = { Chain, Offset };
   16088     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
   16089     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
   16090                                DAG.getIntPtrConstant(0, DL, true),
   16091                                Chain.getValue(1), DL);
   16092 
   16093     // TLSCALL is codegen'ed as a call. Inform MFI that the function has calls.
   16094     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   16095     MFI.setAdjustsStack(true);
   16096 
   16097     // And our return value (tls address) is in the standard call return value
   16098     // location.
   16099     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
   16100     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
   16101   }
   16102 
   16103   if (Subtarget.isTargetKnownWindowsMSVC() ||
   16104       Subtarget.isTargetWindowsItanium() ||
   16105       Subtarget.isTargetWindowsGNU()) {
   16106     // Just use the implicit TLS architecture
   16107     // Need to generate something similar to:
   16108     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
   16109     //                                  ; from TEB
   16110     //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
   16111     //   mov     rcx, qword [rdx+rcx*8]
   16112     //   mov     eax, .tls$:tlsvar
   16113     //   [rax+rcx] contains the address
   16114     // Windows 64bit: gs:0x58
   16115     // Windows 32bit: fs:__tls_array
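             // In effect, the final address computed below is roughly
             //   TEB->ThreadLocalStoragePointer[_tls_index] + SECREL32(tlsvar)
             // with the index load skipped (slot 0 assumed) for local-exec.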
   16116 
   16117     SDLoc dl(GA);
   16118     SDValue Chain = DAG.getEntryNode();
   16119 
   16120     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
   16121     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
   16122     // use its literal value of 0x2C.
   16123     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
   16124                                         ? Type::getInt8PtrTy(*DAG.getContext(),
   16125                                                              256)
   16126                                         : Type::getInt32PtrTy(*DAG.getContext(),
   16127                                                               257));
   16128 
   16129     SDValue TlsArray = Subtarget.is64Bit()
   16130                            ? DAG.getIntPtrConstant(0x58, dl)
   16131                            : (Subtarget.isTargetWindowsGNU()
   16132                                   ? DAG.getIntPtrConstant(0x2C, dl)
   16133                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
   16134 
   16135     SDValue ThreadPointer =
   16136         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
   16137 
   16138     SDValue res;
   16139     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
   16140       res = ThreadPointer;
   16141     } else {
   16142       // Load the _tls_index variable
   16143       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
   16144       if (Subtarget.is64Bit())
   16145         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
   16146                              MachinePointerInfo(), MVT::i32);
   16147       else
   16148         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
   16149 
   16150       auto &DL = DAG.getDataLayout();
   16151       SDValue Scale =
   16152           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
   16153       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
   16154 
   16155       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
   16156     }
   16157 
   16158     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
   16159 
   16160     // Get the offset of the start of the .tls section.
   16161     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
   16162                                              GA->getValueType(0),
   16163                                              GA->getOffset(), X86II::MO_SECREL);
   16164     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
   16165 
   16166     // The address of the thread local variable is the add of the thread
   16167     // pointer with the offset of the variable.
   16168     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
   16169   }
   16170 
   16171   llvm_unreachable("TLS not implemented for this target.");
   16172 }
   16173 
   16174 /// Lower SRA_PARTS and friends, which return two i32 values
   16175 /// and take a 2 x i32 value to shift plus a shift amount.
   16176 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   16177   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   16178   MVT VT = Op.getSimpleValueType();
   16179   unsigned VTBits = VT.getSizeInBits();
   16180   SDLoc dl(Op);
   16181   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   16182   SDValue ShOpLo = Op.getOperand(0);
   16183   SDValue ShOpHi = Op.getOperand(1);
   16184   SDValue ShAmt  = Op.getOperand(2);
   16185   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
   16186   // generic ISD nodes don't. Insert an AND to be safe; it's optimized away
   16187   // during isel.
   16188   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   16189                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
   16190   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
   16191                                      DAG.getConstant(VTBits - 1, dl, MVT::i8))
   16192                        : DAG.getConstant(0, dl, VT);
   16193 
   16194   SDValue Tmp2, Tmp3;
   16195   if (Op.getOpcode() == ISD::SHL_PARTS) {
   16196     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
   16197     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
   16198   } else {
   16199     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
   16200     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
   16201   }
   16202 
   16203   // If the shift amount is larger than or equal to the width of a part, we
   16204   // can't rely on the results of shld/shrd. Insert a test and select the
   16205   // appropriate values for large shift amounts.
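           // For example, for i64 SHL_PARTS on a 32-bit target with a shift amount
           // of 40, the SHLD result is not usable; the selects below instead
           // produce Hi = ShOpLo << (40 & 31) and Lo = 0.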
   16206   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
   16207                                 DAG.getConstant(VTBits, dl, MVT::i8));
   16208   SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
   16209                              DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
   16210 
   16211   SDValue Hi, Lo;
   16212   if (Op.getOpcode() == ISD::SHL_PARTS) {
   16213     Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
   16214     Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
   16215   } else {
   16216     Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
   16217     Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
   16218   }
   16219 
   16220   return DAG.getMergeValues({ Lo, Hi }, dl);
   16221 }
   16222 
   16223 // Try to use a packed vector operation to handle i64 on 32-bit targets when
   16224 // AVX512DQ is enabled.
   16225 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
   16226                                         const X86Subtarget &Subtarget) {
   16227   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
   16228           Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
   16229   SDValue Src = Op.getOperand(0);
   16230   MVT SrcVT = Src.getSimpleValueType();
   16231   MVT VT = Op.getSimpleValueType();
   16232 
   16233   if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
   16234       (VT != MVT::f32 && VT != MVT::f64))
   16235     return SDValue();
   16236 
   16237   // Pack the i64 into a vector, do the operation and extract.
   16238 
   16239   // Use at least a 256-bit source so the f32 result is a full 128-bit vector.
   16240   unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
   16241   MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
   16242   MVT VecVT = MVT::getVectorVT(VT, NumElts);
   16243 
   16244   SDLoc dl(Op);
   16245   SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
   16246   SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
   16247   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
   16248                      DAG.getIntPtrConstant(0, dl));
   16249 }
   16250 
   16251 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   16252                                            SelectionDAG &DAG) const {
   16253   SDValue Src = Op.getOperand(0);
   16254   MVT SrcVT = Src.getSimpleValueType();
   16255   MVT VT = Op.getSimpleValueType();
   16256   SDLoc dl(Op);
   16257 
   16258   if (SrcVT.isVector()) {
   16259     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
   16260       return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
   16261                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
   16262                                      DAG.getUNDEF(SrcVT)));
   16263     }
   16264     return SDValue();
   16265   }
   16266 
   16267   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
   16268          "Unknown SINT_TO_FP to lower!");
   16269 
   16270   // These are really Legal; return the operand so the caller accepts it as
   16271   // Legal.
   16272   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
   16273     return Op;
   16274   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) {
   16275     return Op;
   16276   }
   16277 
   16278   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
   16279     return V;
   16280 
   16281   SDValue ValueToStore = Op.getOperand(0);
   16282   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
   16283       !Subtarget.is64Bit())
   16284     // Bitcasting to f64 here allows us to do a single 64-bit store from
   16285     // an SSE register, avoiding the store forwarding penalty that would come
   16286     // with two 32-bit stores.
   16287     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
   16288 
   16289   unsigned Size = SrcVT.getSizeInBits()/8;
   16290   MachineFunction &MF = DAG.getMachineFunction();
   16291   auto PtrVT = getPointerTy(MF.getDataLayout());
   16292   int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
   16293   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   16294   SDValue Chain = DAG.getStore(
   16295       DAG.getEntryNode(), dl, ValueToStore, StackSlot,
   16296       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
   16297   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
   16298 }
   16299 
   16300 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
   16301                                      SDValue StackSlot,
   16302                                      SelectionDAG &DAG) const {
   16303   // Build the FILD
   16304   SDLoc DL(Op);
   16305   SDVTList Tys;
   16306   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
   16307   if (useSSE)
   16308     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
   16309   else
   16310     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
   16311 
   16312   unsigned ByteSize = SrcVT.getSizeInBits()/8;
   16313 
   16314   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
   16315   MachineMemOperand *MMO;
   16316   if (FI) {
   16317     int SSFI = FI->getIndex();
   16318     MMO = DAG.getMachineFunction().getMachineMemOperand(
   16319         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   16320         MachineMemOperand::MOLoad, ByteSize, ByteSize);
   16321   } else {
   16322     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
   16323     StackSlot = StackSlot.getOperand(1);
   16324   }
   16325   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   16326   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
   16327                                            X86ISD::FILD, DL,
   16328                                            Tys, Ops, SrcVT, MMO);
   16329 
   16330   if (useSSE) {
   16331     Chain = Result.getValue(1);
   16332     SDValue InFlag = Result.getValue(2);
   16333 
   16334     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
   16335     // shouldn't be necessary except that RFP cannot be live across
   16336     // multiple blocks. When stackifier is fixed, they can be uncoupled.
   16337     MachineFunction &MF = DAG.getMachineFunction();
   16338     unsigned SSFISize = Op.getValueSizeInBits()/8;
   16339     int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
   16340     auto PtrVT = getPointerTy(MF.getDataLayout());
   16341     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   16342     Tys = DAG.getVTList(MVT::Other);
   16343     SDValue Ops[] = {
   16344       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
   16345     };
   16346     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
   16347         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   16348         MachineMemOperand::MOStore, SSFISize, SSFISize);
   16349 
   16350     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
   16351                                     Ops, Op.getValueType(), MMO);
   16352     Result = DAG.getLoad(
   16353         Op.getValueType(), DL, Chain, StackSlot,
   16354         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
   16355   }
   16356 
   16357   return Result;
   16358 }
   16359 
   16360 /// 64-bit unsigned integer to double expansion.
   16361 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
   16362                                    const X86Subtarget &Subtarget) {
   16363   // This algorithm is not obvious. Here is what we're trying to output:
   16364   /*
   16365      movq       %rax,  %xmm0
   16366      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
   16367      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
   16368      #ifdef __SSE3__
   16369        haddpd   %xmm0, %xmm0
   16370      #else
   16371        pshufd   $0x4e, %xmm0, %xmm1
   16372        addpd    %xmm1, %xmm0
   16373      #endif
   16374   */
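           // Why this works: punpckldq with c0 builds the pair of doubles
           //   d0 = 0x1.0p52 + (double)lo32          (bit pattern 0x43300000'lo32)
           //   d1 = 0x1.0p84 + (double)hi32 * 2^32   (bit pattern 0x45300000'hi32)
           // Subtracting c1 = { 0x1.0p52, 0x1.0p84 } leaves { lo32, hi32 * 2^32 }
           // as doubles, and the horizontal add produces the final value.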
   16375 
   16376   SDLoc dl(Op);
   16377   LLVMContext *Context = DAG.getContext();
   16378 
   16379   // Build some magic constants.
   16380   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
   16381   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   16382   auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
   16383   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
   16384 
   16385   SmallVector<Constant*,2> CV1;
   16386   CV1.push_back(
   16387     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
   16388                                       APInt(64, 0x4330000000000000ULL))));
   16389   CV1.push_back(
   16390     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
   16391                                       APInt(64, 0x4530000000000000ULL))));
   16392   Constant *C1 = ConstantVector::get(CV1);
   16393   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
   16394 
   16395   // Load the 64-bit value into an XMM register.
   16396   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
   16397                             Op.getOperand(0));
   16398   SDValue CLod0 =
   16399       DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
   16400                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   16401                   /* Alignment = */ 16);
   16402   SDValue Unpck1 =
   16403       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
   16404 
   16405   SDValue CLod1 =
   16406       DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
   16407                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
   16408                   /* Alignment = */ 16);
   16409   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
   16410   // TODO: Are there any fast-math-flags to propagate here?
   16411   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   16412   SDValue Result;
   16413 
   16414   if (Subtarget.hasSSE3()) {
   16415     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
   16416     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   16417   } else {
   16418     SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
   16419     SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
   16420     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
   16421                          DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
   16422   }
   16423 
   16424   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
   16425                      DAG.getIntPtrConstant(0, dl));
   16426 }
   16427 
   16428 /// 32-bit unsigned integer to float expansion.
   16429 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
   16430                                    const X86Subtarget &Subtarget) {
   16431   SDLoc dl(Op);
   16432   // FP constant to bias correct the final result.
   16433   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
   16434                                    MVT::f64);
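           // The bit pattern 0x4330000000000000 is 0x1.0p52. ORing the zero-extended
           // 32-bit input into its low mantissa bits yields exactly 2^52 + x, so
           // subtracting the bias recovers (double)x with no rounding error.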
   16435 
   16436   // Load the 32-bit value into an XMM register.
   16437   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
   16438                              Op.getOperand(0));
   16439 
   16440   // Zero out the upper parts of the register.
   16441   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
   16442 
   16443   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   16444                      DAG.getBitcast(MVT::v2f64, Load),
   16445                      DAG.getIntPtrConstant(0, dl));
   16446 
   16447   // Or the load with the bias.
   16448   SDValue Or = DAG.getNode(
   16449       ISD::OR, dl, MVT::v2i64,
   16450       DAG.getBitcast(MVT::v2i64,
   16451                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
   16452       DAG.getBitcast(MVT::v2i64,
   16453                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
   16454   Or =
   16455       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   16456                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
   16457 
   16458   // Subtract the bias.
   16459   // TODO: Are there any fast-math-flags to propagate here?
   16460   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
   16461 
   16462   // Handle final rounding.
   16463   return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
   16464 }
   16465 
   16466 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
   16467                                      const X86Subtarget &Subtarget,
   16468                                      const SDLoc &DL) {
   16469   if (Op.getSimpleValueType() != MVT::v2f64)
   16470     return SDValue();
   16471 
   16472   SDValue N0 = Op.getOperand(0);
   16473   assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
   16474 
   16475   // Legalize to v4i32 type.
   16476   N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
   16477                    DAG.getUNDEF(MVT::v2i32));
   16478 
   16479   if (Subtarget.hasAVX512())
   16480     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
   16481 
   16482   // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
   16483   // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
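           // Per element this computes
           //   (double)(v >> 16) * 65536.0 + (double)(v & 0xffff)
           // where both halves fit in 16 bits, so each signed conversion is exact.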
   16484   SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
   16485   SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
   16486 
   16487   // Two to the power of half-word-size.
   16488   SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
   16489 
   16490   // Clear the upper half of each element to get LO; shift it down to get HI.
   16491   SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
   16492   SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
   16493 
   16494   SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
   16495           fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
   16496   SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
   16497 
   16498   // Add the two halves.
   16499   return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
   16500 }
   16501 
   16502 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
   16503                                      const X86Subtarget &Subtarget) {
   16504   // The algorithm is the following:
   16505   // #ifdef __SSE4_1__
   16506   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
   16507   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
   16508   //                                 (uint4) 0x53000000, 0xaa);
   16509   // #else
   16510   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
   16511   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
   16512   // #endif
   16513   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   16514   //     return (float4) lo + fhi;
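           //
           // Why this is exact: reinterpreted as float, lo is 2^23 + (v & 0xffff) and
           // hi is 2^39 + (v >> 16) * 2^16 (0x4b000000 and 0x53000000 are the bit
           // patterns of 2^23 and 2^39). Subtracting (2^39 + 2^23) from hi and adding
           // lo reconstructs v with a single rounding step at the final add.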
   16515 
   16516   // We shouldn't use it when unsafe-fp-math is enabled though: we might later
   16517   // reassociate the two FADDs, and if we do that, the algorithm fails
   16518   // spectacularly (PR24512).
   16519   // FIXME: If we ever have some kind of Machine FMF, this should be marked
   16520   // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
   16521   // there are also MachineCombiner reassociations happening on Machine IR.
   16522   if (DAG.getTarget().Options.UnsafeFPMath)
   16523     return SDValue();
   16524 
   16525   SDLoc DL(Op);
   16526   SDValue V = Op->getOperand(0);
   16527   MVT VecIntVT = V.getSimpleValueType();
   16528   bool Is128 = VecIntVT == MVT::v4i32;
   16529   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
   16530   // If we convert to something else than the supported type, e.g., to v4f64,
   16531   // abort early.
   16532   if (VecFloatVT != Op->getSimpleValueType(0))
   16533     return SDValue();
   16534 
   16535   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
   16536          "Unsupported custom type");
   16537 
   16538   // In the #ifdef/#else code, we have in common:
   16539   // - The vector of constants:
   16540   // -- 0x4b000000
   16541   // -- 0x53000000
   16542   // - A shift:
   16543   // -- v >> 16
   16544 
   16545   // Create the splat vector for 0x4b000000.
   16546   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
   16547   // Create the splat vector for 0x53000000.
   16548   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
   16549 
   16550   // Create the right shift.
   16551   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
   16552   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
   16553 
   16554   SDValue Low, High;
   16555   if (Subtarget.hasSSE41()) {
   16556     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
   16557     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
   16558     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
   16559     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
   16560     // Low will be bitcasted right away, so do not bother bitcasting back to its
   16561     // original type.
   16562     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
   16563                       VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
   16564     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
   16565     //                                 (uint4) 0x53000000, 0xaa);
   16566     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
   16567     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
   16568     // High will be bitcasted right away, so do not bother bitcasting back to
   16569     // its original type.
   16570     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
   16571                        VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
   16572   } else {
   16573     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
   16574     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
   16575     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
   16576     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
   16577 
   16578     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
   16579     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
   16580   }
   16581 
   16582   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
   16583   SDValue VecCstFAdd = DAG.getConstantFP(
   16584       APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
   16585 
   16586   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   16587   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
   16588   // TODO: Are there any fast-math-flags to propagate here?
   16589   SDValue FHigh =
   16590       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
   16591   //     return (float4) lo + fhi;
   16592   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
   16593   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
   16594 }
   16595 
   16596 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
   16597                                    const X86Subtarget &Subtarget) {
   16598   SDValue N0 = Op.getOperand(0);
   16599   MVT SrcVT = N0.getSimpleValueType();
   16600   SDLoc dl(Op);
   16601 
   16602   switch (SrcVT.SimpleTy) {
   16603   default:
   16604     llvm_unreachable("Custom UINT_TO_FP is not supported!");
   16605   case MVT::v2i32:
   16606     return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
   16607   case MVT::v4i32:
   16608   case MVT::v8i32:
   16609     assert(!Subtarget.hasAVX512());
   16610     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
   16611   }
   16612 }
   16613 
   16614 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   16615                                            SelectionDAG &DAG) const {
   16616   SDValue N0 = Op.getOperand(0);
   16617   SDLoc dl(Op);
   16618   auto PtrVT = getPointerTy(DAG.getDataLayout());
   16619 
   16620   if (Op.getSimpleValueType().isVector())
   16621     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
   16622 
   16623   MVT SrcVT = N0.getSimpleValueType();
   16624   MVT DstVT = Op.getSimpleValueType();
   16625 
   16626   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
   16627       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
   16628     // Conversions from unsigned i32 to f32/f64 are legal,
   16629     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
   16630     return Op;
   16631   }
   16632 
   16633   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
   16634     return V;
   16635 
   16636   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
   16637     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
   16638   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
   16639     return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
   16640   if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
   16641     return SDValue();
   16642 
   16643   // Make a 64-bit buffer, and use it to build an FILD.
   16644   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
   16645   if (SrcVT == MVT::i32) {
   16646     SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
   16647     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
   16648                                   StackSlot, MachinePointerInfo());
   16649     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
   16650                                   OffsetSlot, MachinePointerInfo());
   16651     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
   16652     return Fild;
   16653   }
   16654 
   16655   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
   16656   SDValue ValueToStore = Op.getOperand(0);
   16657   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
   16658     // Bitcasting to f64 here allows us to do a single 64-bit store from
   16659     // an SSE register, avoiding the store forwarding penalty that would come
   16660     // with two 32-bit stores.
   16661     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
   16662   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
   16663                                MachinePointerInfo());
   16664   // For i64 source, we need to add the appropriate power of 2 if the input
   16665   // was negative.  This is the same as the optimization in
   16666   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
   16667   // we must be careful to do the computation in x87 extended precision, not
   16668   // in SSE. (The generic code can't know it's OK to do this, or how to.)
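           // The fudge constant built below, 0x5F800000, is 2^64 as an IEEE single.
           // If the input's sign bit was set, FILD interpreted the stored bits as
           // x - 2^64, so adding 2^64 in f80 (where the sum is exact) and rounding
           // once to the destination type yields the correct unsigned result.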
   16669   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
   16670   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
   16671       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
   16672       MachineMemOperand::MOLoad, 8, 8);
   16673 
   16674   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
   16675   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
   16676   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
   16677                                          MVT::i64, MMO);
   16678 
   16679   APInt FF(32, 0x5F800000ULL);
   16680 
   16681   // Check whether the sign bit is set.
   16682   SDValue SignSet = DAG.getSetCC(
   16683       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
   16684       Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
   16685 
   16686   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
   16687   SDValue FudgePtr = DAG.getConstantPool(
   16688       ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
   16689 
   16690   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
   16691   SDValue Zero = DAG.getIntPtrConstant(0, dl);
   16692   SDValue Four = DAG.getIntPtrConstant(4, dl);
   16693   SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
   16694   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
   16695 
   16696   // Load the value out, extending it from f32 to f80.
   16697   // FIXME: Avoid the extend by constructing the right constant pool?
   16698   SDValue Fudge = DAG.getExtLoad(
   16699       ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
   16700       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
   16701       /* Alignment = */ 4);
   16702   // Extend everything to 80 bits to force it to be done on x87.
   16703   // TODO: Are there any fast-math-flags to propagate here?
   16704   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
   16705   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
   16706                      DAG.getIntPtrConstant(0, dl));
   16707 }
   16708 
   16709 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
   16710 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
   16711 // just return an <SDValue(), SDValue()> pair.
   16712 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
   16713 // to i16, i32 or i64, and we lower it to a legal sequence.
   16714 // If lowered to the final integer result we return a <result, SDValue()> pair.
   16715 // Otherwise we lower it to a sequence ending with a FIST and return a
   16716 // <FIST, StackSlot> pair; the caller is responsible for loading
   16717 // the final integer result from StackSlot.
   16718 std::pair<SDValue,SDValue>
   16719 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   16720                                    bool IsSigned, bool IsReplace) const {
   16721   SDLoc DL(Op);
   16722 
   16723   EVT DstTy = Op.getValueType();
   16724   EVT TheVT = Op.getOperand(0).getValueType();
   16725   auto PtrVT = getPointerTy(DAG.getDataLayout());
   16726 
   16727   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
   16728     // f16 must be promoted before using the lowering in this routine.
   16729     // fp128 does not use this lowering.
   16730     return std::make_pair(SDValue(), SDValue());
   16731   }
   16732 
   16733   // If using FIST to compute an unsigned i64, we'll need some fixup
   16734   // to handle values above the maximum signed i64.  A FIST is always
   16735   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
   16736   bool UnsignedFixup = !IsSigned &&
   16737                        DstTy == MVT::i64 &&
   16738                        (!Subtarget.is64Bit() ||
   16739                         !isScalarFPTypeInSSEReg(TheVT));
   16740 
   16741   if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
   16742     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
   16743     // The low 32 bits of the FIST result will have the correct uint32 result.
   16744     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
   16745     DstTy = MVT::i64;
   16746   }
   16747 
   16748   assert(DstTy.getSimpleVT() <= MVT::i64 &&
   16749          DstTy.getSimpleVT() >= MVT::i16 &&
   16750          "Unknown FP_TO_INT to lower!");
   16751 
   16752   // These are really Legal.
   16753   if (DstTy == MVT::i32 &&
   16754       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   16755     return std::make_pair(SDValue(), SDValue());
   16756   if (Subtarget.is64Bit() &&
   16757       DstTy == MVT::i64 &&
   16758       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
   16759     return std::make_pair(SDValue(), SDValue());
   16760 
   16761   // We lower FP->int64 into FISTP64 followed by a load from a temporary
   16762   // stack slot.
   16763   MachineFunction &MF = DAG.getMachineFunction();
   16764   unsigned MemSize = DstTy.getSizeInBits()/8;
   16765   int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
   16766   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   16767 
   16768   unsigned Opc;
   16769   switch (DstTy.getSimpleVT().SimpleTy) {
   16770   default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
   16771   case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
   16772   case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
   16773   case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
   16774   }
   16775 
   16776   SDValue Chain = DAG.getEntryNode();
   16777   SDValue Value = Op.getOperand(0);
   16778   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
   16779 
   16780   if (UnsignedFixup) {
   16781     //
   16782     // Conversion to unsigned i64 is implemented with a select,
   16783     // depending on whether the source value fits in the range
   16784     // of a signed i64.  Let Thresh be the FP equivalent of
   16785     // 0x8000000000000000ULL.
   16786     //
   16787     //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
   16788     //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
   16789     //  Fist-to-mem64 FistSrc
   16790     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
   16791     //  to XOR'ing the high 32 bits with Adjust.
   16792     //
   16793     // Being a power of 2, Thresh is exactly representable in all FP formats.
   16794     // For X87 we'd like to use the smallest FP type for this constant, but
   16795     // for DAG type consistency we have to match the FP operand type.
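             // For example, with Value == 0x8000000000001000 (2^63 + 4096), Value is
             // not less than Thresh, so Adjust is 0x80000000 and FistSrc is 4096;
             // the FIST stores 4096, and XORing the high half with Adjust restores
             // the full unsigned value.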
   16796 
   16797     APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
   16798     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
   16799     bool LosesInfo = false;
   16800     if (TheVT == MVT::f64)
   16801       // The rounding mode is irrelevant as the conversion should be exact.
   16802       Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
   16803                               &LosesInfo);
   16804     else if (TheVT == MVT::f80)
   16805       Status = Thresh.convert(APFloat::x87DoubleExtended(),
   16806                               APFloat::rmNearestTiesToEven, &LosesInfo);
   16807 
   16808     assert(Status == APFloat::opOK && !LosesInfo &&
   16809            "FP conversion should have been exact");
   16810 
   16811     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
   16812 
   16813     SDValue Cmp = DAG.getSetCC(DL,
   16814                                getSetCCResultType(DAG.getDataLayout(),
   16815                                                   *DAG.getContext(), TheVT),
   16816                                Value, ThreshVal, ISD::SETLT);
   16817     Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
   16818                            DAG.getConstant(0, DL, MVT::i32),
   16819                            DAG.getConstant(0x80000000, DL, MVT::i32));
   16820     SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
   16821     Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
   16822                                               *DAG.getContext(), TheVT),
   16823                        Value, ThreshVal, ISD::SETLT);
   16824     Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
   16825   }
   16826 
   16827   // FIXME: This causes a redundant load/store if the SSE-class value is
   16828   // already in memory, such as if it is on the call stack.
   16829   if (isScalarFPTypeInSSEReg(TheVT)) {
   16830     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
   16831     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
   16832                          MachinePointerInfo::getFixedStack(MF, SSFI));
   16833     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
   16834     SDValue Ops[] = {
   16835       Chain, StackSlot, DAG.getValueType(TheVT)
   16836     };
   16837 
   16838     MachineMemOperand *MMO =
   16839         MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
   16840                                 MachineMemOperand::MOLoad, MemSize, MemSize);
   16841     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
   16842     Chain = Value.getValue(1);
   16843     SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
   16844     StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   16845   }
   16846 
   16847   MachineMemOperand *MMO =
   16848       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
   16849                               MachineMemOperand::MOStore, MemSize, MemSize);
   16850 
   16851   if (UnsignedFixup) {
   16852 
   16853     // Insert the FIST, load its result as two i32's,
   16854     // and XOR the high i32 with Adjust.
   16855 
   16856     SDValue FistOps[] = { Chain, Value, StackSlot };
   16857     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   16858                                            FistOps, DstTy, MMO);
   16859 
   16860     SDValue Low32 =
   16861         DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
   16862     SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
   16863 
   16864     SDValue High32 =
   16865         DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
   16866     High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
   16867 
   16868     if (Subtarget.is64Bit()) {
   16869       // Join High32 and Low32 into a 64-bit result.
   16870       // (High32 << 32) | Low32
   16871       Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
   16872       High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
   16873       High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
   16874                            DAG.getConstant(32, DL, MVT::i8));
   16875       SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
   16876       return std::make_pair(Result, SDValue());
   16877     }
   16878 
   16879     SDValue ResultOps[] = { Low32, High32 };
   16880 
   16881     SDValue pair = IsReplace
   16882       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
   16883       : DAG.getMergeValues(ResultOps, DL);
   16884     return std::make_pair(pair, SDValue());
   16885   } else {
   16886     // Build the FP_TO_INT*_IN_MEM
   16887     SDValue Ops[] = { Chain, Value, StackSlot };
   16888     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
   16889                                            Ops, DstTy, MMO);
   16890     return std::make_pair(FIST, StackSlot);
   16891   }
   16892 }
   16893 
   16894 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   16895                               const X86Subtarget &Subtarget) {
   16896   MVT VT = Op->getSimpleValueType(0);
   16897   SDValue In = Op->getOperand(0);
   16898   MVT InVT = In.getSimpleValueType();
   16899   SDLoc dl(Op);
   16900 
   16901   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
   16902   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
   16903          "Expected same number of elements");
   16904   assert((VT.getVectorElementType() == MVT::i16 ||
   16905           VT.getVectorElementType() == MVT::i32 ||
   16906           VT.getVectorElementType() == MVT::i64) &&
   16907          "Unexpected element type");
   16908   assert((InVT.getVectorElementType() == MVT::i8 ||
   16909           InVT.getVectorElementType() == MVT::i16 ||
   16910           InVT.getVectorElementType() == MVT::i32) &&
   16911          "Unexpected element type");
   16912 
   16913   if (Subtarget.hasInt256())
   16914     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
   16915 
   16916   // Optimize vectors in AVX mode:
   16917   //
   16918   //   v8i16 -> v8i32
   16919   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
   16920   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
   16921   //   Concat upper and lower parts.
   16922   //
   16923   //   v4i32 -> v4i64
   16924   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
   16925   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
   16926   //   Concat upper and lower parts.
   16927   //
   16928 
   16929   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
   16930   SDValue Undef = DAG.getUNDEF(InVT);
   16931   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
   16932   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   16933   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   16934 
   16935   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
   16936                              VT.getVectorNumElements()/2);
   16937 
   16938   OpLo = DAG.getBitcast(HVT, OpLo);
   16939   OpHi = DAG.getBitcast(HVT, OpHi);
   16940 
   16941   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   16942 }
   16943 
   16944 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
   16945 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
   16946                                    const SDLoc &dl, SelectionDAG &DAG) {
   16947   assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
   16948   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
   16949                            DAG.getIntPtrConstant(0, dl));
   16950   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
   16951                            DAG.getIntPtrConstant(8, dl));
   16952   Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
   16953   Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
   16954   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
   16955   return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
   16956 }
   16957 
   16958 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
   16959                                      const X86Subtarget &Subtarget,
   16960                                      SelectionDAG &DAG) {
   16961   MVT VT = Op->getSimpleValueType(0);
   16962   SDValue In = Op->getOperand(0);
   16963   MVT InVT = In.getSimpleValueType();
   16964   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
   16965   SDLoc DL(Op);
   16966   unsigned NumElts = VT.getVectorNumElements();
   16967 
   16968   // For all vectors except vXi8 we can just emit a sign_extend and a shift.
   16969   // This avoids a constant pool load.
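           // For example, for v8i1 -> v8i32 the sign_extend produces 0 or 0xffffffff
           // per element, and a logical shift right by 31 turns that into 0 or 1.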
   16970   if (VT.getVectorElementType() != MVT::i8) {
   16971     SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
   16972     return DAG.getNode(ISD::SRL, DL, VT, Extend,
   16973                        DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
   16974   }
   16975 
   16976   // Extend VT if BWI is not supported.
   16977   MVT ExtVT = VT;
   16978   if (!Subtarget.hasBWI()) {
   16979     // If v16i32 is to be avoided, we'll need to split and concatenate.
   16980     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
   16981       return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
   16982 
   16983     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
   16984   }
   16985 
   16986   // Widen to 512-bits if VLX is not supported.
   16987   MVT WideVT = ExtVT;
   16988   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
   16989     NumElts *= 512 / ExtVT.getSizeInBits();
   16990     InVT = MVT::getVectorVT(MVT::i1, NumElts);
   16991     In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
   16992                      In, DAG.getIntPtrConstant(0, DL));
   16993     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
   16994                               NumElts);
   16995   }
   16996 
   16997   SDValue One = DAG.getConstant(1, DL, WideVT);
   16998   SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
   16999 
   17000   SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
   17001 
   17002   // Truncate if we had to extend above.
   17003   if (VT != ExtVT) {
   17004     WideVT = MVT::getVectorVT(MVT::i8, NumElts);
   17005     SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
   17006   }
   17007 
   17008   // Extract back to 128/256-bit if we widened.
   17009   if (WideVT != VT)
   17010     SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
   17011                               DAG.getIntPtrConstant(0, DL));
   17012 
   17013   return SelectedVal;
   17014 }
   17015 
   17016 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   17017                                 SelectionDAG &DAG) {
   17018   SDValue In = Op.getOperand(0);
   17019   MVT SVT = In.getSimpleValueType();
   17020 
   17021   if (SVT.getVectorElementType() == MVT::i1)
   17022     return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
   17023 
   17024   assert(Subtarget.hasAVX() && "Expected AVX support");
   17025   return LowerAVXExtend(Op, DAG, Subtarget);
   17026 }
   17027 
   17028 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
   17029 /// It makes use of the fact that vectors with enough leading sign/zero bits
   17030 /// prevent the PACKSS/PACKUS from saturating the results.
   17031 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
   17032 /// within each 128-bit lane.
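         /// For example, a v8i32 source whose elements are known to fit in 16 bits
         /// can be truncated to v8i16 with a single PACKSSDW/PACKUSDW of its two
         /// 128-bit halves.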
   17033 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
   17034                                       const SDLoc &DL, SelectionDAG &DAG,
   17035                                       const X86Subtarget &Subtarget) {
   17036   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
   17037          "Unexpected PACK opcode");
   17038 
   17039   // Requires SSE2. AVX512 has fast vector truncates, so don't use PACK there.
   17040   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
   17041     return SDValue();
   17042 
   17043   EVT SrcVT = In.getValueType();
   17044 
   17045   // No truncation required; we might get here due to recursive calls.
   17046   if (SrcVT == DstVT)
   17047     return In;
   17048 
   17049   // We only support vector truncation to 64 bits or greater from a
   17050   // 128-bit or greater source.
   17051   unsigned DstSizeInBits = DstVT.getSizeInBits();
   17052   unsigned SrcSizeInBits = SrcVT.getSizeInBits();
   17053   if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
   17054     return SDValue();
   17055 
   17056   unsigned NumElems = SrcVT.getVectorNumElements();
   17057   if (!isPowerOf2_32(NumElems))
   17058     return SDValue();
   17059 
   17060   LLVMContext &Ctx = *DAG.getContext();
   17061   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
   17062   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
   17063 
   17064   EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
   17065 
   17066   // Pack to the largest type possible:
   17067   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
   17068   EVT InVT = MVT::i16, OutVT = MVT::i8;
   17069   if (SrcVT.getScalarSizeInBits() > 16 &&
   17070       (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
   17071     InVT = MVT::i32;
   17072     OutVT = MVT::i16;
   17073   }
   17074 
   17075   // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
   17076   if (SrcVT.is128BitVector()) {
   17077     InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
   17078     OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
   17079     In = DAG.getBitcast(InVT, In);
   17080     SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
   17081     Res = extractSubVector(Res, 0, DAG, DL, 64);
   17082     return DAG.getBitcast(DstVT, Res);
   17083   }
   17084 
   17085   // Extract lower/upper subvectors.
   17086   unsigned NumSubElts = NumElems / 2;
   17087   SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
   17088   SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
   17089 
   17090   unsigned SubSizeInBits = SrcSizeInBits / 2;
   17091   InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
   17092   OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
   17093 
   17094   // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
   17095   if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
   17096     Lo = DAG.getBitcast(InVT, Lo);
   17097     Hi = DAG.getBitcast(InVT, Hi);
   17098     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
   17099     return DAG.getBitcast(DstVT, Res);
   17100   }
   17101 
   17102   // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
   17103   // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
   17104   if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
   17105     Lo = DAG.getBitcast(InVT, Lo);
   17106     Hi = DAG.getBitcast(InVT, Hi);
   17107     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
   17108 
   17109     // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
   17110     // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
   17111     Res = DAG.getBitcast(MVT::v4i64, Res);
   17112     Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
   17113 
   17114     if (DstVT.is256BitVector())
   17115       return DAG.getBitcast(DstVT, Res);
   17116 
   17117     // If this is a 512-bit -> 128-bit truncation, pack through another stage.
   17118     EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
   17119     Res = DAG.getBitcast(PackedVT, Res);
   17120     return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
   17121   }
   17122 
   17123   // Recursively pack lower/upper subvectors, concat result and pack again.
   17124   assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
   17125   EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
   17126   Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
   17127   Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
   17128 
   17129   PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
   17130   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
   17131   return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
   17132 }
   17133 
   17134 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
   17135                                   const X86Subtarget &Subtarget) {
   17136 
   17137   SDLoc DL(Op);
   17138   MVT VT = Op.getSimpleValueType();
   17139   SDValue In = Op.getOperand(0);
   17140   MVT InVT = In.getSimpleValueType();
   17141 
   17142   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
   17143 
   17144   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
   17145   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
   17146   if (InVT.getScalarSizeInBits() <= 16) {
   17147     if (Subtarget.hasBWI()) {
   17148       // legal, will go to VPMOVB2M, VPMOVW2M
   17149       if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
   17150         // We need to shift to get the lsb into the sign position.
   17151         // Shifting packed bytes is not supported natively, so bitcast to words.
   17152         MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
   17153         In = DAG.getNode(ISD::SHL, DL, ExtVT,
   17154                          DAG.getBitcast(ExtVT, In),
   17155                          DAG.getConstant(ShiftInx, DL, ExtVT));
   17156         In = DAG.getBitcast(InVT, In);
   17157       }
   17158       return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
   17159                           In, ISD::SETGT);
   17160     }
   17161     // Use TESTD/Q after extending the vector to packed dwords/qwords.
   17162     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
   17163            "Unexpected vector type.");
   17164     unsigned NumElts = InVT.getVectorNumElements();
   17165     assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
   17166     // We need to change to a wider element type that we have support for.
   17167     // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
   17168     // For 16 element vectors we extend to v16i32 unless we are explicitly
   17169     // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
   17170     // we need to split into two 8 element vectors which we can extend to v8i32,
   17171     // truncate and concat the results. There's an additional complication if
   17172     // the original type is v16i8. In that case we can't split the v16i8 so
   17173     // first we pre-extend it to v16i16 which we can split to v8i16, then extend
   17174     // to v8i32, truncate that to v8i1 and concat the two halves.
   17175     if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
   17176       if (InVT == MVT::v16i8) {
   17177         // First we need to sign extend up to 256-bits so we can split that.
   17178         InVT = MVT::v16i16;
   17179         In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
   17180       }
   17181       SDValue Lo = extract128BitVector(In, 0, DAG, DL);
   17182       SDValue Hi = extract128BitVector(In, 8, DAG, DL);
   17183       // Now that we've split, just emit two truncates and a concat. The two
   17184       // truncates will trigger legalization to come back to this function.
   17185       Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
   17186       Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
   17187       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
   17188     }
   17189     // We either have 8 elements or we're allowed to use 512-bit vectors.
   17190     // If we have VLX, we want to use the narrowest vector that can get the
   17191     // job done so we use vXi32.
   17192     MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
   17193     MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
   17194     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
   17195     InVT = ExtVT;
   17196     ShiftInx = InVT.getScalarSizeInBits() - 1;
   17197   }
   17198 
   17199   if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
    17200     // We need to shift to get the LSB into the sign position.
   17201     In = DAG.getNode(ISD::SHL, DL, InVT, In,
   17202                      DAG.getConstant(ShiftInx, DL, InVT));
   17203   }
    17204   // If we have DQI, emit a pattern that will be selected as vpmovq2m/vpmovd2m.
   17205   if (Subtarget.hasDQI())
   17206     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
   17207                        In, ISD::SETGT);
   17208   return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
   17209                       ISD::SETNE);
   17210 }
   17211 
   17212 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   17213   SDLoc DL(Op);
   17214   MVT VT = Op.getSimpleValueType();
   17215   SDValue In = Op.getOperand(0);
   17216   MVT InVT = In.getSimpleValueType();
   17217   unsigned InNumEltBits = InVT.getScalarSizeInBits();
   17218 
   17219   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
   17220          "Invalid TRUNCATE operation");
   17221 
   17222   if (VT.getVectorElementType() == MVT::i1)
   17223     return LowerTruncateVecI1(Op, DAG, Subtarget);
   17224 
   17225   // vpmovqb/w/d, vpmovdb/w, vpmovwb
   17226   if (Subtarget.hasAVX512()) {
    17227     // Word-to-byte truncation is only legal with BWI.
   17228     if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
    17229       // Make sure we're allowed to promote to 512 bits.
   17230       if (Subtarget.canExtendTo512DQ())
   17231         return DAG.getNode(ISD::TRUNCATE, DL, VT,
   17232                            DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
   17233     } else {
   17234       return Op;
   17235     }
   17236   }
   17237 
   17238   unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
   17239   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
   17240 
   17241   // Truncate with PACKUS if we are truncating a vector with leading zero bits
   17242   // that extend all the way to the packed/truncated value.
   17243   // Pre-SSE41 we can only use PACKUSWB.
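            // For example (roughly): with SSE4.1, a v8i32 -> v8i16 truncate whose
            // elements have their upper 16 bits known zero can be emitted as a
            // PACKUSDW of the two 128-bit halves of the input.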
   17244   KnownBits Known;
   17245   DAG.computeKnownBits(In, Known);
   17246   if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
   17247     if (SDValue V =
   17248             truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
   17249       return V;
   17250 
   17251   // Truncate with PACKSS if we are truncating a vector with sign-bits that
   17252   // extend all the way to the packed/truncated value.
   17253   if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
   17254     if (SDValue V =
   17255             truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
   17256       return V;
   17257 
   17258   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
   17259     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
   17260     if (Subtarget.hasInt256()) {
   17261       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
   17262       In = DAG.getBitcast(MVT::v8i32, In);
   17263       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
   17264       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
   17265                          DAG.getIntPtrConstant(0, DL));
   17266     }
   17267 
   17268     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   17269                                DAG.getIntPtrConstant(0, DL));
   17270     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   17271                                DAG.getIntPtrConstant(2, DL));
   17272     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
   17273     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
   17274     static const int ShufMask[] = {0, 2, 4, 6};
   17275     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
   17276   }
   17277 
   17278   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
   17279     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
   17280     if (Subtarget.hasInt256()) {
   17281       In = DAG.getBitcast(MVT::v32i8, In);
   17282 
   17283       // The PSHUFB mask:
   17284       static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
   17285                                       -1, -1, -1, -1, -1, -1, -1, -1,
   17286                                       16, 17, 20, 21, 24, 25, 28, 29,
   17287                                       -1, -1, -1, -1, -1, -1, -1, -1 };
   17288       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
   17289       In = DAG.getBitcast(MVT::v4i64, In);
   17290 
   17291       static const int ShufMask2[] = {0,  2,  -1,  -1};
   17292       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, In, ShufMask2);
   17293       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
   17294                        DAG.getIntPtrConstant(0, DL));
   17295       return DAG.getBitcast(VT, In);
   17296     }
   17297 
   17298     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   17299                                DAG.getIntPtrConstant(0, DL));
   17300 
   17301     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
   17302                                DAG.getIntPtrConstant(4, DL));
   17303 
   17304     OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
   17305     OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
   17306 
   17307     // The PSHUFB mask:
   17308     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
   17309                                    -1, -1, -1, -1, -1, -1, -1, -1};
   17310 
   17311     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
   17312     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
   17313 
   17314     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
   17315     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
   17316 
   17317     // The MOVLHPS Mask:
   17318     static const int ShufMask2[] = {0, 1, 4, 5};
   17319     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
   17320     return DAG.getBitcast(MVT::v8i16, res);
   17321   }
   17322 
   17323   // Handle truncation of V256 to V128 using shuffles.
   17324   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
   17325 
   17326   assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
   17327 
   17328   unsigned NumElems = VT.getVectorNumElements();
   17329   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
   17330 
   17331   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
   17332   // Prepare truncation shuffle mask
   17333   for (unsigned i = 0; i != NumElems; ++i)
   17334     MaskVec[i] = i * 2;
   17335   In = DAG.getBitcast(NVT, In);
   17336   SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
   17337   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
   17338                      DAG.getIntPtrConstant(0, DL));
   17339 }
   17340 
   17341 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
   17342   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
   17343   MVT VT = Op.getSimpleValueType();
   17344 
   17345   if (VT.isVector()) {
   17346     SDValue Src = Op.getOperand(0);
   17347     SDLoc dl(Op);
   17348 
   17349     if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
   17350       MVT ResVT = MVT::v4i32;
   17351       MVT TruncVT = MVT::v4i1;
   17352       unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
   17353       if (!IsSigned && !Subtarget.hasVLX()) {
   17354         // Widen to 512-bits.
   17355         ResVT = MVT::v8i32;
   17356         TruncVT = MVT::v8i1;
   17357         Opc = ISD::FP_TO_UINT;
   17358         Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
   17359                           DAG.getUNDEF(MVT::v8f64),
   17360                           Src, DAG.getIntPtrConstant(0, dl));
   17361       }
   17362       SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
   17363       Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
   17364       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
   17365                          DAG.getIntPtrConstant(0, dl));
   17366     }
   17367 
   17368     assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
   17369     if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
   17370       return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
   17371                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
   17372                                      DAG.getUNDEF(MVT::v2f32)));
   17373     }
   17374 
   17375     return SDValue();
   17376   }
   17377 
   17378   assert(!VT.isVector());
   17379 
   17380   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
   17381     IsSigned, /*IsReplace=*/ false);
   17382   SDValue FIST = Vals.first, StackSlot = Vals.second;
   17383   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
   17384   if (!FIST.getNode())
   17385     return Op;
   17386 
   17387   if (StackSlot.getNode())
   17388     // Load the result.
   17389     return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
   17390 
   17391   // The node is the result.
   17392   return FIST;
   17393 }
   17394 
   17395 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
   17396   SDLoc DL(Op);
   17397   MVT VT = Op.getSimpleValueType();
   17398   SDValue In = Op.getOperand(0);
   17399   MVT SVT = In.getSimpleValueType();
   17400 
   17401   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
   17402 
   17403   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
   17404                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
   17405                                  In, DAG.getUNDEF(SVT)));
   17406 }
   17407 
   17408 /// The only differences between FABS and FNEG are the mask and the logic op.
   17409 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
   17410 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
   17411   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
   17412          "Wrong opcode for lowering FABS or FNEG.");
   17413 
   17414   bool IsFABS = (Op.getOpcode() == ISD::FABS);
   17415 
   17416   // If this is a FABS and it has an FNEG user, bail out to fold the combination
   17417   // into an FNABS. We'll lower the FABS after that if it is still in use.
   17418   if (IsFABS)
   17419     for (SDNode *User : Op->uses())
   17420       if (User->getOpcode() == ISD::FNEG)
   17421         return Op;
   17422 
   17423   SDLoc dl(Op);
   17424   MVT VT = Op.getSimpleValueType();
   17425 
   17426   bool IsF128 = (VT == MVT::f128);
   17427 
   17428   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   17429   // decide if we should generate a 16-byte constant mask when we only need 4 or
   17430   // 8 bytes for the scalar case.
   17431 
   17432   MVT LogicVT;
   17433   MVT EltVT;
   17434 
   17435   if (VT.isVector()) {
   17436     LogicVT = VT;
   17437     EltVT = VT.getVectorElementType();
   17438   } else if (IsF128) {
   17439     // SSE instructions are used for optimized f128 logical operations.
   17440     LogicVT = MVT::f128;
   17441     EltVT = VT;
   17442   } else {
   17443     // There are no scalar bitwise logical SSE/AVX instructions, so we
   17444     // generate a 16-byte vector constant and logic op even for the scalar case.
   17445     // Using a 16-byte mask allows folding the load of the mask with
    17446     // the logic op, which can save ~4 bytes of code size.
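              // For example (a sketch, not the exact output): a scalar f32 fabs
              // becomes roughly
              //   andps %xmm0, <0x7FFFFFFF splatted into a v4f32 constant-pool load>
              // with the scalar treated as the low lane of the vector.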
   17447     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
   17448     EltVT = VT;
   17449   }
   17450 
   17451   unsigned EltBits = EltVT.getSizeInBits();
   17452   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
   17453   APInt MaskElt =
   17454     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
   17455   const fltSemantics &Sem =
   17456       EltVT == MVT::f64 ? APFloat::IEEEdouble() :
   17457           (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
   17458   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
   17459 
   17460   SDValue Op0 = Op.getOperand(0);
   17461   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
   17462   unsigned LogicOp =
   17463     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
   17464   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
   17465 
   17466   if (VT.isVector() || IsF128)
   17467     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
   17468 
   17469   // For the scalar case extend to a 128-bit vector, perform the logic op,
   17470   // and extract the scalar result back out.
   17471   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
   17472   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
   17473   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
   17474                      DAG.getIntPtrConstant(0, dl));
   17475 }
   17476 
   17477 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   17478   SDValue Mag = Op.getOperand(0);
   17479   SDValue Sign = Op.getOperand(1);
   17480   SDLoc dl(Op);
   17481 
   17482   // If the sign operand is smaller, extend it first.
   17483   MVT VT = Op.getSimpleValueType();
   17484   if (Sign.getSimpleValueType().bitsLT(VT))
   17485     Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
   17486 
   17487   // And if it is bigger, shrink it first.
   17488   if (Sign.getSimpleValueType().bitsGT(VT))
   17489     Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
   17490 
   17491   // At this point the operands and the result should have the same
   17492   // type, and that won't be f80 since that is not custom lowered.
   17493   bool IsF128 = (VT == MVT::f128);
   17494   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
   17495           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
   17496           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
   17497          "Unexpected type in LowerFCOPYSIGN");
   17498 
   17499   MVT EltVT = VT.getScalarType();
   17500   const fltSemantics &Sem =
   17501       EltVT == MVT::f64 ? APFloat::IEEEdouble()
   17502                         : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
   17503 
   17504   // Perform all scalar logic operations as 16-byte vectors because there are no
   17505   // scalar FP logic instructions in SSE.
   17506   // TODO: This isn't necessary. If we used scalar types, we might avoid some
   17507   // unnecessary splats, but we might miss load folding opportunities. Should
   17508   // this decision be based on OptimizeForSize?
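            // Rough shape of the scalar f64 case (a sketch, not the exact output):
            //   sign = andpd %sign, <0x8000000000000000 splat>
            //   mag  = andpd %mag,  <0x7FFFFFFFFFFFFFFF splat>
            //   res  = orpd  mag, sign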
   17509   bool IsFakeVector = !VT.isVector() && !IsF128;
   17510   MVT LogicVT = VT;
   17511   if (IsFakeVector)
   17512     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
   17513 
   17514   // The mask constants are automatically splatted for vector types.
   17515   unsigned EltSizeInBits = VT.getScalarSizeInBits();
   17516   SDValue SignMask = DAG.getConstantFP(
   17517       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
   17518   SDValue MagMask = DAG.getConstantFP(
   17519       APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
   17520 
   17521   // First, clear all bits but the sign bit from the second operand (sign).
   17522   if (IsFakeVector)
   17523     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
   17524   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
   17525 
   17526   // Next, clear the sign bit from the first operand (magnitude).
   17527   // TODO: If we had general constant folding for FP logic ops, this check
   17528   // wouldn't be necessary.
   17529   SDValue MagBits;
   17530   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
   17531     APFloat APF = Op0CN->getValueAPF();
   17532     APF.clearSign();
   17533     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
   17534   } else {
   17535     // If the magnitude operand wasn't a constant, we need to AND out the sign.
   17536     if (IsFakeVector)
   17537       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
   17538     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
   17539   }
   17540 
   17541   // OR the magnitude value with the sign bit.
   17542   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
   17543   return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
   17544                                           DAG.getIntPtrConstant(0, dl));
   17545 }
   17546 
   17547 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
   17548   SDValue N0 = Op.getOperand(0);
   17549   SDLoc dl(Op);
   17550   MVT VT = Op.getSimpleValueType();
   17551 
   17552   MVT OpVT = N0.getSimpleValueType();
   17553   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
   17554          "Unexpected type for FGETSIGN");
   17555 
   17556   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
   17557   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
   17558   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
   17559   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
   17560   Res = DAG.getZExtOrTrunc(Res, dl, VT);
   17561   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
   17562   return Res;
   17563 }
   17564 
   17565 /// Helper for creating a X86ISD::SETCC node.
   17566 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
   17567                         SelectionDAG &DAG) {
   17568   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
   17569                      DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
   17570 }
   17571 
   17572 // Check whether an OR'd tree is PTEST-able.
   17573 static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
   17574                                       const X86Subtarget &Subtarget,
   17575                                       SelectionDAG &DAG) {
   17576   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
   17577 
   17578   if (!Subtarget.hasSSE41())
   17579     return SDValue();
   17580 
   17581   if (!Op->hasOneUse())
   17582     return SDValue();
   17583 
   17584   SDNode *N = Op.getNode();
   17585   SDLoc DL(N);
   17586 
   17587   SmallVector<SDValue, 8> Opnds;
   17588   DenseMap<SDValue, unsigned> VecInMap;
   17589   SmallVector<SDValue, 8> VecIns;
   17590   EVT VT = MVT::Other;
   17591 
    17592   // Recognize a special case where a vector is cast into a wide integer to
    17593   // test for all zeros.
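            // A sketch of the pattern being matched (names are illustrative):
            //   %e0 = extractelement <4 x i32> %v, 0
            //   %e1 = extractelement <4 x i32> %v, 1
            //   ...
            //   %r  = or (or %e0, %e1), (or %e2, %e3)  ; %r == 0 iff %v is all zero
            // which can be selected as a single PTEST of %v against itself.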
   17594   Opnds.push_back(N->getOperand(0));
   17595   Opnds.push_back(N->getOperand(1));
   17596 
   17597   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
   17598     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
   17599     // BFS traverse all OR'd operands.
   17600     if (I->getOpcode() == ISD::OR) {
   17601       Opnds.push_back(I->getOperand(0));
   17602       Opnds.push_back(I->getOperand(1));
   17603       // Re-evaluate the number of nodes to be traversed.
   17604       e += 2; // 2 more nodes (LHS and RHS) are pushed.
   17605       continue;
   17606     }
   17607 
    17608     // Quit if this is not an EXTRACT_VECTOR_ELT.
   17609     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   17610       return SDValue();
   17611 
    17612     // Quit if the index is not a constant.
   17613     SDValue Idx = I->getOperand(1);
   17614     if (!isa<ConstantSDNode>(Idx))
   17615       return SDValue();
   17616 
   17617     SDValue ExtractedFromVec = I->getOperand(0);
   17618     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
   17619     if (M == VecInMap.end()) {
   17620       VT = ExtractedFromVec.getValueType();
   17621       // Quit if not 128/256-bit vector.
   17622       if (!VT.is128BitVector() && !VT.is256BitVector())
   17623         return SDValue();
   17624       // Quit if not the same type.
   17625       if (VecInMap.begin() != VecInMap.end() &&
   17626           VT != VecInMap.begin()->first.getValueType())
   17627         return SDValue();
   17628       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
   17629       VecIns.push_back(ExtractedFromVec);
   17630     }
   17631     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
   17632   }
   17633 
   17634   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   17635          "Not extracted from 128-/256-bit vector.");
   17636 
   17637   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
   17638 
   17639   for (DenseMap<SDValue, unsigned>::const_iterator
   17640         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
   17641     // Quit if not all elements are used.
   17642     if (I->second != FullMask)
   17643       return SDValue();
   17644   }
   17645 
   17646   MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
   17647 
   17648   // Cast all vectors into TestVT for PTEST.
   17649   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
   17650     VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
   17651 
   17652   // If more than one full vector is evaluated, OR them first before PTEST.
   17653   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
   17654     // Each iteration will OR 2 nodes and append the result until there is only
   17655     // 1 node left, i.e. the final OR'd value of all vectors.
   17656     SDValue LHS = VecIns[Slot];
   17657     SDValue RHS = VecIns[Slot + 1];
   17658     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
   17659   }
   17660 
   17661   SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
   17662                             VecIns.back(), VecIns.back());
   17663   return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
   17664 }
   17665 
    17666 /// Return true if \c Op has a use that doesn't just read flags.
   17667 static bool hasNonFlagsUse(SDValue Op) {
   17668   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
   17669        ++UI) {
   17670     SDNode *User = *UI;
   17671     unsigned UOpNo = UI.getOperandNo();
   17672     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
    17673       // Look past the truncate.
   17674       UOpNo = User->use_begin().getOperandNo();
   17675       User = *User->use_begin();
   17676     }
   17677 
   17678     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
   17679         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
   17680       return true;
   17681   }
   17682   return false;
   17683 }
   17684 
   17685 /// Emit nodes that will be selected as "test Op0,Op0", or something
   17686 /// equivalent.
   17687 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
   17688                                     SelectionDAG &DAG) const {
   17689   // CF and OF aren't always set the way we want. Determine which
   17690   // of these we need.
   17691   bool NeedCF = false;
   17692   bool NeedOF = false;
   17693   switch (X86CC) {
   17694   default: break;
   17695   case X86::COND_A: case X86::COND_AE:
   17696   case X86::COND_B: case X86::COND_BE:
   17697     NeedCF = true;
   17698     break;
   17699   case X86::COND_G: case X86::COND_GE:
   17700   case X86::COND_L: case X86::COND_LE:
   17701   case X86::COND_O: case X86::COND_NO: {
    17702     // Check if we really need to set the Overflow flag.  If the
    17703     // NoSignedWrap flag is present, the Overflow flag is not
    17704     // actually needed.
   17705     switch (Op->getOpcode()) {
   17706     case ISD::ADD:
   17707     case ISD::SUB:
   17708     case ISD::MUL:
   17709     case ISD::SHL:
   17710       if (Op.getNode()->getFlags().hasNoSignedWrap())
   17711         break;
   17712       LLVM_FALLTHROUGH;
   17713     default:
   17714       NeedOF = true;
   17715       break;
   17716     }
   17717     break;
   17718   }
   17719   }
   17720   // See if we can use the EFLAGS value from the operand instead of
   17721   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
   17722   // we prove that the arithmetic won't overflow, we can't use OF or CF.
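            // For example (a sketch): for 'if ((a + b) == 0)' the ZF produced by the
            // ADD itself can feed the SETE/JE, so no separate TEST is needed;
            // equality checks use neither OF nor CF.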
   17723   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
   17724     // Emit a CMP with 0, which is the TEST pattern.
   17725     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   17726                        DAG.getConstant(0, dl, Op.getValueType()));
   17727   }
   17728   unsigned Opcode = 0;
   17729   unsigned NumOperands = 0;
   17730 
   17731   // Truncate operations may prevent the merge of the SETCC instruction
   17732   // and the arithmetic instruction before it. Attempt to truncate the operands
   17733   // of the arithmetic instruction and use a reduced bit-width instruction.
   17734   bool NeedTruncation = false;
   17735   SDValue ArithOp = Op;
   17736   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
   17737     SDValue Arith = Op->getOperand(0);
   17738     // Both the trunc and the arithmetic op need to have one user each.
   17739     if (Arith->hasOneUse())
   17740       switch (Arith.getOpcode()) {
   17741         default: break;
   17742         case ISD::ADD:
   17743         case ISD::SUB:
   17744         case ISD::AND:
   17745         case ISD::OR:
   17746         case ISD::XOR: {
   17747           NeedTruncation = true;
   17748           ArithOp = Arith;
   17749         }
   17750       }
   17751   }
   17752 
   17753   // Sometimes flags can be set either with an AND or with an SRL/SHL
   17754   // instruction. SRL/SHL variant should be preferred for masks longer than this
   17755   // number of bits.
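            // For example (a sketch of the intent): for a zero check,
            //   (and x, 0xFFFFFFFF00000000) == 0   can become   (srl x, 32) == 0
            // which avoids materializing a wide immediate mask.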
   17756   const int ShiftToAndMaxMaskWidth = 32;
   17757   const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
   17758 
    17759   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
    17760   // which may sit behind a truncate (a CAST).  We use the variable 'Op', the
    17761   // original (possibly truncated) value, when we check for possible users.
   17762   switch (ArithOp.getOpcode()) {
   17763   case ISD::ADD:
   17764     // We only want to rewrite this as a target-specific node with attached
   17765     // flags if there is a reasonable chance of either using that to do custom
    17766     // instruction selection that can fold some of the memory operands, or if
   17767     // only the flags are used. If there are other uses, leave the node alone
   17768     // and emit a test instruction.
   17769     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   17770          UE = Op.getNode()->use_end(); UI != UE; ++UI)
   17771       if (UI->getOpcode() != ISD::CopyToReg &&
   17772           UI->getOpcode() != ISD::SETCC &&
   17773           UI->getOpcode() != ISD::STORE)
   17774         goto default_case;
   17775 
   17776     if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
   17777       // An add of one will be selected as an INC.
   17778       if (C->isOne() &&
   17779           (!Subtarget.slowIncDec() ||
   17780            DAG.getMachineFunction().getFunction().optForSize())) {
   17781         Opcode = X86ISD::INC;
   17782         NumOperands = 1;
   17783         break;
   17784       }
   17785 
   17786       // An add of negative one (subtract of one) will be selected as a DEC.
   17787       if (C->isAllOnesValue() &&
   17788           (!Subtarget.slowIncDec() ||
   17789            DAG.getMachineFunction().getFunction().optForSize())) {
   17790         Opcode = X86ISD::DEC;
   17791         NumOperands = 1;
   17792         break;
   17793       }
   17794     }
   17795 
   17796     // Otherwise use a regular EFLAGS-setting add.
   17797     Opcode = X86ISD::ADD;
   17798     NumOperands = 2;
   17799     break;
   17800   case ISD::SHL:
   17801   case ISD::SRL:
   17802     // If we have a constant logical shift that's only used in a comparison
   17803     // against zero turn it into an equivalent AND. This allows turning it into
   17804     // a TEST instruction later.
   17805     if (ZeroCheck && Op->hasOneUse() &&
   17806         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
   17807       EVT VT = Op.getValueType();
   17808       unsigned BitWidth = VT.getSizeInBits();
   17809       unsigned ShAmt = Op->getConstantOperandVal(1);
   17810       if (ShAmt >= BitWidth) // Avoid undefined shifts.
   17811         break;
   17812       APInt Mask = ArithOp.getOpcode() == ISD::SRL
   17813                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
   17814                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
   17815       if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
   17816         break;
   17817       Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
   17818                        DAG.getConstant(Mask, dl, VT));
   17819     }
   17820     break;
   17821 
   17822   case ISD::AND:
   17823     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
   17824     // because a TEST instruction will be better. However, AND should be
   17825     // preferred if the instruction can be combined into ANDN.
   17826     if (!hasNonFlagsUse(Op)) {
   17827       SDValue Op0 = ArithOp->getOperand(0);
   17828       SDValue Op1 = ArithOp->getOperand(1);
   17829       EVT VT = ArithOp.getValueType();
   17830       bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
   17831       bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
   17832       bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
   17833 
   17834       // If we cannot select an ANDN instruction, check if we can replace
   17835       // AND+IMM64 with a shift before giving up. This is possible for masks
   17836       // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
   17837       if (!isProperAndn) {
   17838         if (!ZeroCheck)
   17839           break;
   17840 
   17841         assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
   17842         auto *CN = dyn_cast<ConstantSDNode>(Op1);
   17843         if (!CN)
   17844           break;
   17845 
   17846         const APInt &Mask = CN->getAPIntValue();
   17847         if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
   17848           break; // Prefer TEST instruction.
   17849 
   17850         unsigned BitWidth = Mask.getBitWidth();
   17851         unsigned LeadingOnes = Mask.countLeadingOnes();
   17852         unsigned TrailingZeros = Mask.countTrailingZeros();
   17853 
   17854         if (LeadingOnes + TrailingZeros == BitWidth) {
   17855           assert(TrailingZeros < VT.getSizeInBits() &&
   17856                  "Shift amount should be less than the type width");
   17857           MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
   17858           SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
   17859           Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
   17860           break;
   17861         }
   17862 
   17863         unsigned LeadingZeros = Mask.countLeadingZeros();
   17864         unsigned TrailingOnes = Mask.countTrailingOnes();
   17865 
   17866         if (LeadingZeros + TrailingOnes == BitWidth) {
   17867           assert(LeadingZeros < VT.getSizeInBits() &&
   17868                  "Shift amount should be less than the type width");
   17869           MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
   17870           SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
   17871           Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
   17872           break;
   17873         }
   17874 
   17875         break;
   17876       }
   17877     }
   17878     LLVM_FALLTHROUGH;
   17879   case ISD::SUB:
   17880   case ISD::OR:
   17881   case ISD::XOR:
   17882     // Similar to ISD::ADD above, check if the uses will preclude useful
   17883     // lowering of the target-specific node.
   17884     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
   17885            UE = Op.getNode()->use_end(); UI != UE; ++UI)
   17886       if (UI->getOpcode() != ISD::CopyToReg &&
   17887           UI->getOpcode() != ISD::SETCC &&
   17888           UI->getOpcode() != ISD::STORE)
   17889         goto default_case;
   17890 
   17891     // Otherwise use a regular EFLAGS-setting instruction.
   17892     switch (ArithOp.getOpcode()) {
   17893     default: llvm_unreachable("unexpected operator!");
   17894     case ISD::SUB: Opcode = X86ISD::SUB; break;
   17895     case ISD::XOR: Opcode = X86ISD::XOR; break;
   17896     case ISD::AND: Opcode = X86ISD::AND; break;
   17897     case ISD::OR:  Opcode = X86ISD::OR;  break;
   17898     }
   17899 
   17900     NumOperands = 2;
   17901     break;
   17902   case X86ISD::ADD:
   17903   case X86ISD::SUB:
   17904   case X86ISD::INC:
   17905   case X86ISD::DEC:
   17906   case X86ISD::OR:
   17907   case X86ISD::XOR:
   17908   case X86ISD::AND:
   17909     return SDValue(Op.getNode(), 1);
   17910   default:
   17911   default_case:
   17912     break;
   17913   }
   17914 
   17915   // If we found that truncation is beneficial, perform the truncation and
   17916   // update 'Op'.
   17917   if (NeedTruncation) {
   17918     EVT VT = Op.getValueType();
   17919     SDValue WideVal = Op->getOperand(0);
   17920     EVT WideVT = WideVal.getValueType();
   17921     unsigned ConvertedOp = 0;
   17922     // Use a target machine opcode to prevent further DAGCombine
   17923     // optimizations that may separate the arithmetic operations
   17924     // from the setcc node.
   17925     switch (WideVal.getOpcode()) {
   17926       default: break;
   17927       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
   17928       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
   17929       case ISD::AND: ConvertedOp = X86ISD::AND; break;
   17930       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
   17931       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
   17932     }
   17933 
   17934     if (ConvertedOp) {
   17935       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   17936       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
   17937         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
   17938         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
   17939         SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   17940         Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
   17941       }
   17942     }
   17943   }
   17944 
   17945   if (Opcode == 0) {
   17946     // Emit a CMP with 0, which is the TEST pattern.
   17947     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
   17948                        DAG.getConstant(0, dl, Op.getValueType()));
   17949   }
   17950   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
   17951   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
   17952 
   17953   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
   17954   DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
   17955   return SDValue(New.getNode(), 1);
   17956 }
   17957 
   17958 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
   17959 /// equivalent.
   17960 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   17961                                    const SDLoc &dl, SelectionDAG &DAG) const {
   17962   if (isNullConstant(Op1))
   17963     return EmitTest(Op0, X86CC, dl, DAG);
   17964 
   17965   assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
   17966          "Unexpected comparison operation for MVT::i1 operands");
   17967 
   17968   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
   17969        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    17970     // Only promote the compare up to i32 if it is a 16-bit operation
    17971     // with an immediate.  16-bit immediates are to be avoided.
   17972     if ((Op0.getValueType() == MVT::i16 &&
   17973          (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
   17974         !DAG.getMachineFunction().getFunction().optForMinSize() &&
   17975         !Subtarget.isAtom()) {
   17976       unsigned ExtendOp =
   17977           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
   17978       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
   17979       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
   17980     }
   17981     // Use SUB instead of CMP to enable CSE between SUB and CMP.
   17982     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
   17983     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
   17984     return SDValue(Sub.getNode(), 1);
   17985   }
   17986   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
   17987 }
   17988 
   17989 /// Convert a comparison if required by the subtarget.
   17990 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
   17991                                                  SelectionDAG &DAG) const {
   17992   // If the subtarget does not support the FUCOMI instruction, floating-point
   17993   // comparisons have to be converted.
   17994   if (Subtarget.hasCMov() ||
   17995       Cmp.getOpcode() != X86ISD::CMP ||
   17996       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
   17997       !Cmp.getOperand(1).getValueType().isFloatingPoint())
   17998     return Cmp;
   17999 
   18000   // The instruction selector will select an FUCOM instruction instead of
   18001   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
   18002   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
   18003   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
   18004   SDLoc dl(Cmp);
   18005   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
   18006   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
   18007   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
   18008                             DAG.getConstant(8, dl, MVT::i8));
   18009   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
   18010 
   18011   // Some 64-bit targets lack SAHF support, but they do support FCOMI.
   18012   assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
   18013   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
   18014 }
   18015 
   18016 /// Check if replacement of SQRT with RSQRT should be disabled.
   18017 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
   18018   EVT VT = Op.getValueType();
   18019 
   18020   // We never want to use both SQRT and RSQRT instructions for the same input.
   18021   if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
   18022     return false;
   18023 
   18024   if (VT.isVector())
   18025     return Subtarget.hasFastVectorFSQRT();
   18026   return Subtarget.hasFastScalarFSQRT();
   18027 }
   18028 
   18029 /// The minimum architected relative accuracy is 2^-12. We need one
   18030 /// Newton-Raphson step to have a good float result (24 bits of precision).
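          /// (For reference: one Newton-Raphson step for 1/sqrt(a) is
          ///  x1 = x0 * (1.5 - 0.5 * a * x0 * x0); the refinement itself is
          ///  expanded elsewhere based on RefinementSteps.)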
   18031 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
   18032                                            SelectionDAG &DAG, int Enabled,
   18033                                            int &RefinementSteps,
   18034                                            bool &UseOneConstNR,
   18035                                            bool Reciprocal) const {
   18036   EVT VT = Op.getValueType();
   18037 
   18038   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
   18039   // It is likely not profitable to do this for f64 because a double-precision
   18040   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   18041   // instructions: convert to single, rsqrtss, convert back to double, refine
   18042   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   18043   // along with FMA, this could be a throughput win.
   18044   // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
   18045   // after legalize types.
   18046   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
   18047       (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
   18048       (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
   18049       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
   18050       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
   18051     if (RefinementSteps == ReciprocalEstimate::Unspecified)
   18052       RefinementSteps = 1;
   18053 
   18054     UseOneConstNR = false;
    18055     // There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
   18056     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
   18057     return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
   18058   }
   18059   return SDValue();
   18060 }
   18061 
   18062 /// The minimum architected relative accuracy is 2^-12. We need one
   18063 /// Newton-Raphson step to have a good float result (24 bits of precision).
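          /// (For reference: one Newton-Raphson step for 1/a is
          ///  x1 = x0 * (2.0 - a * x0); the refinement itself is expanded
          ///  elsewhere based on RefinementSteps.)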
   18064 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
   18065                                             int Enabled,
   18066                                             int &RefinementSteps) const {
   18067   EVT VT = Op.getValueType();
   18068 
   18069   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   18070   // It is likely not profitable to do this for f64 because a double-precision
   18071   // reciprocal estimate with refinement on x86 prior to FMA requires
   18072   // 15 instructions: convert to single, rcpss, convert back to double, refine
   18073   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   18074   // along with FMA, this could be a throughput win.
   18075 
   18076   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
   18077       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
   18078       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
   18079       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
   18080     // Enable estimate codegen with 1 refinement step for vector division.
   18081     // Scalar division estimates are disabled because they break too much
   18082     // real-world code. These defaults are intended to match GCC behavior.
   18083     if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
   18084       return SDValue();
   18085 
   18086     if (RefinementSteps == ReciprocalEstimate::Unspecified)
   18087       RefinementSteps = 1;
   18088 
    18089     // There is no FRCP for 512-bit vectors, but there is RCP14.
   18090     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
   18091     return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
   18092   }
   18093   return SDValue();
   18094 }
   18095 
   18096 /// If we have at least two divisions that use the same divisor, convert to
   18097 /// multiplication by a reciprocal. This may need to be adjusted for a given
   18098 /// CPU if a division's cost is not at least twice the cost of a multiplication.
   18099 /// This is because we still need one division to calculate the reciprocal and
   18100 /// then we need two multiplies by that reciprocal as replacements for the
   18101 /// original divisions.
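          /// (Illustrative example: a/b + c/b  ->  r = 1.0/b; a*r + c*r, trading
          /// two divisions for one division and two multiplications.)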
   18102 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
   18103   return 2;
   18104 }
   18105 
   18106 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
   18107 /// according to equal/not-equal condition code \p CC.
   18108 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
   18109                                    const SDLoc &dl, SelectionDAG &DAG) {
   18110   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
   18111   // instruction.  Since the shift amount is in-range-or-undefined, we know
   18112   // that doing a bittest on the i32 value is ok.  We extend to i32 because
   18113   // the encoding for the i16 version is larger than the i32 version.
    18114   // Also promote i16 to i32 for performance / code size reasons.
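            // A rough sketch of the result for an i8/i16 source: the source is
            // any-extended to i32 and the nodes are selected as something like
            //   bt   %src32, %bitno
            //   setb/setae %dst        ; SETB for SETNE, SETAE for SETEQ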
   18115   if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
   18116     Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
   18117 
   18118   // See if we can use the 32-bit instruction instead of the 64-bit one for a
   18119   // shorter encoding. Since the former takes the modulo 32 of BitNo and the
   18120   // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
   18121   // known to be zero.
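            // E.g. when bit 5 of BitNo is known zero, the 32-bit and 64-bit forms
            // test the same bit of the (truncated) source, and the 32-bit form has
            // a shorter encoding (no REX.W prefix).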
   18122   if (Src.getValueType() == MVT::i64 &&
   18123       DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
   18124     Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
   18125 
   18126   // If the operand types disagree, extend the shift amount to match.  Since
   18127   // BT ignores high bits (like shifts) we can use anyextend.
   18128   if (Src.getValueType() != BitNo.getValueType())
   18129     BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
   18130 
   18131   SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
   18132   X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
   18133   return getSETCC(Cond, BT, dl , DAG);
   18134 }
   18135 
   18136 /// Result of 'and' is compared against zero. Change to a BT node if possible.
   18137 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
   18138                             const SDLoc &dl, SelectionDAG &DAG) {
   18139   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
   18140   SDValue Op0 = And.getOperand(0);
   18141   SDValue Op1 = And.getOperand(1);
   18142   if (Op0.getOpcode() == ISD::TRUNCATE)
   18143     Op0 = Op0.getOperand(0);
   18144   if (Op1.getOpcode() == ISD::TRUNCATE)
   18145     Op1 = Op1.getOperand(0);
   18146 
   18147   SDValue LHS, RHS;
   18148   if (Op1.getOpcode() == ISD::SHL)
   18149     std::swap(Op0, Op1);
   18150   if (Op0.getOpcode() == ISD::SHL) {
   18151     if (isOneConstant(Op0.getOperand(0))) {
   18152       // If we looked past a truncate, check that it's only truncating away
   18153       // known zeros.
   18154       unsigned BitWidth = Op0.getValueSizeInBits();
   18155       unsigned AndBitWidth = And.getValueSizeInBits();
   18156       if (BitWidth > AndBitWidth) {
   18157         KnownBits Known;
   18158         DAG.computeKnownBits(Op0, Known);
   18159         if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
   18160           return SDValue();
   18161       }
   18162       LHS = Op1;
   18163       RHS = Op0.getOperand(1);
   18164     }
   18165   } else if (Op1.getOpcode() == ISD::Constant) {
   18166     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
   18167     uint64_t AndRHSVal = AndRHS->getZExtValue();
   18168     SDValue AndLHS = Op0;
   18169 
   18170     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
   18171       LHS = AndLHS.getOperand(0);
   18172       RHS = AndLHS.getOperand(1);
   18173     } else {
   18174       // Use BT if the immediate can't be encoded in a TEST instruction or we
    18175       // are optimizing for size and the immediate won't fit in a byte.
   18176       bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
   18177       if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
   18178           isPowerOf2_64(AndRHSVal)) {
   18179         LHS = AndLHS;
   18180         RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
   18181       }
   18182     }
   18183   }
   18184 
   18185   if (LHS.getNode())
   18186     return getBitTestCondition(LHS, RHS, CC, dl, DAG);
   18187 
   18188   return SDValue();
   18189 }
   18190 
   18191 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
   18192 /// CMPs.
   18193 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
   18194                                    SDValue &Op1) {
   18195   unsigned SSECC;
   18196   bool Swap = false;
   18197 
   18198   // SSE Condition code mapping:
   18199   //  0 - EQ
   18200   //  1 - LT
   18201   //  2 - LE
   18202   //  3 - UNORD
   18203   //  4 - NEQ
   18204   //  5 - NLT
   18205   //  6 - NLE
   18206   //  7 - ORD
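            // Values 8 and 12 returned below for SETUEQ/SETONE are AVX-only
            // immediates (roughly, EQ_UQ and NEQ_OQ); pre-AVX callers split them
            // into two compares plus a logic op (see LowerVSETCC).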
   18207   switch (SetCCOpcode) {
   18208   default: llvm_unreachable("Unexpected SETCC condition");
   18209   case ISD::SETOEQ:
   18210   case ISD::SETEQ:  SSECC = 0; break;
   18211   case ISD::SETOGT:
   18212   case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
   18213   case ISD::SETLT:
   18214   case ISD::SETOLT: SSECC = 1; break;
   18215   case ISD::SETOGE:
   18216   case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
   18217   case ISD::SETLE:
   18218   case ISD::SETOLE: SSECC = 2; break;
   18219   case ISD::SETUO:  SSECC = 3; break;
   18220   case ISD::SETUNE:
   18221   case ISD::SETNE:  SSECC = 4; break;
   18222   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
   18223   case ISD::SETUGE: SSECC = 5; break;
   18224   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
   18225   case ISD::SETUGT: SSECC = 6; break;
   18226   case ISD::SETO:   SSECC = 7; break;
   18227   case ISD::SETUEQ: SSECC = 8; break;
   18228   case ISD::SETONE: SSECC = 12; break;
   18229   }
   18230   if (Swap)
   18231     std::swap(Op0, Op1);
   18232 
   18233   return SSECC;
   18234 }
   18235 
    18236 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
    18237 /// concatenate the result back.
   18238 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
   18239   MVT VT = Op.getSimpleValueType();
   18240 
   18241   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
   18242          "Unsupported value type for operation");
   18243 
   18244   unsigned NumElems = VT.getVectorNumElements();
   18245   SDLoc dl(Op);
   18246   SDValue CC = Op.getOperand(2);
   18247 
   18248   // Extract the LHS vectors
   18249   SDValue LHS = Op.getOperand(0);
   18250   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
   18251   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
   18252 
   18253   // Extract the RHS vectors
   18254   SDValue RHS = Op.getOperand(1);
   18255   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
   18256   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
   18257 
   18258   // Issue the operation on the smaller types and concatenate the result back
   18259   MVT EltVT = VT.getVectorElementType();
   18260   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   18261   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   18262                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
   18263                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
   18264 }
   18265 
   18266 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
   18267 
   18268   SDValue Op0 = Op.getOperand(0);
   18269   SDValue Op1 = Op.getOperand(1);
   18270   SDValue CC = Op.getOperand(2);
   18271   MVT VT = Op.getSimpleValueType();
   18272   SDLoc dl(Op);
   18273 
   18274   assert(VT.getVectorElementType() == MVT::i1 &&
   18275          "Cannot set masked compare for this operation");
   18276 
   18277   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   18278 
   18279   // If this is a seteq make sure any build vectors of all zeros are on the RHS.
   18280   // This helps with vptestm matching.
   18281   // TODO: Should we just canonicalize the setcc during DAG combine?
   18282   if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
   18283       ISD::isBuildVectorAllZeros(Op0.getNode()))
   18284     std::swap(Op0, Op1);
   18285 
   18286   // Prefer SETGT over SETLT.
   18287   if (SetCCOpcode == ISD::SETLT) {
   18288     SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
   18289     std::swap(Op0, Op1);
   18290   }
   18291 
   18292   return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
   18293 }
   18294 
   18295 /// Try to turn a VSETULT into a VSETULE by modifying its second
   18296 /// operand \p Op1.  If non-trivial (for example because it's not constant)
   18297 /// return an empty value.
   18298 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
   18299                                       SelectionDAG &DAG) {
   18300   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
   18301   if (!BV)
   18302     return SDValue();
   18303 
   18304   MVT VT = Op1.getSimpleValueType();
   18305   MVT EVT = VT.getVectorElementType();
   18306   unsigned n = VT.getVectorNumElements();
   18307   SmallVector<SDValue, 8> ULTOp1;
   18308 
   18309   for (unsigned i = 0; i < n; ++i) {
   18310     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
   18311     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
   18312       return SDValue();
   18313 
   18314     // Avoid underflow.
   18315     APInt Val = Elt->getAPIntValue();
   18316     if (Val == 0)
   18317       return SDValue();
   18318 
   18319     ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
   18320   }
   18321 
   18322   return DAG.getBuildVector(VT, dl, ULTOp1);
   18323 }
   18324 
   18325 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
   18326 /// Op0 u<= Op1:
   18327 ///   t = psubus Op0, Op1
   18328 ///   pcmpeq t, <0..0>
   18329 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
   18330                                     ISD::CondCode Cond, const SDLoc &dl,
   18331                                     const X86Subtarget &Subtarget,
   18332                                     SelectionDAG &DAG) {
   18333   if (!Subtarget.hasSSE2())
   18334     return SDValue();
   18335 
   18336   MVT VET = VT.getVectorElementType();
   18337   if (VET != MVT::i8 && VET != MVT::i16)
   18338     return SDValue();
   18339 
   18340   switch (Cond) {
   18341   default:
   18342     return SDValue();
   18343   case ISD::SETULT: {
    18344     // If the comparison is against a constant, we can turn this into a
    18345     // setule.  With psubus, setule does not require a swap.  This is
    18346     // beneficial because the constant in the register is no longer
    18347     // clobbered as the destination, so it can be hoisted out of a loop.
    18348     // Only do this pre-AVX, since AVX compares are not destructive.
   18349     if (Subtarget.hasAVX())
   18350       return SDValue();
   18351     SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
   18352     if (!ULEOp1)
   18353       return SDValue();
   18354     Op1 = ULEOp1;
   18355     break;
   18356   }
   18357   // Psubus is better than flip-sign because it requires no inversion.
   18358   case ISD::SETUGE:
   18359     std::swap(Op0, Op1);
   18360     break;
   18361   case ISD::SETULE:
   18362     break;
   18363   }
   18364 
   18365   SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
   18366   return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
   18367                      getZeroVector(VT, Subtarget, DAG, dl));
   18368 }
   18369 
   18370 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
   18371                            SelectionDAG &DAG) {
   18372   SDValue Op0 = Op.getOperand(0);
   18373   SDValue Op1 = Op.getOperand(1);
   18374   SDValue CC = Op.getOperand(2);
   18375   MVT VT = Op.getSimpleValueType();
   18376   ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
   18377   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
   18378   SDLoc dl(Op);
   18379 
   18380   if (isFP) {
   18381 #ifndef NDEBUG
   18382     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
   18383     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
   18384 #endif
   18385 
   18386     unsigned Opc;
   18387     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
   18388       assert(VT.getVectorNumElements() <= 16);
   18389       Opc = X86ISD::CMPM;
   18390     } else {
   18391       Opc = X86ISD::CMPP;
   18392       // The SSE/AVX packed FP comparison nodes are defined with a
   18393       // floating-point vector result that matches the operand type. This allows
   18394       // them to work with an SSE1 target (integer vector types are not legal).
   18395       VT = Op0.getSimpleValueType();
   18396     }
   18397 
   18398     // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
   18399     // emit two comparisons and a logic op to tie them together.
   18400     SDValue Cmp;
   18401     unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
   18402     if (SSECC >= 8 && !Subtarget.hasAVX()) {
   18403       // LLVM predicate is SETUEQ or SETONE.
   18404       unsigned CC0, CC1;
   18405       unsigned CombineOpc;
   18406       if (Cond == ISD::SETUEQ) {
   18407         CC0 = 3; // UNORD
   18408         CC1 = 0; // EQ
   18409         CombineOpc = X86ISD::FOR;
   18410       } else {
   18411         assert(Cond == ISD::SETONE);
   18412         CC0 = 7; // ORD
   18413         CC1 = 4; // NEQ
   18414         CombineOpc = X86ISD::FAND;
   18415       }
   18416 
   18417       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   18418                                  DAG.getConstant(CC0, dl, MVT::i8));
   18419       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
   18420                                  DAG.getConstant(CC1, dl, MVT::i8));
   18421       Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
   18422     } else {
   18423       // Handle all other FP comparisons here.
   18424       Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
   18425                         DAG.getConstant(SSECC, dl, MVT::i8));
   18426     }
   18427 
   18428     // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
   18429     // result type of SETCC. The bitcast is expected to be optimized away
   18430     // during combining/isel.
   18431     if (Opc == X86ISD::CMPP)
   18432       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
   18433 
   18434     return Cmp;
   18435   }
   18436 
   18437   MVT VTOp0 = Op0.getSimpleValueType();
   18438   assert(VTOp0 == Op1.getSimpleValueType() &&
   18439          "Expected operands with same type!");
   18440   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
   18441          "Invalid number of packed elements for source and destination!");
   18442 
   18443   // This is being called by type legalization because v2i32 is marked custom
   18444   // for result type legalization for v2f32.
   18445   if (VTOp0 == MVT::v2i32)
   18446     return SDValue();
   18447 
   18448   // The non-AVX512 code below works under the assumption that source and
   18449   // destination types are the same.
   18450   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
   18451          "Value types for source and destination must be the same!");
   18452 
   18453   // Break 256-bit integer vector compare into smaller ones.
   18454   if (VT.is256BitVector() && !Subtarget.hasInt256())
   18455     return Lower256IntVSETCC(Op, DAG);
   18456 
   18457   // The result is boolean, but the operands are int/float.
   18458   if (VT.getVectorElementType() == MVT::i1) {
   18459     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
   18460     // but there is no compare instruction for i8 and i16 elements in KNL.
   18461     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
   18462            "Unexpected operand type");
   18463     return LowerIntVSETCC_AVX512(Op, DAG);
   18464   }
   18465 
   18466   // Lower using XOP integer comparisons.
   18467   if (VT.is128BitVector() && Subtarget.hasXOP()) {
   18468     // Translate compare code to XOP PCOM compare mode.
   18469     unsigned CmpMode = 0;
   18470     switch (Cond) {
   18471     default: llvm_unreachable("Unexpected SETCC condition");
   18472     case ISD::SETULT:
   18473     case ISD::SETLT: CmpMode = 0x00; break;
   18474     case ISD::SETULE:
   18475     case ISD::SETLE: CmpMode = 0x01; break;
   18476     case ISD::SETUGT:
   18477     case ISD::SETGT: CmpMode = 0x02; break;
   18478     case ISD::SETUGE:
   18479     case ISD::SETGE: CmpMode = 0x03; break;
   18480     case ISD::SETEQ: CmpMode = 0x04; break;
   18481     case ISD::SETNE: CmpMode = 0x05; break;
   18482     }
   18483 
   18484     // Are we comparing unsigned or signed integers?
   18485     unsigned Opc =
   18486         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
   18487 
   18488     return DAG.getNode(Opc, dl, VT, Op0, Op1,
   18489                        DAG.getConstant(CmpMode, dl, MVT::i8));
   18490   }
   18491 
   18492   // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
   18493   // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
   18494   if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
   18495     SDValue BC0 = peekThroughBitcasts(Op0);
   18496     if (BC0.getOpcode() == ISD::AND) {
   18497       APInt UndefElts;
   18498       SmallVector<APInt, 64> EltBits;
   18499       if (getTargetConstantBitsFromNode(BC0.getOperand(1),
   18500                                         VT.getScalarSizeInBits(), UndefElts,
   18501                                         EltBits, false, false)) {
   18502         if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
   18503           Cond = ISD::SETEQ;
   18504           Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
   18505         }
   18506       }
   18507     }
   18508   }
   18509 
   18510   // If this is a SETNE against the signed minimum value, change it to SETGT.
   18511   // If this is a SETNE against the signed maximum value, change it to SETLT,
   18512   // which will be swapped to SETGT below.
   18513   // Otherwise we use PCMPEQ+invert.
   18514   APInt ConstValue;
   18515   if (Cond == ISD::SETNE &&
   18516       ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
   18517     if (ConstValue.isMinSignedValue())
   18518       Cond = ISD::SETGT;
   18519     else if (ConstValue.isMaxSignedValue())
   18520       Cond = ISD::SETLT;
   18521   }
   18522 
   18523   // If both operands are known non-negative, then an unsigned compare is the
   18524   // same as a signed compare and there's no need to flip signbits.
   18525   // TODO: We could check for more general simplifications here since we're
   18526   // computing known bits.
   18527   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
   18528                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
   18529 
   18530   // Special case: Use min/max operations for unsigned compares. We only want
   18531   // to do this for unsigned compares if we need to flip signs or if it allows
   18532   // us to avoid an invert.
   18533   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   18534   if (ISD::isUnsignedIntSetCC(Cond) &&
   18535       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
   18536       TLI.isOperationLegal(ISD::UMIN, VT)) {
   18537     bool Invert = false;
   18538     unsigned Opc;
   18539     switch (Cond) {
   18540     default: llvm_unreachable("Unexpected condition code");
   18541     case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
   18542     case ISD::SETULE: Opc = ISD::UMIN; break;
   18543     case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
   18544     case ISD::SETUGE: Opc = ISD::UMAX; break;
   18545     }
   18546 
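             // Op0 u<= Op1 iff umin(Op0, Op1) == Op0, and Op0 u>= Op1 iff
             // umax(Op0, Op1) == Op0; the strict forms are handled by inverting.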
   18547     SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
   18548     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
   18549 
   18550     // If the logical-not of the result is required, perform that now.
   18551     if (Invert)
   18552       Result = DAG.getNOT(dl, Result, VT);
   18553 
   18554     return Result;
   18555   }
   18556 
   18557   // Try to use SUBUS and PCMPEQ.
   18558   if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
   18559     return V;
   18560 
   18561   // We are handling one of the integer comparisons here. Since SSE only has
   18562   // GT and EQ comparisons for integer, swapping operands and multiple
   18563   // operations may be required for some comparisons.
   18564   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
   18565                                                             : X86ISD::PCMPGT;
   18566   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
   18567               Cond == ISD::SETGE || Cond == ISD::SETUGE;
   18568   bool Invert = Cond == ISD::SETNE ||
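           // SETNE is lowered as PCMPEQ plus an invert; the non-strict orderings
           // (SETGE/SETLE/SETUGE/SETULE) become the complementary strict compare
           // plus an invert.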
   18569                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
   18570 
   18571   if (Swap)
   18572     std::swap(Op0, Op1);
   18573 
   18574   // Check that the operation in question is available (most are plain SSE2,
   18575   // but PCMPGTQ and PCMPEQQ have different requirements).
   18576   if (VT == MVT::v2i64) {
   18577     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
   18578       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
   18579 
   18580       // First cast everything to the right type.
   18581       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
   18582       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
   18583 
   18584       // Since SSE has no unsigned integer comparisons, we need to flip the sign
   18585       // bits of the inputs before performing those operations. The lower
   18586       // compare is always unsigned.
   18587       SDValue SB;
   18588       if (FlipSigns) {
   18589         SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
   18590       } else {
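                 // Only the low dwords (elements 0 and 2) get the sign-bit flip here:
                 // the low halves of a 64-bit compare are always compared unsigned.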
   18591         SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
   18592         SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
   18593         SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
   18594       }
   18595       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
   18596       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
   18597 
   18598       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
   18599       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
   18600       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
   18601 
   18602       // Create masks for only the low parts/high parts of the 64 bit integers.
   18603       static const int MaskHi[] = { 1, 1, 3, 3 };
   18604       static const int MaskLo[] = { 0, 0, 2, 2 };
   18605       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
   18606       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
   18607       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
   18608 
   18609       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
   18610       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
   18611 
   18612       if (Invert)
   18613         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   18614 
   18615       return DAG.getBitcast(VT, Result);
   18616     }
   18617 
   18618     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
   18619       // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
   18620       // pcmpeqd + pshufd + pand.
   18621       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
   18622 
   18623       // First cast everything to the right type.
   18624       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
   18625       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
   18626 
   18627       // Do the compare.
   18628       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
   18629 
   18630       // Make sure the lower and upper halves are both all-ones.
   18631       static const int Mask[] = { 1, 0, 3, 2 };
   18632       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
   18633       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
   18634 
   18635       if (Invert)
   18636         Result = DAG.getNOT(dl, Result, MVT::v4i32);
   18637 
   18638       return DAG.getBitcast(VT, Result);
   18639     }
   18640   }
   18641 
   18642   // Since SSE has no unsigned integer comparisons, we need to flip the sign
   18643   // bits of the inputs before performing those operations.
   18644   if (FlipSigns) {
   18645     MVT EltVT = VT.getVectorElementType();
   18646     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
   18647                                  VT);
   18648     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
   18649     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
   18650   }
   18651 
   18652   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
   18653 
   18654   // If the logical-not of the result is required, perform that now.
   18655   if (Invert)
   18656     Result = DAG.getNOT(dl, Result, VT);
   18657 
   18658   return Result;
   18659 }
   18660 
   18661 // Try to select this as a KTEST+SETCC if possible.
   18662 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
   18663                          const SDLoc &dl, SelectionDAG &DAG,
   18664                          const X86Subtarget &Subtarget) {
   18665   // Only support equality comparisons.
   18666   if (CC != ISD::SETEQ && CC != ISD::SETNE)
   18667     return SDValue();
   18668 
   18669   // Must be a bitcast from vXi1.
   18670   if (Op0.getOpcode() != ISD::BITCAST)
   18671     return SDValue();
   18672 
   18673   Op0 = Op0.getOperand(0);
   18674   MVT VT = Op0.getSimpleValueType();
   18675   if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
   18676       !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
   18677       !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
   18678     return SDValue();
   18679 
   18680   X86::CondCode X86CC;
   18681   if (isNullConstant(Op1)) {
   18682     X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
   18683   } else if (isAllOnesConstant(Op1)) {
   18684     // C flag is set for all ones.
   18685     X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
   18686   } else
   18687     return SDValue();
   18688 
   18689   // If the input is an OR, we can combine its operands into the KORTEST.
   18690   SDValue LHS = Op0;
   18691   SDValue RHS = Op0;
   18692   if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
   18693     LHS = Op0.getOperand(0);
   18694     RHS = Op0.getOperand(1);
   18695   }
   18696 
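           // KORTEST sets ZF when (LHS | RHS) is all zeros and CF when it is all
           // ones; if Op0 was not an OR, LHS == RHS == Op0 and this just tests Op0.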
   18697   SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
   18698   return getSETCC(X86CC, KORTEST, dl, DAG);
   18699 }
   18700 
   18701 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   18702 
   18703   MVT VT = Op.getSimpleValueType();
   18704 
   18705   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
   18706 
   18707   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
   18708   SDValue Op0 = Op.getOperand(0);
   18709   SDValue Op1 = Op.getOperand(1);
   18710   SDLoc dl(Op);
   18711   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   18712 
   18713   // Optimize to BT if possible.
   18714   // Lower (X & (1 << N)) == 0 to BT(X, N).
   18715   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
   18716   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
   18717   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
   18718       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   18719     if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
   18720       return NewSetCC;
   18721   }
   18722 
   18723   // Try to use PTEST for a tree of ORs equality-compared with 0.
   18724   // TODO: We could do AND tree with all 1s as well by using the C flag.
   18725   if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
   18726       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   18727     if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
   18728       return NewSetCC;
   18729   }
   18730 
   18731   // Try to lower using KTEST.
   18732   if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
   18733     return NewSetCC;
   18734 
   18735   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
   18736   // these.
   18737   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
   18738       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
   18739 
   18740     // If the input is a setcc, then reuse the input setcc or use a new one with
   18741     // the inverted condition.
   18742     if (Op0.getOpcode() == X86ISD::SETCC) {
   18743       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
   18744       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
   18745       if (!Invert)
   18746         return Op0;
   18747 
   18748       CCode = X86::GetOppositeBranchCondition(CCode);
   18749       return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
   18750     }
   18751   }
   18752 
   18753   bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
   18754   X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
   18755   if (X86CC == X86::COND_INVALID)
   18756     return SDValue();
   18757 
   18758   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
   18759   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
   18760   return getSETCC(X86CC, EFLAGS, dl, DAG);
   18761 }
   18762 
   18763 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
   18764   SDValue LHS = Op.getOperand(0);
   18765   SDValue RHS = Op.getOperand(1);
   18766   SDValue Carry = Op.getOperand(2);
   18767   SDValue Cond = Op.getOperand(3);
   18768   SDLoc DL(Op);
   18769 
   18770   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
   18771   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
   18772 
   18773   // Recreate the carry if needed.
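           // Adding all-ones to the carry value produces a hardware carry-out (CF)
           // exactly when the incoming carry is nonzero, which the SBB below consumes.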
   18774   EVT CarryVT = Carry.getValueType();
   18775   APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
   18776   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
   18777                       Carry, DAG.getConstant(NegOne, DL, CarryVT));
   18778 
   18779   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   18780   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
   18781   return getSETCC(CC, Cmp.getValue(1), DL, DAG);
   18782 }
   18783 
   18784 /// Return true if opcode is a X86 logical comparison.
   18785 static bool isX86LogicalCmp(SDValue Op) {
   18786   unsigned Opc = Op.getOpcode();
   18787   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
   18788       Opc == X86ISD::SAHF)
   18789     return true;
   18790   if (Op.getResNo() == 1 &&
   18791       (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
   18792        Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
   18793        Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
   18794        Opc == X86ISD::XOR || Opc == X86ISD::AND))
   18795     return true;
   18796 
   18797   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
   18798     return true;
   18799 
   18800   return false;
   18801 }
   18802 
   18803 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   18804   if (V.getOpcode() != ISD::TRUNCATE)
   18805     return false;
   18806 
   18807   SDValue VOp0 = V.getOperand(0);
   18808   unsigned InBits = VOp0.getValueSizeInBits();
   18809   unsigned Bits = V.getValueSizeInBits();
   18810   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
   18811 }
   18812 
   18813 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   18814   bool AddTest = true;
   18815   SDValue Cond  = Op.getOperand(0);
   18816   SDValue Op1 = Op.getOperand(1);
   18817   SDValue Op2 = Op.getOperand(2);
   18818   SDLoc DL(Op);
   18819   MVT VT = Op1.getSimpleValueType();
   18820   SDValue CC;
   18821 
   18822   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
   18823   // are available or VBLENDV if AVX is available.
   18824   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
   18825   if (Cond.getOpcode() == ISD::SETCC &&
   18826       ((Subtarget.hasSSE2() && VT == MVT::f64) ||
   18827        (Subtarget.hasSSE1() && VT == MVT::f32)) &&
   18828       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
   18829     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
   18830     unsigned SSECC = translateX86FSETCC(
   18831         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
   18832 
   18833     if (Subtarget.hasAVX512()) {
   18834       SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
   18835                                 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
   18836       assert(!VT.isVector() && "Not a scalar type?");
   18837       return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
   18838     }
   18839 
   18840     if (SSECC < 8 || Subtarget.hasAVX()) {
   18841       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
   18842                                 DAG.getConstant(SSECC, DL, MVT::i8));
   18843 
   18844       // If we have AVX, we can use a variable vector select (VBLENDV) instead
   18845       // of 3 logic instructions for size savings and potentially speed.
   18846       // Unfortunately, there is no scalar form of VBLENDV.
   18847 
   18848       // If either operand is a constant, don't try this. We can expect to
   18849       // optimize away at least one of the logic instructions later in that
   18850       // case, so that sequence would be faster than a variable blend.
   18851 
   18852       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
   18853       // uses XMM0 as the selection register. That may need just as many
   18854       // instructions as the AND/ANDN/OR sequence due to register moves, so
   18855       // don't bother.
   18856 
   18857       if (Subtarget.hasAVX() &&
   18858           !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
   18859 
   18860         // Convert to vectors, do a VSELECT, and convert back to scalar.
   18861         // All of the conversions should be optimized away.
   18862 
   18863         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
   18864         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
   18865         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
   18866         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
   18867 
   18868         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
   18869         VCmp = DAG.getBitcast(VCmpVT, VCmp);
   18870 
   18871         SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
   18872 
   18873         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
   18874                            VSel, DAG.getIntPtrConstant(0, DL));
   18875       }
   18876       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
   18877       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
   18878       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
   18879     }
   18880   }
   18881 
   18882   // AVX512 fallback is to lower selects of scalar floats to masked moves.
   18883   if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
   18884     SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
   18885     return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
   18886   }
   18887 
   18888   // For v64i1 without 64-bit support we need to split and rejoin.
   18889   if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
   18890     assert(Subtarget.hasBWI() && "Expected BWI to be legal");
   18891     SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
   18892     SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
   18893     SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
   18894     SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
   18895     SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
   18896     SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
   18897     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
   18898   }
   18899 
   18900   if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
   18901     SDValue Op1Scalar;
   18902     if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
   18903       Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
   18904     else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
   18905       Op1Scalar = Op1.getOperand(0);
   18906     SDValue Op2Scalar;
   18907     if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
   18908       Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
   18909     else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
   18910       Op2Scalar = Op2.getOperand(0);
   18911     if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
   18912       SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
   18913                                         Op1Scalar, Op2Scalar);
   18914       if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
   18915         return DAG.getBitcast(VT, newSelect);
   18916       SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
   18917       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
   18918                          DAG.getIntPtrConstant(0, DL));
   18919     }
   18920   }
   18921 
   18922   if (VT == MVT::v4i1 || VT == MVT::v2i1) {
   18923     SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
   18924     Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
   18925                       DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
   18926     Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
   18927                       DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
   18928     SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
   18929     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
   18930   }
   18931 
   18932   if (Cond.getOpcode() == ISD::SETCC) {
   18933     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
   18934       Cond = NewCond;
   18935       // If the condition was updated, it's possible that the operands of the
   18936       // select were also updated (for example, EmitTest has a RAUW). Refresh
   18937       // the local references to the select operands in case they got stale.
   18938       Op1 = Op.getOperand(1);
   18939       Op2 = Op.getOperand(2);
   18940     }
   18941   }
   18942 
   18943   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
   18944   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
   18945   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
   18946   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
   18947   // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
   18948   // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
   18949   if (Cond.getOpcode() == X86ISD::SETCC &&
   18950       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
   18951       isNullConstant(Cond.getOperand(1).getOperand(1))) {
   18952     SDValue Cmp = Cond.getOperand(1);
   18953     unsigned CondCode =
   18954         cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
   18955 
   18956     if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
   18957         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
   18958       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
   18959       SDValue CmpOp0 = Cmp.getOperand(0);
   18960 
   18961       // Apply further optimizations for special cases
   18962       // (select (x != 0), -1, 0) -> neg & sbb
   18963       // (select (x == 0), 0, -1) -> neg & sbb
   18964       if (isNullConstant(Y) &&
   18965           (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
   18966         SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
   18967         SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
   18968         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
   18969         SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   18970                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
   18971                                   SDValue(Neg.getNode(), 1));
   18972         return Res;
   18973       }
   18974 
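               // Comparing x against 1 sets CF (unsigned borrow) exactly when x == 0,
               // so SETCC_CARRY(COND_B) below materializes all-ones iff x == 0.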
   18975       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
   18976                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
   18977       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   18978 
   18979       SDValue Res =   // Res = 0 or -1.
   18980         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   18981                     DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
   18982 
   18983       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
   18984         Res = DAG.getNOT(DL, Res, Res.getValueType());
   18985 
   18986       if (!isNullConstant(Op2))
   18987         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
   18988       return Res;
   18989     } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
   18990                Cmp.getOperand(0).getOpcode() == ISD::AND &&
   18991                isOneConstant(Cmp.getOperand(0).getOperand(1))) {
   18992       SDValue CmpOp0 = Cmp.getOperand(0);
   18993       SDValue Src1, Src2;
   18994       // Returns true if Op2 is an XOR or OR of two operands, one of which is
   18995       // equal to Op1, i.e. the operands match the pattern
   18996       // (a, a op b) or (b, a op b).
   18997       auto isOrXorPattern = [&]() {
   18998         if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
   18999             (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
   19000           Src1 =
   19001               Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
   19002           Src2 = Op1;
   19003           return true;
   19004         }
   19005         return false;
   19006       };
   19007 
   19008       if (isOrXorPattern()) {
   19009         SDValue Neg;
   19010         unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
   19011         // We need a mask of all zeros or all ones with the same size as the
   19012         // other operands.
   19013         if (CmpSz > VT.getSizeInBits())
   19014           Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
   19015         else if (CmpSz < VT.getSizeInBits())
   19016           Neg = DAG.getNode(ISD::AND, DL, VT,
   19017               DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
   19018               DAG.getConstant(1, DL, VT));
   19019         else
   19020           Neg = CmpOp0;
   19021         SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
   19022                                    Neg); // -(and (x, 0x1))
   19023         SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
   19024         return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
   19025       }
   19026     }
   19027   }
   19028 
   19029   // Look past (and (setcc_carry (cmp ...)), 1).
   19030   if (Cond.getOpcode() == ISD::AND &&
   19031       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
   19032       isOneConstant(Cond.getOperand(1)))
   19033     Cond = Cond.getOperand(0);
   19034 
   19035   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   19036   // setting operand in place of the X86ISD::SETCC.
   19037   unsigned CondOpcode = Cond.getOpcode();
   19038   if (CondOpcode == X86ISD::SETCC ||
   19039       CondOpcode == X86ISD::SETCC_CARRY) {
   19040     CC = Cond.getOperand(0);
   19041 
   19042     SDValue Cmp = Cond.getOperand(1);
   19043     unsigned Opc = Cmp.getOpcode();
   19044     MVT VT = Op.getSimpleValueType();
   19045 
   19046     bool IllegalFPCMov = false;
   19047     if (VT.isFloatingPoint() && !VT.isVector() &&
   19048         !isScalarFPTypeInSSEReg(VT))  // FPStack?
   19049       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
   19050 
   19051     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
   19052         Opc == X86ISD::BT) { // FIXME
   19053       Cond = Cmp;
   19054       AddTest = false;
   19055     }
   19056   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   19057              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   19058              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   19059               Cond.getOperand(0).getValueType() != MVT::i8)) {
   19060     SDValue LHS = Cond.getOperand(0);
   19061     SDValue RHS = Cond.getOperand(1);
   19062     unsigned X86Opcode;
   19063     unsigned X86Cond;
   19064     SDVTList VTs;
   19065     switch (CondOpcode) {
   19066     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   19067     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   19068     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   19069     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   19070     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   19071     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   19072     default: llvm_unreachable("unexpected overflowing operator");
   19073     }
   19074     if (CondOpcode == ISD::UMULO)
   19075       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   19076                           MVT::i32);
   19077     else
   19078       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   19079 
   19080     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
   19081 
   19082     if (CondOpcode == ISD::UMULO)
   19083       Cond = X86Op.getValue(2);
   19084     else
   19085       Cond = X86Op.getValue(1);
   19086 
   19087     CC = DAG.getConstant(X86Cond, DL, MVT::i8);
   19088     AddTest = false;
   19089   }
   19090 
   19091   if (AddTest) {
   19092     // Look past the truncate if the high bits are known zero.
   19093     if (isTruncWithZeroHighBitsInput(Cond, DAG))
   19094       Cond = Cond.getOperand(0);
   19095 
   19096     // We know the result of AND is compared against zero. Try to match
   19097     // it to BT.
   19098     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   19099       if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
   19100         CC = NewSetCC.getOperand(0);
   19101         Cond = NewSetCC.getOperand(1);
   19102         AddTest = false;
   19103       }
   19104     }
   19105   }
   19106 
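           // Fall back to materializing EFLAGS with a TEST against zero and
           // selecting on the NE condition.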
   19107   if (AddTest) {
   19108     CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
   19109     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
   19110   }
   19111 
   19112   // a <  b ? -1 :  0 -> RES = ~setcc_carry
   19113   // a <  b ?  0 : -1 -> RES = setcc_carry
   19114   // a >= b ? -1 :  0 -> RES = setcc_carry
   19115   // a >= b ?  0 : -1 -> RES = ~setcc_carry
   19116   if (Cond.getOpcode() == X86ISD::SUB) {
   19117     Cond = ConvertCmpIfNecessary(Cond, DAG);
   19118     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
   19119 
   19120     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
   19121         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
   19122         (isNullConstant(Op1) || isNullConstant(Op2))) {
   19123       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
   19124                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
   19125                                 Cond);
   19126       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
   19127         return DAG.getNOT(DL, Res, Res.getValueType());
   19128       return Res;
   19129     }
   19130   }
   19131 
   19132   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
   19133   // widen the cmov and push the truncate through. This avoids introducing a new
   19134   // branch during isel and doesn't add any extensions.
   19135   if (Op.getValueType() == MVT::i8 &&
   19136       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
   19137     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
   19138     if (T1.getValueType() == T2.getValueType() &&
   19139         // Blacklist CopyFromReg to avoid partial register stalls.
   19140         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
   19141       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
   19142                                  CC, Cond);
   19143       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
   19144     }
   19145   }
   19146 
   19147   // Promote i16 cmovs if it won't prevent folding a load.
   19148   if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
   19149     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
   19150     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
   19151     SDValue Ops[] = { Op2, Op1, CC, Cond };
   19152     SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
   19153     return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
   19154   }
   19155 
   19156   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
   19157   // condition is true.
   19158   SDValue Ops[] = { Op2, Op1, CC, Cond };
   19159   return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
   19160 }
   19161 
   19162 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
   19163                                      const X86Subtarget &Subtarget,
   19164                                      SelectionDAG &DAG) {
   19165   MVT VT = Op->getSimpleValueType(0);
   19166   SDValue In = Op->getOperand(0);
   19167   MVT InVT = In.getSimpleValueType();
   19168   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
   19169   MVT VTElt = VT.getVectorElementType();
   19170   SDLoc dl(Op);
   19171 
   19172   unsigned NumElts = VT.getVectorNumElements();
   19173 
   19174   // Extend VT if the scalar type is i8/i16 and BWI is not supported.
   19175   MVT ExtVT = VT;
   19176   if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
   19177     // If v16i32 is to be avoided, we'll need to split and concatenate.
   19178     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
   19179       return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
   19180 
   19181     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
   19182   }
   19183 
   19184   // Widen to 512-bits if VLX is not supported.
   19185   MVT WideVT = ExtVT;
   19186   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
   19187     NumElts *= 512 / ExtVT.getSizeInBits();
   19188     InVT = MVT::getVectorVT(MVT::i1, NumElts);
   19189     In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
   19190                      In, DAG.getIntPtrConstant(0, dl));
   19191     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
   19192   }
   19193 
   19194   SDValue V;
   19195   MVT WideEltVT = WideVT.getVectorElementType();
   19196   if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
   19197       (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
   19198     V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
   19199   } else {
   19200     SDValue NegOne = getOnesVector(WideVT, DAG, dl);
   19201     SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
   19202     V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
   19203   }
   19204 
   19205   // Truncate if we had to extend i16/i8 above.
   19206   if (VT != ExtVT) {
   19207     WideVT = MVT::getVectorVT(VTElt, NumElts);
   19208     V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
   19209   }
   19210 
   19211   // Extract back to 128/256-bit if we widened.
   19212   if (WideVT != VT)
   19213     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
   19214                     DAG.getIntPtrConstant(0, dl));
   19215 
   19216   return V;
   19217 }
   19218 
   19219 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   19220                                SelectionDAG &DAG) {
   19221   SDValue In = Op->getOperand(0);
   19222   MVT InVT = In.getSimpleValueType();
   19223 
   19224   if (InVT.getVectorElementType() == MVT::i1)
   19225     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
   19226 
   19227   assert(Subtarget.hasAVX() && "Expected AVX support");
   19228   return LowerAVXExtend(Op, DAG, Subtarget);
   19229 }
   19230 
   19231 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
   19232 // For sign extend this needs to handle all vector sizes and SSE4.1 and
   19233 // non-SSE4.1 targets. For zero extend this should only handle inputs of
   19234 // MVT::v64i8 when BWI is not supported, but AVX512 is.
   19235 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
   19236                                         const X86Subtarget &Subtarget,
   19237                                         SelectionDAG &DAG) {
   19238   SDValue In = Op->getOperand(0);
   19239   MVT VT = Op->getSimpleValueType(0);
   19240   MVT InVT = In.getSimpleValueType();
   19241   assert(VT.getSizeInBits() == InVT.getSizeInBits());
   19242 
   19243   MVT SVT = VT.getVectorElementType();
   19244   MVT InSVT = InVT.getVectorElementType();
   19245   assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
   19246 
   19247   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
   19248     return SDValue();
   19249   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
   19250     return SDValue();
   19251   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
   19252       !(VT.is256BitVector() && Subtarget.hasInt256()) &&
   19253       !(VT.is512BitVector() && Subtarget.hasAVX512()))
   19254     return SDValue();
   19255 
   19256   SDLoc dl(Op);
   19257 
   19258   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
   19259   // For 512-bit vectors, we need 128-bits or 256-bits.
   19260   if (VT.getSizeInBits() > 128) {
   19261     // Input needs to be at least the same number of elements as output, and
   19262     // at least 128-bits.
   19263     int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
   19264     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
   19265   }
   19266 
   19267   assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
   19268           InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
   19269 
   19270   // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
   19271   // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions
   19272   // still need to be handled here for 256/512-bit results.
   19273   if (Subtarget.hasInt256()) {
   19274     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
   19275     unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
   19276                         X86ISD::VSEXT : X86ISD::VZEXT;
   19277     return DAG.getNode(ExtOpc, dl, VT, In);
   19278   }
   19279 
   19280   // We should only get here for sign extend.
   19281   assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
   19282          "Unexpected opcode!");
   19283 
   19284   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
   19285   SDValue Curr = In;
   19286   MVT CurrVT = InVT;
   19287 
   19288   // As SRAI is only available on i16/i32 types, we expand only up to i32
   19289   // and handle i64 separately.
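           // Each UNPCKL with undef moves the original elements into the high half of
           // elements twice as wide; the arithmetic shift below then shifts them back
           // down, replicating the sign bit into the vacated high bits.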
   19290   while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
   19291     Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
   19292     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
   19293     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
   19294     Curr = DAG.getBitcast(CurrVT, Curr);
   19295   }
   19296 
   19297   SDValue SignExt = Curr;
   19298   if (CurrVT != InVT) {
   19299     unsigned SignExtShift =
   19300         CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
   19301     SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
   19302                           DAG.getConstant(SignExtShift, dl, MVT::i8));
   19303   }
   19304 
   19305   if (CurrVT == VT)
   19306     return SignExt;
   19307 
   19308   if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
   19309     SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
   19310                                DAG.getConstant(31, dl, MVT::i8));
   19311     SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
   19312     return DAG.getBitcast(VT, Ext);
   19313   }
   19314 
   19315   return SDValue();
   19316 }
   19317 
   19318 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   19319                                 SelectionDAG &DAG) {
   19320   MVT VT = Op->getSimpleValueType(0);
   19321   SDValue In = Op->getOperand(0);
   19322   MVT InVT = In.getSimpleValueType();
   19323   SDLoc dl(Op);
   19324 
   19325   if (InVT.getVectorElementType() == MVT::i1)
   19326     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
   19327 
   19328   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
   19329   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
   19330          "Expected same number of elements");
   19331   assert((VT.getVectorElementType() == MVT::i16 ||
   19332           VT.getVectorElementType() == MVT::i32 ||
   19333           VT.getVectorElementType() == MVT::i64) &&
   19334          "Unexpected element type");
   19335   assert((InVT.getVectorElementType() == MVT::i8 ||
   19336           InVT.getVectorElementType() == MVT::i16 ||
   19337           InVT.getVectorElementType() == MVT::i32) &&
   19338          "Unexpected element type");
   19339 
   19340   if (Subtarget.hasInt256())
   19341     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   19342 
   19343   // Optimize vectors in AVX mode:
   19344   // Sign extend  v8i16 to v8i32 and
   19345   //              v4i32 to v4i64.
   19346   //
   19347   // Divide the input vector into two parts
   19348   // (for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }),
   19349   // use the vpmovsx instructions to extend v4i32 -> v2i64 and v8i16 -> v4i32,
   19350   // then concatenate the halves back to the original VT.
   19351 
   19352   unsigned NumElems = InVT.getVectorNumElements();
   19353   SDValue Undef = DAG.getUNDEF(InVT);
   19354 
   19355   SmallVector<int,8> ShufMask1(NumElems, -1);
   19356   for (unsigned i = 0; i != NumElems/2; ++i)
   19357     ShufMask1[i] = i;
   19358 
   19359   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
   19360 
   19361   SmallVector<int,8> ShufMask2(NumElems, -1);
   19362   for (unsigned i = 0; i != NumElems/2; ++i)
   19363     ShufMask2[i] = i + NumElems/2;
   19364 
   19365   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
   19366 
   19367   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
   19368                                 VT.getVectorNumElements() / 2);
   19369 
   19370   OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
   19371   OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
   19372 
   19373   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
   19374 }
   19375 
   19376 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
   19377                           SelectionDAG &DAG) {
   19378   StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
   19379   SDLoc dl(St);
   19380   SDValue StoredVal = St->getValue();
   19381 
   19382   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
   19383   assert(StoredVal.getValueType().isVector() &&
   19384          StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
   19385          StoredVal.getValueType().getVectorNumElements() <= 8 &&
   19386          "Unexpected VT");
   19387   assert(!St->isTruncatingStore() && "Expected non-truncating store");
   19388   assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
   19389          "Expected AVX512F without AVX512DQI");
   19390 
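           // Widen the mask to v8i1 and bitcast it to i8 so the value can be written
           // with an ordinary byte store.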
   19391   StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
   19392                           DAG.getUNDEF(MVT::v8i1), StoredVal,
   19393                           DAG.getIntPtrConstant(0, dl));
   19394   StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
   19395 
   19396   return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
   19397                       St->getPointerInfo(), St->getAlignment(),
   19398                       St->getMemOperand()->getFlags());
   19399 }
   19400 
   19401 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
   19402 // may emit an illegal shuffle but the expansion is still better than scalar
   19403 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
   19404 // we'll emit a shuffle and an arithmetic shift.
   19405 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
   19406 // TODO: It is possible to support ZExt by zeroing the undef values during
   19407 // the shuffle phase or after the shuffle.
   19408 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
   19409                          SelectionDAG &DAG) {
   19410   MVT RegVT = Op.getSimpleValueType();
   19411   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
   19412   assert(RegVT.isInteger() &&
   19413          "We only custom lower integer vector sext loads.");
   19414 
   19415   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   19416   SDLoc dl(Ld);
   19417   EVT MemVT = Ld->getMemoryVT();
   19418 
   19419   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
   19420   if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
   19421     assert(EVT(RegVT) == MemVT && "Expected non-extending load");
   19422     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
   19423     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
   19424            "Expected AVX512F without AVX512DQI");
   19425 
   19426     SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
   19427                                 Ld->getPointerInfo(), Ld->getAlignment(),
   19428                                 Ld->getMemOperand()->getFlags());
   19429 
   19430     // Replace chain users with the new chain.
   19431     assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
   19432     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
   19433 
   19434     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
   19435                                   DAG.getBitcast(MVT::v8i1, NewLd),
   19436                                   DAG.getIntPtrConstant(0, dl));
   19437     return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
   19438   }
   19439 
   19440   // Nothing useful we can do without SSE2 shuffles.
   19441   assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
   19442 
   19443   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   19444   unsigned RegSz = RegVT.getSizeInBits();
   19445 
   19446   ISD::LoadExtType Ext = Ld->getExtensionType();
   19447 
   19448   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
   19449          && "Only anyext and sext are currently implemented.");
   19450   assert(MemVT != RegVT && "Cannot extend to the same type");
   19451   assert(MemVT.isVector() && "Must load a vector from memory");
   19452 
   19453   unsigned NumElems = RegVT.getVectorNumElements();
   19454   unsigned MemSz = MemVT.getSizeInBits();
   19455   assert(RegSz > MemSz && "Register size must be greater than the mem size");
   19456 
   19457   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
   19458     // The only way in which we have a legal 256-bit vector result but not the
   19459     // integer 256-bit operations needed to directly lower a sextload is if we
   19460     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
   19461     // a 128-bit vector and a normal sign_extend to 256-bits that should get
   19462     // correctly legalized. We do this late to allow the canonical form of
   19463     // sextload to persist throughout the rest of the DAG combiner -- it wants
   19464     // to fold together any extensions it can, and so will fuse a sign_extend
   19465     // of an sextload into a sextload targeting a wider value.
   19466     SDValue Load;
   19467     if (MemSz == 128) {
   19468       // Just switch this to a normal load.
   19469       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
   19470                                        "it must be a legal 128-bit vector "
   19471                                        "type!");
   19472       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
   19473                          Ld->getPointerInfo(), Ld->getAlignment(),
   19474                          Ld->getMemOperand()->getFlags());
   19475     } else {
   19476       assert(MemSz < 128 &&
   19477              "Can't extend a type wider than 128 bits to a 256 bit vector!");
   19478       // Do an sext load to a 128-bit vector type. We want to use the same
   19479       // number of elements, but elements half as wide. This will end up being
   19480       // recursively lowered by this routine, but will succeed as we definitely
   19481       // have all the necessary features if we're using AVX1.
   19482       EVT HalfEltVT =
   19483           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
   19484       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
   19485       Load =
   19486           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
   19487                          Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
   19488                          Ld->getMemOperand()->getFlags());
   19489     }
   19490 
   19491     // Replace chain users with the new chain.
   19492     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
   19493     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
   19494 
   19495     // Finally, do a normal sign-extend to the desired register.
   19496     return DAG.getSExtOrTrunc(Load, dl, RegVT);
   19497   }
   19498 
   19499   // All sizes must be a power of two.
   19500   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
   19501          "Non-power-of-two elements are not custom lowered!");
   19502 
   19503   // Attempt to load the original value using scalar loads.
   19504   // Find the largest scalar type that divides the total loaded size.
   19505   MVT SclrLoadTy = MVT::i8;
   19506   for (MVT Tp : MVT::integer_valuetypes()) {
   19507     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
   19508       SclrLoadTy = Tp;
   19509     }
   19510   }
   19511 
   19512   // On 32-bit systems, 64-bit integers are not legal; try bitcasting to f64.
   19513   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
   19514       (64 <= MemSz))
   19515     SclrLoadTy = MVT::f64;
   19516 
   19517   // Calculate the number of scalar loads that we need to perform
   19518   // in order to load our vector from memory.
   19519   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
   19520 
   19521   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
   19522          "Can only lower sext loads with a single scalar load!");
   19523 
   19524   unsigned loadRegZize = RegSz;
   19525   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
   19526     loadRegZize = 128;
   19527 
   19528   // If we don't have BWI we won't be able to create the shuffle needed for
   19529   // v8i8->v8i64.
   19530   if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
   19531       MemVT == MVT::v8i8)
   19532     loadRegZize = 128;
   19533 
   19534   // Represent our vector as a sequence of elements which are the
   19535   // largest scalar that we can load.
   19536   EVT LoadUnitVecVT = EVT::getVectorVT(
   19537       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
   19538 
   19539   // Represent the data using the same element type that is stored in
    19540   // memory. In practice, we "widen" MemVT.
   19541   EVT WideVecVT =
   19542       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   19543                        loadRegZize / MemVT.getScalarSizeInBits());
   19544 
   19545   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
   19546          "Invalid vector type");
   19547 
   19548   // We can't shuffle using an illegal type.
   19549   assert(TLI.isTypeLegal(WideVecVT) &&
   19550          "We only lower types that form legal widened vector types");
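            // Illustrative example: an any-extending load of v4i8 into v4i32 loads a
            // single i32 scalar (SclrLoadTy = i32, NumLoads = 1), giving LoadUnitVecVT =
            // v4i32 and WideVecVT = v16i8; the shuffle further below then spreads the
            // four loaded bytes into byte lanes 0, 4, 8 and 12.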
   19551 
   19552   SmallVector<SDValue, 8> Chains;
   19553   SDValue Ptr = Ld->getBasePtr();
   19554   SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
   19555                                       TLI.getPointerTy(DAG.getDataLayout()));
   19556   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
   19557 
   19558   for (unsigned i = 0; i < NumLoads; ++i) {
   19559     // Perform a single load.
   19560     SDValue ScalarLoad =
   19561         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
   19562                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
   19563     Chains.push_back(ScalarLoad.getValue(1));
   19564     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
   19565     // another round of DAGCombining.
   19566     if (i == 0)
   19567       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
   19568     else
   19569       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
   19570                         ScalarLoad, DAG.getIntPtrConstant(i, dl));
   19571 
   19572     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
   19573   }
   19574 
   19575   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   19576 
   19577   // Bitcast the loaded value to a vector of the original element type, in
   19578   // the size of the target vector type.
   19579   SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
   19580   unsigned SizeRatio = RegSz / MemSz;
   19581 
   19582   if (Ext == ISD::SEXTLOAD) {
   19583     // If we have SSE4.1, we can directly emit a VSEXT node.
   19584     if (Subtarget.hasSSE41()) {
   19585       SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
   19586       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   19587       return Sext;
   19588     }
   19589 
   19590     // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
   19591     // lanes.
   19592     assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
   19593            "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
   19594 
   19595     SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
   19596     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   19597     return Shuff;
   19598   }
   19599 
   19600   if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
   19601       MemVT == MVT::v8i8) {
   19602     SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
   19603     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   19604     return Sext;
   19605   }
   19606 
   19607   // Redistribute the loaded elements into the different locations.
   19608   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   19609   for (unsigned i = 0; i != NumElems; ++i)
   19610     ShuffleVec[i * SizeRatio] = i;
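            // e.g. with NumElems = 4 and SizeRatio = 4 this produces the mask
            // <0,-1,-1,-1, 1,-1,-1,-1, 2,-1,-1,-1, 3,-1,-1,-1>, leaving the unused
            // lanes of each widened element undefined.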
   19611 
   19612   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
   19613                                        DAG.getUNDEF(WideVecVT), ShuffleVec);
   19614 
   19615   // Bitcast to the requested type.
   19616   Shuff = DAG.getBitcast(RegVT, Shuff);
   19617   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   19618   return Shuff;
   19619 }
   19620 
   19621 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
   19622 /// each of which has no other use apart from the AND / OR.
   19623 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
   19624   Opc = Op.getOpcode();
   19625   if (Opc != ISD::OR && Opc != ISD::AND)
   19626     return false;
   19627   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   19628           Op.getOperand(0).hasOneUse() &&
   19629           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
   19630           Op.getOperand(1).hasOneUse());
   19631 }
   19632 
    19633 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, where
    19634 /// the SETCC node has a single use.
   19635 static bool isXor1OfSetCC(SDValue Op) {
   19636   if (Op.getOpcode() != ISD::XOR)
   19637     return false;
   19638   if (isOneConstant(Op.getOperand(1)))
   19639     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
   19640            Op.getOperand(0).hasOneUse();
   19641   return false;
   19642 }
   19643 
   19644 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   19645   bool addTest = true;
   19646   SDValue Chain = Op.getOperand(0);
   19647   SDValue Cond  = Op.getOperand(1);
   19648   SDValue Dest  = Op.getOperand(2);
   19649   SDLoc dl(Op);
   19650   SDValue CC;
   19651   bool Inverted = false;
   19652 
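            // Inverted is set when the branch condition is a setcc comparing one of the
            // [SU]{ADD,SUB,MUL}O overflow results against zero; in that case we branch
            // on the opposite of the overflow condition further below.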
   19653   if (Cond.getOpcode() == ISD::SETCC) {
   19654     // Check for setcc([su]{add,sub,mul}o == 0).
   19655     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
   19656         isNullConstant(Cond.getOperand(1)) &&
   19657         Cond.getOperand(0).getResNo() == 1 &&
   19658         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
   19659          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
   19660          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
   19661          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
   19662          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
   19663          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
   19664       Inverted = true;
   19665       Cond = Cond.getOperand(0);
   19666     } else {
   19667       if (SDValue NewCond = LowerSETCC(Cond, DAG))
   19668         Cond = NewCond;
   19669     }
   19670   }
   19671 #if 0
   19672   // FIXME: LowerXALUO doesn't handle these!!
   19673   else if (Cond.getOpcode() == X86ISD::ADD  ||
   19674            Cond.getOpcode() == X86ISD::SUB  ||
   19675            Cond.getOpcode() == X86ISD::SMUL ||
   19676            Cond.getOpcode() == X86ISD::UMUL)
   19677     Cond = LowerXALUO(Cond, DAG);
   19678 #endif
   19679 
    19680   // Look past (and (setcc_carry (cmp ...)), 1).
   19681   if (Cond.getOpcode() == ISD::AND &&
   19682       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
   19683       isOneConstant(Cond.getOperand(1)))
   19684     Cond = Cond.getOperand(0);
   19685 
   19686   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   19687   // setting operand in place of the X86ISD::SETCC.
   19688   unsigned CondOpcode = Cond.getOpcode();
   19689   if (CondOpcode == X86ISD::SETCC ||
   19690       CondOpcode == X86ISD::SETCC_CARRY) {
   19691     CC = Cond.getOperand(0);
   19692 
   19693     SDValue Cmp = Cond.getOperand(1);
   19694     unsigned Opc = Cmp.getOpcode();
   19695     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
   19696     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
   19697       Cond = Cmp;
   19698       addTest = false;
   19699     } else {
   19700       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
   19701       default: break;
   19702       case X86::COND_O:
   19703       case X86::COND_B:
   19704         // These can only come from an arithmetic instruction with overflow,
   19705         // e.g. SADDO, UADDO.
   19706         Cond = Cond.getOperand(1);
   19707         addTest = false;
   19708         break;
   19709       }
   19710     }
   19711   }
   19712   CondOpcode = Cond.getOpcode();
   19713   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
   19714       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
   19715       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
   19716        Cond.getOperand(0).getValueType() != MVT::i8)) {
   19717     SDValue LHS = Cond.getOperand(0);
   19718     SDValue RHS = Cond.getOperand(1);
   19719     unsigned X86Opcode;
   19720     unsigned X86Cond;
   19721     SDVTList VTs;
   19722     // Keep this in sync with LowerXALUO, otherwise we might create redundant
   19723     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
   19724     // X86ISD::INC).
   19725     switch (CondOpcode) {
   19726     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
   19727     case ISD::SADDO:
   19728       if (isOneConstant(RHS)) {
   19729           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
   19730           break;
   19731         }
   19732       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
   19733     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
   19734     case ISD::SSUBO:
   19735       if (isOneConstant(RHS)) {
   19736           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
   19737           break;
   19738         }
   19739       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
   19740     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
   19741     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
   19742     default: llvm_unreachable("unexpected overflowing operator");
   19743     }
   19744     if (Inverted)
   19745       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
   19746     if (CondOpcode == ISD::UMULO)
   19747       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
   19748                           MVT::i32);
   19749     else
   19750       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   19751 
   19752     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
   19753 
   19754     if (CondOpcode == ISD::UMULO)
   19755       Cond = X86Op.getValue(2);
   19756     else
   19757       Cond = X86Op.getValue(1);
   19758 
   19759     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
   19760     addTest = false;
   19761   } else {
   19762     unsigned CondOpc;
   19763     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
   19764       SDValue Cmp = Cond.getOperand(0).getOperand(1);
   19765       if (CondOpc == ISD::OR) {
   19766         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
   19767         // two branches instead of an explicit OR instruction with a
   19768         // separate test.
   19769         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   19770             isX86LogicalCmp(Cmp)) {
   19771           CC = Cond.getOperand(0).getOperand(0);
   19772           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   19773                               Chain, Dest, CC, Cmp);
   19774           CC = Cond.getOperand(1).getOperand(0);
   19775           Cond = Cmp;
   19776           addTest = false;
   19777         }
   19778       } else { // ISD::AND
   19779         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
   19780         // two branches instead of an explicit AND instruction with a
   19781         // separate test. However, we only do this if this block doesn't
   19782         // have a fall-through edge, because this requires an explicit
   19783         // jmp when the condition is false.
   19784         if (Cmp == Cond.getOperand(1).getOperand(1) &&
   19785             isX86LogicalCmp(Cmp) &&
   19786             Op.getNode()->hasOneUse()) {
   19787           X86::CondCode CCode =
   19788             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   19789           CCode = X86::GetOppositeBranchCondition(CCode);
   19790           CC = DAG.getConstant(CCode, dl, MVT::i8);
   19791           SDNode *User = *Op.getNode()->use_begin();
   19792           // Look for an unconditional branch following this conditional branch.
   19793           // We need this because we need to reverse the successors in order
   19794           // to implement FCMP_OEQ.
   19795           if (User->getOpcode() == ISD::BR) {
   19796             SDValue FalseBB = User->getOperand(1);
   19797             SDNode *NewBR =
   19798               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   19799             assert(NewBR == User);
   19800             (void)NewBR;
   19801             Dest = FalseBB;
   19802 
   19803             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   19804                                 Chain, Dest, CC, Cmp);
   19805             X86::CondCode CCode =
   19806               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
   19807             CCode = X86::GetOppositeBranchCondition(CCode);
   19808             CC = DAG.getConstant(CCode, dl, MVT::i8);
   19809             Cond = Cmp;
   19810             addTest = false;
   19811           }
   19812         }
   19813       }
   19814     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
    19815       // Recognize the xorb (setcc), 1 pattern; the xor inverts the condition.
    19816       // It should be transformed by the DAG combiner except when the condition
    19817       // is set by an arithmetic-with-overflow node.
   19818       X86::CondCode CCode =
   19819         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
   19820       CCode = X86::GetOppositeBranchCondition(CCode);
   19821       CC = DAG.getConstant(CCode, dl, MVT::i8);
   19822       Cond = Cond.getOperand(0).getOperand(1);
   19823       addTest = false;
   19824     } else if (Cond.getOpcode() == ISD::SETCC &&
   19825                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
   19826       // For FCMP_OEQ, we can emit
   19827       // two branches instead of an explicit AND instruction with a
   19828       // separate test. However, we only do this if this block doesn't
   19829       // have a fall-through edge, because this requires an explicit
   19830       // jmp when the condition is false.
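                // Roughly, for (brcond (setoeq a, b), T) followed by an unconditional
                // branch to F this emits: JNE F; JP F; JMP T.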
   19831       if (Op.getNode()->hasOneUse()) {
   19832         SDNode *User = *Op.getNode()->use_begin();
   19833         // Look for an unconditional branch following this conditional branch.
   19834         // We need this because we need to reverse the successors in order
   19835         // to implement FCMP_OEQ.
   19836         if (User->getOpcode() == ISD::BR) {
   19837           SDValue FalseBB = User->getOperand(1);
   19838           SDNode *NewBR =
   19839             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   19840           assert(NewBR == User);
   19841           (void)NewBR;
   19842           Dest = FalseBB;
   19843 
   19844           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   19845                                     Cond.getOperand(0), Cond.getOperand(1));
   19846           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   19847           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
   19848           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   19849                               Chain, Dest, CC, Cmp);
   19850           CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
   19851           Cond = Cmp;
   19852           addTest = false;
   19853         }
   19854       }
   19855     } else if (Cond.getOpcode() == ISD::SETCC &&
   19856                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
   19857       // For FCMP_UNE, we can emit
   19858       // two branches instead of an explicit AND instruction with a
   19859       // separate test. However, we only do this if this block doesn't
   19860       // have a fall-through edge, because this requires an explicit
   19861       // jmp when the condition is false.
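                // Roughly, for (brcond (setune a, b), T) followed by an unconditional
                // branch to F this emits: JNE T; JNP F; JMP T.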
   19862       if (Op.getNode()->hasOneUse()) {
   19863         SDNode *User = *Op.getNode()->use_begin();
   19864         // Look for an unconditional branch following this conditional branch.
   19865         // We need this because we need to reverse the successors in order
   19866         // to implement FCMP_UNE.
   19867         if (User->getOpcode() == ISD::BR) {
   19868           SDValue FalseBB = User->getOperand(1);
   19869           SDNode *NewBR =
   19870             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
   19871           assert(NewBR == User);
   19872           (void)NewBR;
   19873 
   19874           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
   19875                                     Cond.getOperand(0), Cond.getOperand(1));
   19876           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
   19877           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
   19878           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   19879                               Chain, Dest, CC, Cmp);
   19880           CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
   19881           Cond = Cmp;
   19882           addTest = false;
   19883           Dest = FalseBB;
   19884         }
   19885       }
   19886     }
   19887   }
   19888 
   19889   if (addTest) {
    19890     // Look past the truncate if the high bits are known zero.
   19891     if (isTruncWithZeroHighBitsInput(Cond, DAG))
   19892         Cond = Cond.getOperand(0);
   19893 
   19894     // We know the result of AND is compared against zero. Try to match
   19895     // it to BT.
   19896     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
   19897       if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
   19898         CC = NewSetCC.getOperand(0);
   19899         Cond = NewSetCC.getOperand(1);
   19900         addTest = false;
   19901       }
   19902     }
   19903   }
   19904 
   19905   if (addTest) {
   19906     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
   19907     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
   19908     Cond = EmitTest(Cond, X86Cond, dl, DAG);
   19909   }
   19910   Cond = ConvertCmpIfNecessary(Cond, DAG);
   19911   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
   19912                      Chain, Dest, CC, Cond);
   19913 }
   19914 
    19915 // Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
   19916 // Calls to _alloca are needed to probe the stack when allocating more than 4k
   19917 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
   19918 // that the guard pages used by the OS virtual memory manager are allocated in
    19919 // the correct sequence.
   19920 SDValue
   19921 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   19922                                            SelectionDAG &DAG) const {
   19923   MachineFunction &MF = DAG.getMachineFunction();
   19924   bool SplitStack = MF.shouldSplitStack();
   19925   bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
   19926   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
   19927                SplitStack || EmitStackProbe;
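            // Three strategies are used below: a plain stack-pointer adjustment when no
            // probing is required, SEG_ALLOCA for segmented stacks, and WIN_ALLOCA (a
            // stack-probe call) otherwise.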
   19928   SDLoc dl(Op);
   19929 
   19930   // Get the inputs.
   19931   SDNode *Node = Op.getNode();
   19932   SDValue Chain = Op.getOperand(0);
   19933   SDValue Size  = Op.getOperand(1);
   19934   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
   19935   EVT VT = Node->getValueType(0);
   19936 
   19937   // Chain the dynamic stack allocation so that it doesn't modify the stack
   19938   // pointer when other instructions are using the stack.
   19939   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
   19940 
   19941   bool Is64Bit = Subtarget.is64Bit();
   19942   MVT SPTy = getPointerTy(DAG.getDataLayout());
   19943 
   19944   SDValue Result;
   19945   if (!Lower) {
   19946     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   19947     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
   19948     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
   19949                     " not tell us which reg is the stack pointer!");
   19950 
   19951     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   19952     Chain = SP.getValue(1);
   19953     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   19954     unsigned StackAlign = TFI.getStackAlignment();
   19955     Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
   19956     if (Align > StackAlign)
   19957       Result = DAG.getNode(ISD::AND, dl, VT, Result,
   19958                          DAG.getConstant(-(uint64_t)Align, dl, VT));
   19959     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
   19960   } else if (SplitStack) {
   19961     MachineRegisterInfo &MRI = MF.getRegInfo();
   19962 
   19963     if (Is64Bit) {
    19964       // The 64-bit implementation of segmented stacks needs to clobber both r10
    19965       // and r11. This makes it impossible to use along with nested parameters.
   19966       const Function &F = MF.getFunction();
   19967       for (const auto &A : F.args()) {
   19968         if (A.hasNestAttr())
   19969           report_fatal_error("Cannot use segmented stacks with functions that "
   19970                              "have nested arguments.");
   19971       }
   19972     }
   19973 
   19974     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
   19975     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
   19976     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
   19977     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
   19978                                 DAG.getRegister(Vreg, SPTy));
   19979   } else {
   19980     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   19981     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
   19982     MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
   19983 
   19984     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   19985     unsigned SPReg = RegInfo->getStackRegister();
   19986     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
   19987     Chain = SP.getValue(1);
   19988 
   19989     if (Align) {
   19990       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
   19991                        DAG.getConstant(-(uint64_t)Align, dl, VT));
   19992       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
   19993     }
   19994 
   19995     Result = SP;
   19996   }
   19997 
   19998   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
   19999                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
   20000 
   20001   SDValue Ops[2] = {Result, Chain};
   20002   return DAG.getMergeValues(Ops, dl);
   20003 }
   20004 
   20005 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   20006   MachineFunction &MF = DAG.getMachineFunction();
   20007   auto PtrVT = getPointerTy(MF.getDataLayout());
   20008   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   20009 
   20010   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   20011   SDLoc DL(Op);
   20012 
   20013   if (!Subtarget.is64Bit() ||
   20014       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
   20015     // vastart just stores the address of the VarArgsFrameIndex slot into the
   20016     // memory location argument.
   20017     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   20018     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
   20019                         MachinePointerInfo(SV));
   20020   }
   20021 
   20022   // __va_list_tag:
   20023   //   gp_offset         (0 - 6 * 8)
   20024   //   fp_offset         (48 - 48 + 8 * 16)
   20025   //   overflow_arg_area (point to parameters coming in memory).
   20026   //   reg_save_area
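            // Field offsets used by the stores below: gp_offset at 0, fp_offset at 4,
            // overflow_arg_area at 8, reg_save_area at 16 for LP64 (12 otherwise).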
   20027   SmallVector<SDValue, 8> MemOps;
   20028   SDValue FIN = Op.getOperand(1);
   20029   // Store gp_offset
   20030   SDValue Store = DAG.getStore(
   20031       Op.getOperand(0), DL,
   20032       DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
   20033       MachinePointerInfo(SV));
   20034   MemOps.push_back(Store);
   20035 
   20036   // Store fp_offset
   20037   FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
   20038   Store = DAG.getStore(
   20039       Op.getOperand(0), DL,
   20040       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
   20041       MachinePointerInfo(SV, 4));
   20042   MemOps.push_back(Store);
   20043 
   20044   // Store ptr to overflow_arg_area
   20045   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
   20046   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   20047   Store =
   20048       DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
   20049   MemOps.push_back(Store);
   20050 
   20051   // Store ptr to reg_save_area.
   20052   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
   20053       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
   20054   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
   20055   Store = DAG.getStore(
   20056       Op.getOperand(0), DL, RSFIN, FIN,
   20057       MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
   20058   MemOps.push_back(Store);
   20059   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
   20060 }
   20061 
   20062 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   20063   assert(Subtarget.is64Bit() &&
   20064          "LowerVAARG only handles 64-bit va_arg!");
   20065   assert(Op.getNumOperands() == 4);
   20066 
   20067   MachineFunction &MF = DAG.getMachineFunction();
   20068   if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
   20069     // The Win64 ABI uses char* instead of a structure.
   20070     return DAG.expandVAArg(Op.getNode());
   20071 
   20072   SDValue Chain = Op.getOperand(0);
   20073   SDValue SrcPtr = Op.getOperand(1);
   20074   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   20075   unsigned Align = Op.getConstantOperandVal(3);
   20076   SDLoc dl(Op);
   20077 
   20078   EVT ArgVT = Op.getNode()->getValueType(0);
   20079   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   20080   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
   20081   uint8_t ArgMode;
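            // ArgMode selects which va_list area is read: 1 means gp_offset (integer
            // registers), 2 means fp_offset (XMM registers); it is passed to the
            // VAARG_64 node created below.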
   20082 
   20083   // Decide which area this value should be read from.
   20084   // TODO: Implement the AMD64 ABI in its entirety. This simple
   20085   // selection mechanism works only for the basic types.
   20086   if (ArgVT == MVT::f80) {
   20087     llvm_unreachable("va_arg for f80 not yet implemented");
   20088   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
   20089     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
   20090   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
   20091     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
   20092   } else {
   20093     llvm_unreachable("Unhandled argument type in LowerVAARG");
   20094   }
   20095 
   20096   if (ArgMode == 2) {
   20097     // Sanity Check: Make sure using fp_offset makes sense.
   20098     assert(!Subtarget.useSoftFloat() &&
   20099            !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
   20100            Subtarget.hasSSE1());
   20101   }
   20102 
   20103   // Insert VAARG_64 node into the DAG
   20104   // VAARG_64 returns two values: Variable Argument Address, Chain
   20105   SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
   20106                        DAG.getConstant(ArgMode, dl, MVT::i8),
   20107                        DAG.getConstant(Align, dl, MVT::i32)};
   20108   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
   20109   SDValue VAARG = DAG.getMemIntrinsicNode(
   20110     X86ISD::VAARG_64, dl,
   20111     VTs, InstOps, MVT::i64,
   20112     MachinePointerInfo(SV),
   20113     /*Align=*/0,
   20114     MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
   20115   Chain = VAARG.getValue(1);
   20116 
   20117   // Load the next argument and return it
   20118   return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
   20119 }
   20120 
   20121 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
   20122                            SelectionDAG &DAG) {
   20123   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
   20124   // where a va_list is still an i8*.
   20125   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
   20126   if (Subtarget.isCallingConvWin64(
   20127         DAG.getMachineFunction().getFunction().getCallingConv()))
   20128     // Probably a Win64 va_copy.
   20129     return DAG.expandVACopy(Op.getNode());
   20130 
   20131   SDValue Chain = Op.getOperand(0);
   20132   SDValue DstPtr = Op.getOperand(1);
   20133   SDValue SrcPtr = Op.getOperand(2);
   20134   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   20135   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   20136   SDLoc DL(Op);
   20137 
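            // On x86-64 the va_list struct is 24 bytes (i32 + i32 + i8* + i8*), which is
            // the size copied below.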
   20138   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
   20139                        DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
   20140                        false, false,
   20141                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
   20142 }
   20143 
   20144 /// Handle vector element shifts where the shift amount is a constant.
   20145 /// Takes immediate version of shift as input.
   20146 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
   20147                                           SDValue SrcOp, uint64_t ShiftAmt,
   20148                                           SelectionDAG &DAG) {
   20149   MVT ElementType = VT.getVectorElementType();
   20150 
    20151   // Bitcast the source vector to the output type; this is mainly necessary for
   20152   // vXi8/vXi64 shifts.
   20153   if (VT != SrcOp.getSimpleValueType())
   20154     SrcOp = DAG.getBitcast(VT, SrcOp);
   20155 
   20156   // Fold this packed shift into its first operand if ShiftAmt is 0.
   20157   if (ShiftAmt == 0)
   20158     return SrcOp;
   20159 
   20160   // Check for ShiftAmt >= element width
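            // (out-of-range logical shifts produce zero, while arithmetic shifts are
            // clamped to the element width minus one so every bit becomes a copy of the
            // sign bit).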
   20161   if (ShiftAmt >= ElementType.getSizeInBits()) {
   20162     if (Opc == X86ISD::VSRAI)
   20163       ShiftAmt = ElementType.getSizeInBits() - 1;
   20164     else
   20165       return DAG.getConstant(0, dl, VT);
   20166   }
   20167 
   20168   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
   20169          && "Unknown target vector shift-by-constant node");
   20170 
   20171   // Fold this packed vector shift into a build vector if SrcOp is a
   20172   // vector of Constants or UNDEFs.
   20173   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
   20174     SmallVector<SDValue, 8> Elts;
   20175     unsigned NumElts = SrcOp->getNumOperands();
   20176     ConstantSDNode *ND;
   20177 
   20178     switch(Opc) {
   20179     default: llvm_unreachable("Unknown opcode!");
   20180     case X86ISD::VSHLI:
   20181       for (unsigned i=0; i!=NumElts; ++i) {
   20182         SDValue CurrentOp = SrcOp->getOperand(i);
   20183         if (CurrentOp->isUndef()) {
   20184           Elts.push_back(CurrentOp);
   20185           continue;
   20186         }
   20187         ND = cast<ConstantSDNode>(CurrentOp);
   20188         const APInt &C = ND->getAPIntValue();
   20189         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
   20190       }
   20191       break;
   20192     case X86ISD::VSRLI:
   20193       for (unsigned i=0; i!=NumElts; ++i) {
   20194         SDValue CurrentOp = SrcOp->getOperand(i);
   20195         if (CurrentOp->isUndef()) {
   20196           Elts.push_back(CurrentOp);
   20197           continue;
   20198         }
   20199         ND = cast<ConstantSDNode>(CurrentOp);
   20200         const APInt &C = ND->getAPIntValue();
   20201         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
   20202       }
   20203       break;
   20204     case X86ISD::VSRAI:
   20205       for (unsigned i=0; i!=NumElts; ++i) {
   20206         SDValue CurrentOp = SrcOp->getOperand(i);
   20207         if (CurrentOp->isUndef()) {
   20208           Elts.push_back(CurrentOp);
   20209           continue;
   20210         }
   20211         ND = cast<ConstantSDNode>(CurrentOp);
   20212         const APInt &C = ND->getAPIntValue();
   20213         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
   20214       }
   20215       break;
   20216     }
   20217 
   20218     return DAG.getBuildVector(VT, dl, Elts);
   20219   }
   20220 
   20221   return DAG.getNode(Opc, dl, VT, SrcOp,
   20222                      DAG.getConstant(ShiftAmt, dl, MVT::i8));
   20223 }
   20224 
   20225 /// Handle vector element shifts where the shift amount may or may not be a
   20226 /// constant. Takes immediate version of shift as input.
   20227 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
   20228                                    SDValue SrcOp, SDValue ShAmt,
   20229                                    const X86Subtarget &Subtarget,
   20230                                    SelectionDAG &DAG) {
   20231   MVT SVT = ShAmt.getSimpleValueType();
   20232   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
   20233 
   20234   // Catch shift-by-constant.
   20235   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
   20236     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
   20237                                       CShAmt->getZExtValue(), DAG);
   20238 
   20239   // Change opcode to non-immediate version
   20240   switch (Opc) {
   20241     default: llvm_unreachable("Unknown target vector shift node");
   20242     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
   20243     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
   20244     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
   20245   }
   20246 
   20247   // Need to build a vector containing shift amount.
    20248   // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
   20249   // +=================+============+=======================================+
   20250   // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
   20251   // +=================+============+=======================================+
   20252   // | i64             | Yes, No    | Use ShAmt as lowest elt               |
   20253   // | i32             | Yes        | zero-extend in-reg                    |
   20254   // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
   20255   // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
   20256   // +=================+============+=======================================+
   20257 
   20258   if (SVT == MVT::i64)
   20259     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
   20260   else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
   20261            ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
   20262     ShAmt = ShAmt.getOperand(0);
   20263     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
   20264     ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
   20265   } else if (Subtarget.hasSSE41() &&
   20266              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   20267     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
   20268     ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
   20269   } else {
   20270     SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
   20271                         DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
   20272     ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
   20273   }
   20274 
   20275   // The return type has to be a 128-bit type with the same element
   20276   // type as the input type.
   20277   MVT EltVT = VT.getVectorElementType();
   20278   MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
   20279 
   20280   ShAmt = DAG.getBitcast(ShVT, ShAmt);
   20281   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
   20282 }
   20283 
   20284 /// Return Mask with the necessary casting or extending
   20285 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
   20286 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
   20287                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
   20288                            const SDLoc &dl) {
   20289 
   20290   if (isAllOnesConstant(Mask))
   20291     return DAG.getConstant(1, dl, MaskVT);
   20292   if (X86::isZeroNode(Mask))
   20293     return DAG.getConstant(0, dl, MaskVT);
   20294 
   20295   if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
   20296     // Mask should be extended
   20297     Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
   20298                        MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
   20299   }
   20300 
   20301   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
   20302     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
   20303     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
    20304     // In 32-bit mode a bitcast of i64 is illegal, so split it into two i32 halves.
   20305     SDValue Lo, Hi;
   20306     Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
   20307                         DAG.getConstant(0, dl, MVT::i32));
   20308     Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
   20309                         DAG.getConstant(1, dl, MVT::i32));
   20310 
   20311     Lo = DAG.getBitcast(MVT::v32i1, Lo);
   20312     Hi = DAG.getBitcast(MVT::v32i1, Hi);
   20313 
   20314     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
   20315   } else {
   20316     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   20317                                      Mask.getSimpleValueType().getSizeInBits());
    20318     // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
   20319     // are extracted by EXTRACT_SUBVECTOR.
   20320     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
   20321                        DAG.getBitcast(BitcastVT, Mask),
   20322                        DAG.getIntPtrConstant(0, dl));
   20323   }
   20324 }
   20325 
   20326 /// Return (and \p Op, \p Mask) for compare instructions or
   20327 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
   20328 /// necessary casting or extending for \p Mask when lowering masking intrinsics
   20329 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
   20330                   SDValue PreservedSrc,
   20331                   const X86Subtarget &Subtarget,
   20332                   SelectionDAG &DAG) {
   20333   MVT VT = Op.getSimpleValueType();
   20334   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   20335   unsigned OpcodeSelect = ISD::VSELECT;
   20336   SDLoc dl(Op);
   20337 
   20338   if (isAllOnesConstant(Mask))
   20339     return Op;
   20340 
   20341   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   20342 
   20343   switch (Op.getOpcode()) {
   20344   default: break;
   20345   case X86ISD::CMPM:
   20346   case X86ISD::CMPM_RND:
   20347   case X86ISD::VPSHUFBITQMB:
   20348   case X86ISD::VFPCLASS:
   20349     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
   20350   case ISD::TRUNCATE:
   20351   case X86ISD::VTRUNC:
   20352   case X86ISD::VTRUNCS:
   20353   case X86ISD::VTRUNCUS:
   20354   case X86ISD::CVTPS2PH:
   20355     // We can't use ISD::VSELECT here because it is not always "Legal"
    20356     // for the destination type. For example, vpmovqb requires only AVX512,
    20357     // whereas a vselect that operates on byte elements requires BWI.
   20358     OpcodeSelect = X86ISD::SELECT;
   20359     break;
   20360   }
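            // If the pass-through operand is undef there is nothing to preserve; use an
            // explicit zero vector instead, presumably to match the zero-masking forms
            // of these intrinsics.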
   20361   if (PreservedSrc.isUndef())
   20362     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   20363   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
   20364 }
   20365 
   20366 /// Creates an SDNode for a predicated scalar operation.
   20367 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
    20368 /// The mask comes in as MVT::i8 and should be transformed
   20369 /// to MVT::v1i1 while lowering masking intrinsics.
   20370 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
   20371 /// "X86select" instead of "vselect". We just can't create the "vselect" node
   20372 /// for a scalar instruction.
   20373 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
   20374                                     SDValue PreservedSrc,
   20375                                     const X86Subtarget &Subtarget,
   20376                                     SelectionDAG &DAG) {
   20377 
   20378   if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
   20379     if (MaskConst->getZExtValue() & 0x1)
   20380       return Op;
   20381 
   20382   MVT VT = Op.getSimpleValueType();
   20383   SDLoc dl(Op);
   20384 
    20385   assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
   20386   SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
   20387   if (Op.getOpcode() == X86ISD::FSETCCM ||
   20388       Op.getOpcode() == X86ISD::FSETCCM_RND ||
   20389       Op.getOpcode() == X86ISD::VFPCLASSS)
   20390     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
   20391 
   20392   if (PreservedSrc.isUndef())
   20393     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   20394   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
   20395 }
   20396 
   20397 static int getSEHRegistrationNodeSize(const Function *Fn) {
   20398   if (!Fn->hasPersonalityFn())
   20399     report_fatal_error(
   20400         "querying registration node size for function without personality");
   20401   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
   20402   // WinEHStatePass for the full struct definition.
   20403   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
   20404   case EHPersonality::MSVC_X86SEH: return 24;
   20405   case EHPersonality::MSVC_CXX: return 16;
   20406   default: break;
   20407   }
   20408   report_fatal_error(
   20409       "can only recover FP for 32-bit MSVC EH personality functions");
   20410 }
   20411 
   20412 /// When the MSVC runtime transfers control to us, either to an outlined
   20413 /// function or when returning to a parent frame after catching an exception, we
   20414 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
   20415 /// Here's the math:
   20416 ///   RegNodeBase = EntryEBP - RegNodeSize
   20417 ///   ParentFP = RegNodeBase - ParentFrameOffset
   20418 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
   20419 /// subtracting the offset (negative on x86) takes us back to the parent FP.
   20420 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
   20421                                    SDValue EntryEBP) {
   20422   MachineFunction &MF = DAG.getMachineFunction();
   20423   SDLoc dl;
   20424 
   20425   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   20426   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   20427 
   20428   // It's possible that the parent function no longer has a personality function
   20429   // if the exceptional code was optimized away, in which case we just return
   20430   // the incoming EBP.
   20431   if (!Fn->hasPersonalityFn())
   20432     return EntryEBP;
   20433 
   20434   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
   20435   // registration, or the .set_setframe offset.
   20436   MCSymbol *OffsetSym =
   20437       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
   20438           GlobalValue::dropLLVMManglingEscape(Fn->getName()));
   20439   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
   20440   SDValue ParentFrameOffset =
   20441       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
   20442 
   20443   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
   20444   // prologue to RBP in the parent function.
   20445   const X86Subtarget &Subtarget =
   20446       static_cast<const X86Subtarget &>(DAG.getSubtarget());
   20447   if (Subtarget.is64Bit())
   20448     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
   20449 
   20450   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
   20451   // RegNodeBase = EntryEBP - RegNodeSize
   20452   // ParentFP = RegNodeBase - ParentFrameOffset
   20453   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
   20454                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
   20455   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
   20456 }
   20457 
   20458 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   20459                                                    SelectionDAG &DAG) const {
   20460   // Helper to detect if the operand is CUR_DIRECTION rounding mode.
   20461   auto isRoundModeCurDirection = [](SDValue Rnd) {
   20462     if (!isa<ConstantSDNode>(Rnd))
   20463       return false;
   20464 
   20465     unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
   20466     return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
   20467   };
   20468 
   20469   SDLoc dl(Op);
   20470   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   20471   MVT VT = Op.getSimpleValueType();
   20472   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
   20473   if (IntrData) {
   20474     switch(IntrData->Type) {
   20475     case INTR_TYPE_1OP: {
   20476       // We specify 2 possible opcodes for intrinsics with rounding modes.
   20477       // First, we check if the intrinsic may have non-default rounding mode,
   20478       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
   20479       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20480       if (IntrWithRoundingModeOpcode != 0) {
   20481         SDValue Rnd = Op.getOperand(2);
   20482         if (!isRoundModeCurDirection(Rnd)) {
   20483           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
   20484                              Op.getOperand(1), Rnd);
   20485         }
   20486       }
   20487       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
   20488     }
   20489     case INTR_TYPE_2OP:
   20490     case INTR_TYPE_2OP_IMM8: {
   20491       SDValue Src2 = Op.getOperand(2);
   20492 
   20493       if (IntrData->Type == INTR_TYPE_2OP_IMM8)
   20494         Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
   20495 
   20496       // We specify 2 possible opcodes for intrinsics with rounding modes.
   20497       // First, we check if the intrinsic may have non-default rounding mode,
   20498       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
   20499       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20500       if (IntrWithRoundingModeOpcode != 0) {
   20501         SDValue Rnd = Op.getOperand(3);
   20502         if (!isRoundModeCurDirection(Rnd)) {
   20503           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
   20504                              Op.getOperand(1), Src2, Rnd);
   20505         }
   20506       }
   20507 
   20508       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
   20509                          Op.getOperand(1), Src2);
   20510     }
   20511     case INTR_TYPE_3OP:
   20512     case INTR_TYPE_3OP_IMM8: {
   20513       SDValue Src1 = Op.getOperand(1);
   20514       SDValue Src2 = Op.getOperand(2);
   20515       SDValue Src3 = Op.getOperand(3);
   20516 
   20517       if (IntrData->Type == INTR_TYPE_3OP_IMM8)
   20518         Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
   20519 
   20520       // We specify 2 possible opcodes for intrinsics with rounding modes.
   20521       // First, we check if the intrinsic may have non-default rounding mode,
   20522       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
   20523       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20524       if (IntrWithRoundingModeOpcode != 0) {
   20525         SDValue Rnd = Op.getOperand(4);
   20526         if (!isRoundModeCurDirection(Rnd)) {
   20527           return DAG.getNode(IntrWithRoundingModeOpcode,
   20528                              dl, Op.getValueType(),
   20529                              Src1, Src2, Src3, Rnd);
   20530         }
   20531       }
   20532 
   20533       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
   20534                          Src1, Src2, Src3);
   20535     }
   20536     case INTR_TYPE_4OP:
   20537       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
   20538         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
   20539     case INTR_TYPE_1OP_MASK_RM: {
   20540       SDValue Src = Op.getOperand(1);
   20541       SDValue PassThru = Op.getOperand(2);
   20542       SDValue Mask = Op.getOperand(3);
   20543       SDValue RoundingMode;
   20544       // We always add rounding mode to the Node.
   20545       // If the rounding mode is not specified, we add the
   20546       // "current direction" mode.
   20547       if (Op.getNumOperands() == 4)
   20548         RoundingMode =
   20549           DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   20550       else
   20551         RoundingMode = Op.getOperand(4);
   20552       assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
   20553       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
   20554                                               RoundingMode),
   20555                                   Mask, PassThru, Subtarget, DAG);
   20556     }
   20557     case INTR_TYPE_1OP_MASK: {
   20558       SDValue Src = Op.getOperand(1);
   20559       SDValue PassThru = Op.getOperand(2);
   20560       SDValue Mask = Op.getOperand(3);
   20561       // We add rounding mode to the Node when
   20562       //   - RM Opcode is specified and
   20563       //   - RM is not "current direction".
   20564       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20565       if (IntrWithRoundingModeOpcode != 0) {
   20566         SDValue Rnd = Op.getOperand(4);
   20567         if (!isRoundModeCurDirection(Rnd)) {
   20568           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   20569                                       dl, Op.getValueType(),
   20570                                       Src, Rnd),
   20571                                       Mask, PassThru, Subtarget, DAG);
   20572         }
   20573       }
   20574       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
   20575                                   Mask, PassThru, Subtarget, DAG);
   20576     }
   20577     case INTR_TYPE_SCALAR_MASK: {
   20578       SDValue Src1 = Op.getOperand(1);
   20579       SDValue Src2 = Op.getOperand(2);
   20580       SDValue passThru = Op.getOperand(3);
   20581       SDValue Mask = Op.getOperand(4);
   20582       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20583       // There are 2 kinds of intrinsics in this group:
    20584       // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
   20585       // (2) With rounding mode and sae - 7 operands.
   20586       bool HasRounding = IntrWithRoundingModeOpcode != 0;
   20587       if (Op.getNumOperands() == (5U + HasRounding)) {
   20588         if (HasRounding) {
   20589           SDValue Rnd = Op.getOperand(5);
   20590           if (!isRoundModeCurDirection(Rnd))
   20591             return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   20592                                                     dl, VT, Src1, Src2, Rnd),
   20593                                         Mask, passThru, Subtarget, DAG);
   20594         }
   20595         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
   20596                                                 Src2),
   20597                                     Mask, passThru, Subtarget, DAG);
   20598       }
   20599 
   20600       assert(Op.getNumOperands() == (6U + HasRounding) &&
   20601              "Unexpected intrinsic form");
   20602       SDValue RoundingMode = Op.getOperand(5);
   20603       if (HasRounding) {
   20604         SDValue Sae = Op.getOperand(6);
   20605         if (!isRoundModeCurDirection(Sae))
   20606           return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   20607                                                   dl, VT, Src1, Src2,
   20608                                                   RoundingMode, Sae),
   20609                                       Mask, passThru, Subtarget, DAG);
   20610       }
   20611       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
   20612                                               Src2, RoundingMode),
   20613                                   Mask, passThru, Subtarget, DAG);
   20614     }
   20615     case INTR_TYPE_SCALAR_MASK_RM: {
   20616       SDValue Src1 = Op.getOperand(1);
   20617       SDValue Src2 = Op.getOperand(2);
   20618       SDValue Src0 = Op.getOperand(3);
   20619       SDValue Mask = Op.getOperand(4);
   20620       // There are 2 kinds of intrinsics in this group:
    20621       // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
   20622       // (2) With rounding mode and sae - 7 operands.
   20623       if (Op.getNumOperands() == 6) {
   20624         SDValue Sae  = Op.getOperand(5);
   20625         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
   20626                                                 Sae),
   20627                                     Mask, Src0, Subtarget, DAG);
   20628       }
   20629       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
   20630       SDValue RoundingMode  = Op.getOperand(5);
   20631       SDValue Sae  = Op.getOperand(6);
   20632       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
   20633                                               RoundingMode, Sae),
   20634                                   Mask, Src0, Subtarget, DAG);
   20635     }
   20636     case INTR_TYPE_2OP_MASK: {
   20637       SDValue Src1 = Op.getOperand(1);
   20638       SDValue Src2 = Op.getOperand(2);
   20639       SDValue PassThru = Op.getOperand(3);
   20640       SDValue Mask = Op.getOperand(4);
   20641 
   20642       // We specify 2 possible opcodes for intrinsics with rounding modes.
   20643       // First, we check if the intrinsic may have non-default rounding mode,
   20644       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
   20645       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20646       if (IntrWithRoundingModeOpcode != 0) {
   20647         SDValue Rnd = Op.getOperand(5);
   20648         if (!isRoundModeCurDirection(Rnd)) {
   20649           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   20650                                       dl, Op.getValueType(),
   20651                                       Src1, Src2, Rnd),
   20652                                       Mask, PassThru, Subtarget, DAG);
   20653         }
   20654       }
   20655       // TODO: Intrinsics should have fast-math-flags to propagate.
   20656       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
   20657                                   Mask, PassThru, Subtarget, DAG);
   20658     }
   20659     case INTR_TYPE_2OP_MASK_RM: {
   20660       SDValue Src1 = Op.getOperand(1);
   20661       SDValue Src2 = Op.getOperand(2);
   20662       SDValue PassThru = Op.getOperand(3);
   20663       SDValue Mask = Op.getOperand(4);
   20664       // We specify 2 possible modes for intrinsics, with/without rounding
   20665       // modes.
   20666       // First, we check if the intrinsic has a rounding mode (6 operands);
   20667       // if not, we set the rounding mode to "current".
   20668       SDValue Rnd;
   20669       if (Op.getNumOperands() == 6)
   20670         Rnd = Op.getOperand(5);
   20671       else
   20672         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   20673       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   20674                                               Src1, Src2, Rnd),
   20675                                   Mask, PassThru, Subtarget, DAG);
   20676     }
   20677     case INTR_TYPE_3OP_SCALAR_MASK: {
   20678       SDValue Src1 = Op.getOperand(1);
   20679       SDValue Src2 = Op.getOperand(2);
   20680       SDValue Src3 = Op.getOperand(3);
   20681       SDValue PassThru = Op.getOperand(4);
   20682       SDValue Mask = Op.getOperand(5);
   20683 
   20684       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20685       if (IntrWithRoundingModeOpcode != 0) {
   20686         SDValue Rnd = Op.getOperand(6);
   20687         if (!isRoundModeCurDirection(Rnd))
   20688           return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   20689                                                   dl, VT, Src1, Src2, Src3, Rnd),
   20690                                       Mask, PassThru, Subtarget, DAG);
   20691       }
   20692       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
   20693                                               Src2, Src3),
   20694                                   Mask, PassThru, Subtarget, DAG);
   20695     }
   20696     case INTR_TYPE_3OP_MASK: {
   20697       SDValue Src1 = Op.getOperand(1);
   20698       SDValue Src2 = Op.getOperand(2);
   20699       SDValue Src3 = Op.getOperand(3);
   20700       SDValue PassThru = Op.getOperand(4);
   20701       SDValue Mask = Op.getOperand(5);
   20702 
   20703       // We specify 2 possible opcodes for intrinsics with rounding modes.
   20704       // First, we check if the intrinsic may have a non-default rounding mode
   20705       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
   20706       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20707       if (IntrWithRoundingModeOpcode != 0) {
   20708         SDValue Rnd = Op.getOperand(6);
   20709         if (!isRoundModeCurDirection(Rnd)) {
   20710           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   20711                                       dl, Op.getValueType(),
   20712                                       Src1, Src2, Src3, Rnd),
   20713                                       Mask, PassThru, Subtarget, DAG);
   20714         }
   20715       }
   20716       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   20717                                               Src1, Src2, Src3),
   20718                                   Mask, PassThru, Subtarget, DAG);
   20719     }
   20720     case VPERM_2OP : {
   20721       SDValue Src1 = Op.getOperand(1);
   20722       SDValue Src2 = Op.getOperand(2);
   20723 
   20724       // Swap Src1 and Src2 in the node creation.
   20725       return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
   20726     }
   20727     case FMA_OP_MASKZ:
   20728     case FMA_OP_MASK: {
   20729       SDValue Src1 = Op.getOperand(1);
   20730       SDValue Src2 = Op.getOperand(2);
   20731       SDValue Src3 = Op.getOperand(3);
   20732       SDValue Mask = Op.getOperand(4);
   20733       MVT VT = Op.getSimpleValueType();
   20734       SDValue PassThru = SDValue();
   20735 
   20736       // Set the PassThru element.
   20737       if (IntrData->Type == FMA_OP_MASKZ)
   20738         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
   20739       else
   20740         PassThru = Src1;
   20741 
   20742       // We specify 2 possible opcodes for intrinsics with rounding modes.
   20743       // First, we check if the intrinsic may have a non-default rounding mode
   20744       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
   20745       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20746       if (IntrWithRoundingModeOpcode != 0) {
   20747         SDValue Rnd = Op.getOperand(5);
   20748         if (!isRoundModeCurDirection(Rnd))
   20749           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   20750                                                   dl, Op.getValueType(),
   20751                                                   Src1, Src2, Src3, Rnd),
   20752                                       Mask, PassThru, Subtarget, DAG);
   20753       }
   20754       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
   20755                                               dl, Op.getValueType(),
   20756                                               Src1, Src2, Src3),
   20757                                   Mask, PassThru, Subtarget, DAG);
   20758     }
   20759     case IFMA_OP:
   20760       // NOTE: We need to swizzle the operands to pass the multiply operands
   20761       // first.
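               // (The intrinsic takes the accumulator as its first operand; building
               // the node as (Op2, Op3, Op1) puts the multiplicands first and the
               // accumulator last.)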
   20762       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
   20763                          Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
   20764     case CVTPD2PS:
   20765       // ISD::FP_ROUND has a second argument that indicates if the truncation
   20766       // does not change the value. Set it to 0 since it can change.
   20767       return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
   20768                          DAG.getIntPtrConstant(0, dl));
   20769     case CVTPD2PS_MASK: {
   20770       SDValue Src = Op.getOperand(1);
   20771       SDValue PassThru = Op.getOperand(2);
   20772       SDValue Mask = Op.getOperand(3);
   20773       // We add the rounding mode to the node when
   20774       //   - the RM opcode is specified, and
   20775       //   - RM is not "current direction".
   20776       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
   20777       if (IntrWithRoundingModeOpcode != 0) {
   20778         SDValue Rnd = Op.getOperand(4);
   20779         if (!isRoundModeCurDirection(Rnd)) {
   20780           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
   20781                                       dl, Op.getValueType(),
   20782                                       Src, Rnd),
   20783                                       Mask, PassThru, Subtarget, DAG);
   20784         }
   20785       }
   20786       assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
   20787       // ISD::FP_ROUND has a second argument that indicates if the truncation
   20788       // does not change the value. Set it to 0 since it can change.
   20789       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
   20790                                               DAG.getIntPtrConstant(0, dl)),
   20791                                   Mask, PassThru, Subtarget, DAG);
   20792     }
   20793     case FPCLASS: {
   20794       // FPclass intrinsics
   20795       SDValue Src1 = Op.getOperand(1);
   20796       MVT MaskVT = Op.getSimpleValueType();
   20797       SDValue Imm = Op.getOperand(2);
   20798       return DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
   20799     }
   20800     case FPCLASSS: {
   20801       SDValue Src1 = Op.getOperand(1);
   20802       SDValue Imm = Op.getOperand(2);
   20803       SDValue Mask = Op.getOperand(3);
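               // Shape of the lowering (illustrative):
               //   v1i1 Class  = <IntrData->Opc0> Src1, Imm
               //   v1i1 Masked = getScalarMaskingNode(Class, Mask)  // apply scalar mask
               //   v8i1 Wide   = insert_subvector zero, Masked, 0   // upper bits zero
               //   i8   Result = bitcast Wide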
   20804       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
   20805       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
   20806                                                  Subtarget, DAG);
   20807       // Need to fill with zeros to ensure the bitcast will produce zeroes
   20808       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
   20809       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
   20810                                 DAG.getConstant(0, dl, MVT::v8i1),
   20811                                 FPclassMask, DAG.getIntPtrConstant(0, dl));
   20812       return DAG.getBitcast(MVT::i8, Ins);
   20813     }
   20814     case CMP_MASK: {
   20815       // Comparison intrinsics with masks.
   20816       // Example of transformation:
   20817       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
   20818       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
   20819       // (i8 (bitcast
   20820       //   (v8i1 (insert_subvector zero,
   20821       //           (v2i1 (and (PCMPEQM %a, %b),
   20822       //                      (extract_subvector
   20823       //                         (v8i1 (bitcast %mask)), 0))), 0))))
   20824       MVT VT = Op.getOperand(1).getSimpleValueType();
   20825       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   20826       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
   20827       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
   20828                                        Mask.getSimpleValueType().getSizeInBits());
   20829       SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
   20830                                 Op.getOperand(2));
   20831       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
   20832                                              Subtarget, DAG);
   20833       // Need to fill with zeros to ensure the bitcast will produce zeroes
   20834       // for the upper bits in the v2i1/v4i1 case.
   20835       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
   20836                                 DAG.getConstant(0, dl, BitcastVT),
   20837                                 CmpMask, DAG.getIntPtrConstant(0, dl));
   20838       return DAG.getBitcast(Op.getValueType(), Res);
   20839     }
   20840 
   20841     case CMP_MASK_CC: {
   20842       MVT MaskVT = Op.getSimpleValueType();
   20843       SDValue Cmp;
   20844       SDValue CC = Op.getOperand(3);
   20845       CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
   20846       // We specify 2 possible opcodes for intrinsics with rounding modes.
   20847       // First, we check if the intrinsic may have a non-default rounding mode
   20848       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
   20849       if (IntrData->Opc1 != 0) {
   20850         SDValue Rnd = Op.getOperand(4);
   20851         if (!isRoundModeCurDirection(Rnd))
   20852           Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
   20853                             Op.getOperand(2), CC, Rnd);
   20854       }
   20855       // Default rounding mode.
   20856       if (!Cmp.getNode())
   20857         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
   20858                           Op.getOperand(2), CC);
   20859 
   20860       return Cmp;
   20861     }
   20862     case CMP_MASK_SCALAR_CC: {
   20863       SDValue Src1 = Op.getOperand(1);
   20864       SDValue Src2 = Op.getOperand(2);
   20865       SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
   20866       SDValue Mask = Op.getOperand(4);
   20867 
   20868       SDValue Cmp;
   20869       if (IntrData->Opc1 != 0) {
   20870         SDValue Rnd = Op.getOperand(5);
   20871         if (!isRoundModeCurDirection(Rnd))
   20872           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
   20873       }
   20874       // Default rounding mode.
   20875       if (!Cmp.getNode())
   20876         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
   20877 
   20878       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
   20879                                              Subtarget, DAG);
   20880       // Need to fill with zeros to ensure the bitcast will produce zeroes
   20881       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
   20882       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
   20883                                 DAG.getConstant(0, dl, MVT::v8i1),
   20884                                 CmpMask, DAG.getIntPtrConstant(0, dl));
   20885       return DAG.getBitcast(MVT::i8, Ins);
   20886     }
   20887     case COMI: { // Comparison intrinsics
   20888       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
   20889       SDValue LHS = Op.getOperand(1);
   20890       SDValue RHS = Op.getOperand(2);
   20891       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
   20892       SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
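               // For example (illustrative): a COMI-type intrinsic whose CC is SETEQ
               // lowers via the SETEQ case below to
               //   zext(i8 (and (setcc COND_E, Comi), (setcc COND_NP, Comi)))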
   20893       SDValue SetCC;
   20894       switch (CC) {
   20895       case ISD::SETEQ: { // (ZF = 1 and PF = 0)
   20896         SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
   20897         SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
   20898         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
   20899         break;
   20900       }
   20901       case ISD::SETNE: { // (ZF = 0 or PF = 1)
   20902         SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
   20903         SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
   20904         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
   20905         break;
   20906       }
   20907       case ISD::SETGT: // (CF = 0 and ZF = 0)
   20908         SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
   20909         break;
   20910       case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
   20911         SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
   20912         break;
   20913       }
   20914       case ISD::SETGE: // CF = 0
   20915         SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
   20916         break;
   20917       case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
   20918         SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
   20919         break;
   20920       default:
   20921         llvm_unreachable("Unexpected illegal condition!");
   20922       }
   20923       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   20924     }
   20925     case COMI_RM: { // Comparison intrinsics with Sae
   20926       SDValue LHS = Op.getOperand(1);
   20927       SDValue RHS = Op.getOperand(2);
   20928       unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
   20929       SDValue Sae = Op.getOperand(4);
   20930 
   20931       SDValue FCmp;
   20932       if (isRoundModeCurDirection(Sae))
   20933         FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
   20934                            DAG.getConstant(CondVal, dl, MVT::i8));
   20935       else
   20936         FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
   20937                            DAG.getConstant(CondVal, dl, MVT::i8), Sae);
   20938       // Need to fill with zeros to ensure the bitcast will produce zeroes
   20939       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
   20940       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
   20941                                 DAG.getConstant(0, dl, MVT::v16i1),
   20942                                 FCmp, DAG.getIntPtrConstant(0, dl));
   20943       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
   20944                          DAG.getBitcast(MVT::i16, Ins));
   20945     }
   20946     case VSHIFT:
   20947       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
   20948                                  Op.getOperand(1), Op.getOperand(2), Subtarget,
   20949                                  DAG);
   20950     case COMPRESS_EXPAND_IN_REG: {
   20951       SDValue Mask = Op.getOperand(3);
   20952       SDValue DataToCompress = Op.getOperand(1);
   20953       SDValue PassThru = Op.getOperand(2);
   20954       if (isAllOnesConstant(Mask)) // return data as is
   20955         return Op.getOperand(1);
   20956 
   20957       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   20958                                               DataToCompress),
   20959                                   Mask, PassThru, Subtarget, DAG);
   20960     }
   20961     case FIXUPIMMS:
   20962     case FIXUPIMMS_MASKZ:
   20963     case FIXUPIMM:
   20964     case FIXUPIMM_MASKZ:{
   20965       SDValue Src1 = Op.getOperand(1);
   20966       SDValue Src2 = Op.getOperand(2);
   20967       SDValue Src3 = Op.getOperand(3);
   20968       SDValue Imm = Op.getOperand(4);
   20969       SDValue Mask = Op.getOperand(5);
   20970       SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ?
   20971                              Src1 : getZeroVector(VT, Subtarget, DAG, dl);
   20972       // We specify 2 possible modes for intrinsics, with/without rounding
   20973       // modes.
   20974       // First, we check if the intrinsic has a rounding mode (7 operands);
   20975       // if not, we set the rounding mode to "current".
   20976       SDValue Rnd;
   20977       if (Op.getNumOperands() == 7)
   20978         Rnd = Op.getOperand(6);
   20979       else
   20980         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
   20981       if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
   20982         return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   20983                                                 Src1, Src2, Src3, Imm, Rnd),
   20984                                     Mask, Passthru, Subtarget, DAG);
   20985       else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
   20986         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
   20987                                        Src1, Src2, Src3, Imm, Rnd),
   20988                                     Mask, Passthru, Subtarget, DAG);
   20989     }
   20990     case ROUNDP: {
   20991       assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
   20992       // Clear the upper bits of the rounding immediate so that the legacy
   20993       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
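               // (Assumed immediate layout: bits [7:4] of the VRNDSCALE immediate select
               // the fixed-point scale M, so masking with 0xf keeps only the
               // rounding-control bits that the legacy ROUND* intrinsics expose.)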
   20994       SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
   20995                                          Op.getOperand(2),
   20996                                          DAG.getConstant(0xf, dl, MVT::i32));
   20997       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
   20998                          Op.getOperand(1), RoundingMode);
   20999     }
   21000     case ROUNDS: {
   21001       assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
   21002       // Clear the upper bits of the rounding immediate so that the legacy
   21003       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
   21004       SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
   21005                                          Op.getOperand(3),
   21006                                          DAG.getConstant(0xf, dl, MVT::i32));
   21007       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
   21008                          Op.getOperand(1), Op.getOperand(2), RoundingMode);
   21009     }
   21010     default:
   21011       break;
   21012     }
   21013   }
   21014 
   21015   switch (IntNo) {
   21016   default: return SDValue();    // Don't custom lower most intrinsics.
   21017 
   21018   // ptest and testp intrinsics. The intrinsics these come from are designed to
   21019   // return an integer value, not just an instruction, so lower them to the
   21020   // ptest or testp pattern and a setcc for the result.
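           // For example (illustrative):
           //   (i32 (int_x86_sse41_ptestz (v2i64 %a), (v2i64 %b)))
           // becomes roughly
           //   (i32 (zext (i8 (setcc COND_E, (PTEST %a, %b)))))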
   21021   case Intrinsic::x86_sse41_ptestz:
   21022   case Intrinsic::x86_sse41_ptestc:
   21023   case Intrinsic::x86_sse41_ptestnzc:
   21024   case Intrinsic::x86_avx_ptestz_256:
   21025   case Intrinsic::x86_avx_ptestc_256:
   21026   case Intrinsic::x86_avx_ptestnzc_256:
   21027   case Intrinsic::x86_avx_vtestz_ps:
   21028   case Intrinsic::x86_avx_vtestc_ps:
   21029   case Intrinsic::x86_avx_vtestnzc_ps:
   21030   case Intrinsic::x86_avx_vtestz_pd:
   21031   case Intrinsic::x86_avx_vtestc_pd:
   21032   case Intrinsic::x86_avx_vtestnzc_pd:
   21033   case Intrinsic::x86_avx_vtestz_ps_256:
   21034   case Intrinsic::x86_avx_vtestc_ps_256:
   21035   case Intrinsic::x86_avx_vtestnzc_ps_256:
   21036   case Intrinsic::x86_avx_vtestz_pd_256:
   21037   case Intrinsic::x86_avx_vtestc_pd_256:
   21038   case Intrinsic::x86_avx_vtestnzc_pd_256: {
   21039     bool IsTestPacked = false;
   21040     X86::CondCode X86CC;
   21041     switch (IntNo) {
   21042     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
   21043     case Intrinsic::x86_avx_vtestz_ps:
   21044     case Intrinsic::x86_avx_vtestz_pd:
   21045     case Intrinsic::x86_avx_vtestz_ps_256:
   21046     case Intrinsic::x86_avx_vtestz_pd_256:
   21047       IsTestPacked = true;
   21048       LLVM_FALLTHROUGH;
   21049     case Intrinsic::x86_sse41_ptestz:
   21050     case Intrinsic::x86_avx_ptestz_256:
   21051       // ZF = 1
   21052       X86CC = X86::COND_E;
   21053       break;
   21054     case Intrinsic::x86_avx_vtestc_ps:
   21055     case Intrinsic::x86_avx_vtestc_pd:
   21056     case Intrinsic::x86_avx_vtestc_ps_256:
   21057     case Intrinsic::x86_avx_vtestc_pd_256:
   21058       IsTestPacked = true;
   21059       LLVM_FALLTHROUGH;
   21060     case Intrinsic::x86_sse41_ptestc:
   21061     case Intrinsic::x86_avx_ptestc_256:
   21062       // CF = 1
   21063       X86CC = X86::COND_B;
   21064       break;
   21065     case Intrinsic::x86_avx_vtestnzc_ps:
   21066     case Intrinsic::x86_avx_vtestnzc_pd:
   21067     case Intrinsic::x86_avx_vtestnzc_ps_256:
   21068     case Intrinsic::x86_avx_vtestnzc_pd_256:
   21069       IsTestPacked = true;
   21070       LLVM_FALLTHROUGH;
   21071     case Intrinsic::x86_sse41_ptestnzc:
   21072     case Intrinsic::x86_avx_ptestnzc_256:
   21073       // ZF and CF = 0
   21074       X86CC = X86::COND_A;
   21075       break;
   21076     }
   21077 
   21078     SDValue LHS = Op.getOperand(1);
   21079     SDValue RHS = Op.getOperand(2);
   21080     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
   21081     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
   21082     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
   21083     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   21084   }
   21085 
   21086   case Intrinsic::x86_sse42_pcmpistria128:
   21087   case Intrinsic::x86_sse42_pcmpestria128:
   21088   case Intrinsic::x86_sse42_pcmpistric128:
   21089   case Intrinsic::x86_sse42_pcmpestric128:
   21090   case Intrinsic::x86_sse42_pcmpistrio128:
   21091   case Intrinsic::x86_sse42_pcmpestrio128:
   21092   case Intrinsic::x86_sse42_pcmpistris128:
   21093   case Intrinsic::x86_sse42_pcmpestris128:
   21094   case Intrinsic::x86_sse42_pcmpistriz128:
   21095   case Intrinsic::x86_sse42_pcmpestriz128: {
   21096     unsigned Opcode;
   21097     X86::CondCode X86CC;
   21098     switch (IntNo) {
   21099     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
   21100     case Intrinsic::x86_sse42_pcmpistria128:
   21101       Opcode = X86ISD::PCMPISTR;
   21102       X86CC = X86::COND_A;
   21103       break;
   21104     case Intrinsic::x86_sse42_pcmpestria128:
   21105       Opcode = X86ISD::PCMPESTR;
   21106       X86CC = X86::COND_A;
   21107       break;
   21108     case Intrinsic::x86_sse42_pcmpistric128:
   21109       Opcode = X86ISD::PCMPISTR;
   21110       X86CC = X86::COND_B;
   21111       break;
   21112     case Intrinsic::x86_sse42_pcmpestric128:
   21113       Opcode = X86ISD::PCMPESTR;
   21114       X86CC = X86::COND_B;
   21115       break;
   21116     case Intrinsic::x86_sse42_pcmpistrio128:
   21117       Opcode = X86ISD::PCMPISTR;
   21118       X86CC = X86::COND_O;
   21119       break;
   21120     case Intrinsic::x86_sse42_pcmpestrio128:
   21121       Opcode = X86ISD::PCMPESTR;
   21122       X86CC = X86::COND_O;
   21123       break;
   21124     case Intrinsic::x86_sse42_pcmpistris128:
   21125       Opcode = X86ISD::PCMPISTR;
   21126       X86CC = X86::COND_S;
   21127       break;
   21128     case Intrinsic::x86_sse42_pcmpestris128:
   21129       Opcode = X86ISD::PCMPESTR;
   21130       X86CC = X86::COND_S;
   21131       break;
   21132     case Intrinsic::x86_sse42_pcmpistriz128:
   21133       Opcode = X86ISD::PCMPISTR;
   21134       X86CC = X86::COND_E;
   21135       break;
   21136     case Intrinsic::x86_sse42_pcmpestriz128:
   21137       Opcode = X86ISD::PCMPESTR;
   21138       X86CC = X86::COND_E;
   21139       break;
   21140     }
   21141     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   21142     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
   21143     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
   21144     SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
   21145     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   21146   }
   21147 
   21148   case Intrinsic::x86_sse42_pcmpistri128:
   21149   case Intrinsic::x86_sse42_pcmpestri128: {
   21150     unsigned Opcode;
   21151     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
   21152       Opcode = X86ISD::PCMPISTR;
   21153     else
   21154       Opcode = X86ISD::PCMPESTR;
   21155 
   21156     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   21157     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
   21158     return DAG.getNode(Opcode, dl, VTs, NewOps);
   21159   }
   21160 
   21161   case Intrinsic::x86_sse42_pcmpistrm128:
   21162   case Intrinsic::x86_sse42_pcmpestrm128: {
   21163     unsigned Opcode;
   21164     if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
   21165       Opcode = X86ISD::PCMPISTR;
   21166     else
   21167       Opcode = X86ISD::PCMPESTR;
   21168 
   21169     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
   21170     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
   21171     return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
   21172   }
   21173 
   21174   case Intrinsic::eh_sjlj_lsda: {
   21175     MachineFunction &MF = DAG.getMachineFunction();
   21176     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   21177     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   21178     auto &Context = MF.getMMI().getContext();
   21179     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
   21180                                             Twine(MF.getFunctionNumber()));
   21181     return DAG.getNode(getGlobalWrapperKind(), dl, VT,
   21182                        DAG.getMCSymbol(S, PtrVT));
   21183   }
   21184 
   21185   case Intrinsic::x86_seh_lsda: {
   21186     // Compute the symbol for the LSDA. We know it'll get emitted later.
   21187     MachineFunction &MF = DAG.getMachineFunction();
   21188     SDValue Op1 = Op.getOperand(1);
   21189     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
   21190     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
   21191         GlobalValue::dropLLVMManglingEscape(Fn->getName()));
   21192 
   21193     // Generate a simple absolute symbol reference. This intrinsic is only
   21194     // supported on 32-bit Windows, which isn't PIC.
   21195     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
   21196     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
   21197   }
   21198 
   21199   case Intrinsic::x86_seh_recoverfp: {
   21200     SDValue FnOp = Op.getOperand(1);
   21201     SDValue IncomingFPOp = Op.getOperand(2);
   21202     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
   21203     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
   21204     if (!Fn)
   21205       report_fatal_error(
   21206           "llvm.x86.seh.recoverfp must take a function as the first argument");
   21207     return recoverFramePointer(DAG, Fn, IncomingFPOp);
   21208   }
   21209 
   21210   case Intrinsic::localaddress: {
   21211     // Returns one of the stack, base, or frame pointer registers, depending on
   21212     // which is used to reference local variables.
   21213     MachineFunction &MF = DAG.getMachineFunction();
   21214     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   21215     unsigned Reg;
   21216     if (RegInfo->hasBasePointer(MF))
   21217       Reg = RegInfo->getBaseRegister();
   21218     else // This function handles the SP or FP case.
   21219       Reg = RegInfo->getPtrSizedFrameRegister(MF);
   21220     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
   21221   }
   21222   }
   21223 }
   21224 
   21225 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   21226                                  SDValue Src, SDValue Mask, SDValue Base,
   21227                                  SDValue Index, SDValue ScaleOp, SDValue Chain,
   21228                                  const X86Subtarget &Subtarget) {
   21229   SDLoc dl(Op);
   21230   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
   21231   // Scale must be constant.
   21232   if (!C)
   21233     return SDValue();
   21234   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   21235   EVT MaskVT = Mask.getValueType();
   21236   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   21237   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   21238   SDValue Segment = DAG.getRegister(0, MVT::i32);
   21239   // If source is undef or we know it won't be used, use a zero vector
   21240   // to break register dependency.
   21241   // TODO: use undef instead and let BreakFalseDeps deal with it?
   21242   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
   21243     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
   21244   SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
   21245   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   21246   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
   21247   return DAG.getMergeValues(RetOps, dl);
   21248 }
   21249 
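         // Unlike getAVX2GatherNode above, this AVX-512 variant converts the incoming
         // integer mask into a vXi1 mask node (getMaskNode) sized by the index vector
         // and places that mask immediately after the source in the operand list.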
   21250 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   21251                               SDValue Src, SDValue Mask, SDValue Base,
   21252                               SDValue Index, SDValue ScaleOp, SDValue Chain,
   21253                               const X86Subtarget &Subtarget) {
   21254   SDLoc dl(Op);
   21255   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
   21256   // Scale must be constant.
   21257   if (!C)
   21258     return SDValue();
   21259   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   21260   MVT MaskVT = MVT::getVectorVT(MVT::i1,
   21261                              Index.getSimpleValueType().getVectorNumElements());
   21262 
   21263   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   21264   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   21265   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   21266   SDValue Segment = DAG.getRegister(0, MVT::i32);
   21267   // If source is undef or we know it won't be used, use a zero vector
   21268   // to break register dependency.
   21269   // TODO: use undef instead and let BreakFalseDeps deal with it?
   21270   if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
   21271     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
   21272   SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
   21273   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   21274   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
   21275   return DAG.getMergeValues(RetOps, dl);
   21276 }
   21277 
   21278 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   21279                                SDValue Src, SDValue Mask, SDValue Base,
   21280                                SDValue Index, SDValue ScaleOp, SDValue Chain,
   21281                                const X86Subtarget &Subtarget) {
   21282   SDLoc dl(Op);
   21283   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
   21284   // Scale must be constant.
   21285   if (!C)
   21286     return SDValue();
   21287   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   21288   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   21289   SDValue Segment = DAG.getRegister(0, MVT::i32);
   21290   MVT MaskVT = MVT::getVectorVT(MVT::i1,
   21291                              Index.getSimpleValueType().getVectorNumElements());
   21292 
   21293   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   21294   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   21295   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
   21296   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   21297   return SDValue(Res, 1);
   21298 }
   21299 
   21300 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   21301                                SDValue Mask, SDValue Base, SDValue Index,
   21302                                SDValue ScaleOp, SDValue Chain,
   21303                                const X86Subtarget &Subtarget) {
   21304   SDLoc dl(Op);
   21305   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
   21306   // Scale must be constant.
   21307   if (!C)
   21308     return SDValue();
   21309   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   21310   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   21311   SDValue Segment = DAG.getRegister(0, MVT::i32);
   21312   MVT MaskVT =
   21313     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
   21314   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   21315   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
   21316   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
   21317   return SDValue(Res, 0);
   21318 }
   21319 
   21320 /// Handles the lowering of builtin intrinsics that return the value
   21321 /// of the extended control register.
   21322 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
   21323                                        SelectionDAG &DAG,
   21324                                        const X86Subtarget &Subtarget,
   21325                                        SmallVectorImpl<SDValue> &Results) {
   21326   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   21327   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   21328   SDValue LO, HI;
   21329 
   21330   // The ECX register is used to select the index of the XCR register to
   21331   // return.
   21332   SDValue Chain =
   21333       DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
   21334   SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
   21335   Chain = SDValue(N1, 0);
   21336 
   21337   // Reads the content of XCR and returns it in registers EDX:EAX.
   21338   if (Subtarget.is64Bit()) {
   21339     LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
   21340     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   21341                             LO.getValue(2));
   21342   } else {
   21343     LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
   21344     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   21345                             LO.getValue(2));
   21346   }
   21347   Chain = HI.getValue(1);
   21348 
   21349   if (Subtarget.is64Bit()) {
   21350     // Merge the two 32-bit values into a 64-bit one.
   21351     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   21352                               DAG.getConstant(32, DL, MVT::i8));
   21353     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   21354     Results.push_back(Chain);
   21355     return;
   21356   }
   21357 
   21358   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   21359   SDValue Ops[] = { LO, HI };
   21360   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   21361   Results.push_back(Pair);
   21362   Results.push_back(Chain);
   21363 }
   21364 
   21365 /// Handles the lowering of builtin intrinsics that read performance monitor
   21366 /// counters (x86_rdpmc).
   21367 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
   21368                                       SelectionDAG &DAG,
   21369                                       const X86Subtarget &Subtarget,
   21370                                       SmallVectorImpl<SDValue> &Results) {
   21371   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   21372   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   21373   SDValue LO, HI;
   21374 
   21375   // The ECX register is used to select the index of the performance counter
   21376   // to read.
   21377   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
   21378                                    N->getOperand(2));
   21379   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
   21380 
   21381   // Reads the content of a 64-bit performance counter and returns it in the
   21382   // registers EDX:EAX.
   21383   if (Subtarget.is64Bit()) {
   21384     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   21385     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   21386                             LO.getValue(2));
   21387   } else {
   21388     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   21389     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   21390                             LO.getValue(2));
   21391   }
   21392   Chain = HI.getValue(1);
   21393 
   21394   if (Subtarget.is64Bit()) {
   21395     // The EAX register is loaded with the low-order 32 bits. The EDX register
   21396     // is loaded with the supported high-order bits of the counter.
   21397     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   21398                               DAG.getConstant(32, DL, MVT::i8));
   21399     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   21400     Results.push_back(Chain);
   21401     return;
   21402   }
   21403 
   21404   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   21405   SDValue Ops[] = { LO, HI };
   21406   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   21407   Results.push_back(Pair);
   21408   Results.push_back(Chain);
   21409 }
   21410 
   21411 /// Handles the lowering of builtin intrinsics that read the time stamp counter
   21412 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
   21413 /// READCYCLECOUNTER nodes.
   21414 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
   21415                                     SelectionDAG &DAG,
   21416                                     const X86Subtarget &Subtarget,
   21417                                     SmallVectorImpl<SDValue> &Results) {
   21418   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   21419   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
   21420   SDValue LO, HI;
   21421 
   21422   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
   21423   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
   21424   // and the EAX register is loaded with the low-order 32 bits.
   21425   if (Subtarget.is64Bit()) {
   21426     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
   21427     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
   21428                             LO.getValue(2));
   21429   } else {
   21430     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
   21431     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
   21432                             LO.getValue(2));
   21433   }
   21434   SDValue Chain = HI.getValue(1);
   21435 
   21436   if (Opcode == X86ISD::RDTSCP_DAG) {
   21437     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
   21438 
   21439     // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
   21440     // the ECX register. Add 'ecx' explicitly to the chain.
   21441     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
   21442                                      HI.getValue(2));
   21443     // Explicitly store the content of ECX at the location passed as input
   21444     // to the 'rdtscp' intrinsic.
   21445     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
   21446                          MachinePointerInfo());
   21447   }
   21448 
   21449   if (Subtarget.is64Bit()) {
   21450     // The EDX register is loaded with the high-order 32 bits of the MSR, and
   21451     // the EAX register is loaded with the low-order 32 bits.
   21452     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
   21453                               DAG.getConstant(32, DL, MVT::i8));
   21454     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
   21455     Results.push_back(Chain);
   21456     return;
   21457   }
   21458 
   21459   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
   21460   SDValue Ops[] = { LO, HI };
   21461   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   21462   Results.push_back(Pair);
   21463   Results.push_back(Chain);
   21464 }
   21465 
   21466 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
   21467                                      SelectionDAG &DAG) {
   21468   SmallVector<SDValue, 2> Results;
   21469   SDLoc DL(Op);
   21470   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
   21471                           Results);
   21472   return DAG.getMergeValues(Results, DL);
   21473 }
   21474 
   21475 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
   21476   MachineFunction &MF = DAG.getMachineFunction();
   21477   SDValue Chain = Op.getOperand(0);
   21478   SDValue RegNode = Op.getOperand(2);
   21479   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
   21480   if (!EHInfo)
   21481     report_fatal_error("EH registrations only live in functions using WinEH");
   21482 
   21483   // Cast the operand to an alloca, and remember the frame index.
   21484   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
   21485   if (!FINode)
   21486     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
   21487   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
   21488 
   21489   // Return the chain operand without making any DAG nodes.
   21490   return Chain;
   21491 }
   21492 
   21493 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
   21494   MachineFunction &MF = DAG.getMachineFunction();
   21495   SDValue Chain = Op.getOperand(0);
   21496   SDValue EHGuard = Op.getOperand(2);
   21497   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
   21498   if (!EHInfo)
   21499     report_fatal_error("EHGuard only live in functions using WinEH");
   21500 
   21501   // Cast the operand to an alloca, and remember the frame index.
   21502   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
   21503   if (!FINode)
   21504     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
   21505   EHInfo->EHGuardFrameIndex = FINode->getIndex();
   21506 
   21507   // Return the chain operand without making any DAG nodes.
   21508   return Chain;
   21509 }
   21510 
   21511 /// Emit Truncating Store with signed or unsigned saturation.
   21512 static SDValue
   21513 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
   21514                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
   21515                 SelectionDAG &DAG) {
   21516 
   21517   SDVTList VTs = DAG.getVTList(MVT::Other);
   21518   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
   21519   SDValue Ops[] = { Chain, Val, Ptr, Undef };
   21520   return SignedSat ?
   21521     DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
   21522     DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
   21523 }
   21524 
   21525 /// Emit Masked Truncating Store with signed or unsigned saturation.
   21526 static SDValue
   21527 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
   21528                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
   21529                       MachineMemOperand *MMO, SelectionDAG &DAG) {
   21530 
   21531   SDVTList VTs = DAG.getVTList(MVT::Other);
   21532   SDValue Ops[] = { Chain, Ptr, Mask, Val };
   21533   return SignedSat ?
   21534     DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
   21535     DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
   21536 }
   21537 
   21538 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   21539                                       SelectionDAG &DAG) {
   21540   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   21541 
   21542   const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
   21543   if (!IntrData) {
   21544     switch (IntNo) {
   21545     case llvm::Intrinsic::x86_seh_ehregnode:
   21546       return MarkEHRegistrationNode(Op, DAG);
   21547     case llvm::Intrinsic::x86_seh_ehguard:
   21548       return MarkEHGuard(Op, DAG);
   21549     case llvm::Intrinsic::x86_flags_read_u32:
   21550     case llvm::Intrinsic::x86_flags_read_u64:
   21551     case llvm::Intrinsic::x86_flags_write_u32:
   21552     case llvm::Intrinsic::x86_flags_write_u64: {
   21553       // We need a frame pointer because this will get lowered to a PUSH/POP
   21554       // sequence.
   21555       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   21556       MFI.setHasCopyImplyingStackAdjustment(true);
   21557       // Don't do anything here; we will expand these intrinsics out later
   21558       // during ExpandISelPseudos in EmitInstrWithCustomInserter.
   21559       return SDValue();
   21560     }
   21561     case Intrinsic::x86_lwpins32:
   21562     case Intrinsic::x86_lwpins64:
   21563     case Intrinsic::x86_umwait:
   21564     case Intrinsic::x86_tpause: {
   21565       SDLoc dl(Op);
   21566       SDValue Chain = Op->getOperand(0);
   21567       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
   21568       unsigned Opcode;
   21569 
   21570       switch (IntNo) {
   21571       default: llvm_unreachable("Impossible intrinsic");
   21572       case Intrinsic::x86_umwait:
   21573         Opcode = X86ISD::UMWAIT;
   21574         break;
   21575       case Intrinsic::x86_tpause:
   21576         Opcode = X86ISD::TPAUSE;
   21577         break;
   21578       case Intrinsic::x86_lwpins32:
   21579       case Intrinsic::x86_lwpins64:
   21580         Opcode = X86ISD::LWPINS;
   21581         break;
   21582       }
   21583 
   21584       SDValue Operation =
   21585           DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
   21586                       Op->getOperand(3), Op->getOperand(4));
   21587       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
   21588       SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
   21589       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
   21590                          Operation.getValue(1));
   21591     }
   21592     }
   21593     return SDValue();
   21594   }
   21595 
   21596   SDLoc dl(Op);
   21597   switch(IntrData->Type) {
   21598   default: llvm_unreachable("Unknown Intrinsic Type");
   21599   case RDSEED:
   21600   case RDRAND: {
   21601     // Emit the node with the right value type.
   21602     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
   21603     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
   21604 
   21605     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
   21606     // Otherwise return the value from Rand, which is always 0, cast to i32.
   21607     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
   21608                       DAG.getConstant(1, dl, Op->getValueType(1)),
   21609                       DAG.getConstant(X86::COND_B, dl, MVT::i8),
   21610                       SDValue(Result.getNode(), 1) };
   21611     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
   21612 
   21613     // Return { result, isValid, chain }.
   21614     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
   21615                        SDValue(Result.getNode(), 2));
   21616   }
   21617   case GATHER_AVX2: {
   21618     SDValue Chain = Op.getOperand(0);
   21619     SDValue Src   = Op.getOperand(2);
   21620     SDValue Base  = Op.getOperand(3);
   21621     SDValue Index = Op.getOperand(4);
   21622     SDValue Mask  = Op.getOperand(5);
   21623     SDValue Scale = Op.getOperand(6);
   21624     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
   21625                              Scale, Chain, Subtarget);
   21626   }
   21627   case GATHER: {
   21628     // gather(v1, mask, index, base, scale);
   21629     SDValue Chain = Op.getOperand(0);
   21630     SDValue Src   = Op.getOperand(2);
   21631     SDValue Base  = Op.getOperand(3);
   21632     SDValue Index = Op.getOperand(4);
   21633     SDValue Mask  = Op.getOperand(5);
   21634     SDValue Scale = Op.getOperand(6);
   21635     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
   21636                          Chain, Subtarget);
   21637   }
   21638   case SCATTER: {
   21639     // scatter(base, mask, index, v1, scale);
   21640     SDValue Chain = Op.getOperand(0);
   21641     SDValue Base  = Op.getOperand(2);
   21642     SDValue Mask  = Op.getOperand(3);
   21643     SDValue Index = Op.getOperand(4);
   21644     SDValue Src   = Op.getOperand(5);
   21645     SDValue Scale = Op.getOperand(6);
   21646     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
   21647                           Scale, Chain, Subtarget);
   21648   }
   21649   case PREFETCH: {
   21650     SDValue Hint = Op.getOperand(6);
   21651     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
   21652     assert((HintVal == 2 || HintVal == 3) &&
   21653            "Wrong prefetch hint in intrinsic: should be 2 or 3");
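             // HintVal 3 (_MM_HINT_T0) selects Opc0 and HintVal 2 (_MM_HINT_T1) selects
             // Opc1; these presumably map to the T0/T1 prefetch variants recorded in
             // the intrinsics table (assumption; the table is not shown here).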
   21654     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
   21655     SDValue Chain = Op.getOperand(0);
   21656     SDValue Mask  = Op.getOperand(2);
   21657     SDValue Index = Op.getOperand(3);
   21658     SDValue Base  = Op.getOperand(4);
   21659     SDValue Scale = Op.getOperand(5);
   21660     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
   21661                            Subtarget);
   21662   }
   21663   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
   21664   case RDTSC: {
   21665     SmallVector<SDValue, 2> Results;
   21666     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
   21667                             Results);
   21668     return DAG.getMergeValues(Results, dl);
   21669   }
   21670   // Read Performance Monitoring Counters.
   21671   case RDPMC: {
   21672     SmallVector<SDValue, 2> Results;
   21673     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
   21674     return DAG.getMergeValues(Results, dl);
   21675   }
   21676   // Get Extended Control Register.
   21677   case XGETBV: {
   21678     SmallVector<SDValue, 2> Results;
   21679     getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
   21680     return DAG.getMergeValues(Results, dl);
   21681   }
   21682   // XTEST intrinsics.
   21683   case XTEST: {
   21684     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
   21685     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
   21686 
   21687     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
   21688     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
   21689     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
   21690                        Ret, SDValue(InTrans.getNode(), 1));
   21691   }
   21692   // ADC/ADCX/SBB
   21693   case ADX: {
   21694     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
   21695     SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
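             // Adding -1 (0xff) to the i8 carry-in sets CF exactly when the carry-in is
             // nonzero; that CF then feeds the ADC/SBB node below as its carry input.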
   21696     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
   21697                                 DAG.getConstant(-1, dl, MVT::i8));
   21698     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
   21699                               Op.getOperand(4), GenCF.getValue(1));
   21700     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
   21701                                  Op.getOperand(5), MachinePointerInfo());
   21702     SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
   21703     SDValue Results[] = { SetCC, Store };
   21704     return DAG.getMergeValues(Results, dl);
   21705   }
   21706   case TRUNCATE_TO_MEM_VI8:
   21707   case TRUNCATE_TO_MEM_VI16:
   21708   case TRUNCATE_TO_MEM_VI32: {
   21709     SDValue Mask = Op.getOperand(4);
   21710     SDValue DataToTruncate = Op.getOperand(3);
   21711     SDValue Addr = Op.getOperand(2);
   21712     SDValue Chain = Op.getOperand(0);
   21713 
   21714     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
   21715     assert(MemIntr && "Expected MemIntrinsicSDNode!");
   21716 
   21717     EVT MemVT  = MemIntr->getMemoryVT();
   21718 
   21719     uint16_t TruncationOp = IntrData->Opc0;
   21720     switch (TruncationOp) {
   21721     case X86ISD::VTRUNC: {
   21722       if (isAllOnesConstant(Mask)) // return just a truncate store
   21723         return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
   21724                                  MemIntr->getMemOperand());
   21725 
   21726       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
   21727       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   21728 
   21729       return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
   21730                                 MemIntr->getMemOperand(), true /* truncating */);
   21731     }
   21732     case X86ISD::VTRUNCUS:
   21733     case X86ISD::VTRUNCS: {
   21734       bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
   21735       if (isAllOnesConstant(Mask))
   21736         return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
   21737                                MemIntr->getMemOperand(), DAG);
   21738 
   21739       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
   21740       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   21741 
   21742       return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
   21743                                    VMask, MemVT, MemIntr->getMemOperand(), DAG);
   21744     }
   21745     default:
   21746       llvm_unreachable("Unsupported truncstore intrinsic");
   21747     }
   21748   }
   21749   }
   21750 }
   21751 
   21752 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
   21753                                            SelectionDAG &DAG) const {
   21754   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   21755   MFI.setReturnAddressIsTaken(true);
   21756 
   21757   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
   21758     return SDValue();
   21759 
   21760   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   21761   SDLoc dl(Op);
   21762   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   21763 
   21764   if (Depth > 0) {
   21765     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
   21766     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   21767     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
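             // LowerFRAMEADDR walks up Depth frames; with the standard frame-pointer
             // chain the return address of that frame sits one slot above its saved
             // frame pointer, hence the SlotSize offset.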
   21768     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
   21769                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
   21770                        MachinePointerInfo());
   21771   }
   21772 
   21773   // Just load the return address.
   21774   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
   21775   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
   21776                      MachinePointerInfo());
   21777 }
   21778 
   21779 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
   21780                                                  SelectionDAG &DAG) const {
   21781   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
   21782   return getReturnAddressFrameIndex(DAG);
   21783 }
   21784 
   21785 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   21786   MachineFunction &MF = DAG.getMachineFunction();
   21787   MachineFrameInfo &MFI = MF.getFrameInfo();
   21788   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   21789   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   21790   EVT VT = Op.getValueType();
   21791 
   21792   MFI.setFrameAddressIsTaken(true);
   21793 
   21794   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
   21795     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
   21796     // is not possible to crawl up the stack without looking at the unwind codes
   21797     // simultaneously.
   21798     int FrameAddrIndex = FuncInfo->getFAIndex();
   21799     if (!FrameAddrIndex) {
   21800       // Set up a frame object for the return address.
   21801       unsigned SlotSize = RegInfo->getSlotSize();
   21802       FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
   21803           SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
   21804       FuncInfo->setFAIndex(FrameAddrIndex);
   21805     }
   21806     return DAG.getFrameIndex(FrameAddrIndex, VT);
   21807   }
   21808 
   21809   unsigned FrameReg =
   21810       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
   21811   SDLoc dl(Op);  // FIXME probably not meaningful
   21812   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   21813   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
   21814           (FrameReg == X86::EBP && VT == MVT::i32)) &&
   21815          "Invalid Frame Register!");
   21816   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
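           // The caller's frame pointer is saved at offset 0 of each frame, so loading
           // through FrameAddr Depth times walks up Depth stack frames.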
   21817   while (Depth--)
   21818     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
   21819                             MachinePointerInfo());
   21820   return FrameAddr;
   21821 }
   21822 
   21823 // FIXME? Maybe this could be a TableGen attribute on some registers and
   21824 // this table could be generated automatically from RegInfo.
   21825 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
   21826                                               SelectionDAG &DAG) const {
   21827   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   21828   const MachineFunction &MF = DAG.getMachineFunction();
   21829 
   21830   unsigned Reg = StringSwitch<unsigned>(RegName)
   21831                        .Case("esp", X86::ESP)
   21832                        .Case("rsp", X86::RSP)
   21833                        .Case("ebp", X86::EBP)
   21834                        .Case("rbp", X86::RBP)
   21835                        .Default(0);
   21836 
   21837   if (Reg == X86::EBP || Reg == X86::RBP) {
   21838     if (!TFI.hasFP(MF))
   21839       report_fatal_error("register " + StringRef(RegName) +
   21840                          " is allocatable: function has no frame pointer");
   21841 #ifndef NDEBUG
   21842     else {
   21843       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   21844       unsigned FrameReg =
   21845           RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
   21846       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
   21847              "Invalid Frame Register!");
   21848     }
   21849 #endif
   21850   }
   21851 
   21852   if (Reg)
   21853     return Reg;
   21854 
   21855   report_fatal_error("Invalid register name global variable");
   21856 }
   21857 
   21858 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
   21859                                                      SelectionDAG &DAG) const {
   21860   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   21861   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
   21862 }
   21863 
   21864 unsigned X86TargetLowering::getExceptionPointerRegister(
   21865     const Constant *PersonalityFn) const {
   21866   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
   21867     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
   21868 
   21869   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
   21870 }
   21871 
   21872 unsigned X86TargetLowering::getExceptionSelectorRegister(
   21873     const Constant *PersonalityFn) const {
   21874   // Funclet personalities don't use selectors (the runtime does the selection).
   21875   assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
   21876   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
   21877 }
   21878 
   21879 bool X86TargetLowering::needsFixedCatchObjects() const {
   21880   return Subtarget.isTargetWin64();
   21881 }
   21882 
   21883 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   21884   SDValue Chain     = Op.getOperand(0);
   21885   SDValue Offset    = Op.getOperand(1);
   21886   SDValue Handler   = Op.getOperand(2);
   21887   SDLoc dl      (Op);
   21888 
   21889   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   21890   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   21891   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   21892   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
   21893           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
   21894          "Invalid Frame Register!");
   21895   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
   21896   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
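           // Store the handler into this frame's return-address slot (Frame + SlotSize),
           // adjusted by Offset, and pass that slot's address to EH_RETURN in RCX/ECX.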
   21897 
   21898   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
   21899                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
   21900                                                        dl));
   21901   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
   21902   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
   21903   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
   21904 
   21905   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
   21906                      DAG.getRegister(StoreAddrReg, PtrVT));
   21907 }
   21908 
   21909 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
   21910                                                SelectionDAG &DAG) const {
   21911   SDLoc DL(Op);
   21912   // If the subtarget is not 64-bit, we may need the global base reg
   21913   // after isel expand pseudo, i.e., after the CGBR pass has run.
   21914   // Therefore, ask for the GlobalBaseReg now, so that the pass
   21915   // inserts the code for us in case we need it.
   21916   // Otherwise, we would end up referencing a virtual register
   21917   // that is not defined!

   21918   if (!Subtarget.is64Bit()) {
   21919     const X86InstrInfo *TII = Subtarget.getInstrInfo();
   21920     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
   21921   }
   21922   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
   21923                      DAG.getVTList(MVT::i32, MVT::Other),
   21924                      Op.getOperand(0), Op.getOperand(1));
   21925 }
   21926 
   21927 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
   21928                                                 SelectionDAG &DAG) const {
   21929   SDLoc DL(Op);
   21930   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
   21931                      Op.getOperand(0), Op.getOperand(1));
   21932 }
   21933 
   21934 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
   21935                                                        SelectionDAG &DAG) const {
   21936   SDLoc DL(Op);
   21937   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
   21938                      Op.getOperand(0));
   21939 }
   21940 
   21941 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
   21942   return Op.getOperand(0);
   21943 }
   21944 
   21945 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   21946                                                 SelectionDAG &DAG) const {
   21947   SDValue Root = Op.getOperand(0);
   21948   SDValue Trmp = Op.getOperand(1); // trampoline
   21949   SDValue FPtr = Op.getOperand(2); // nested function
   21950   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
   21951   SDLoc dl (Op);
   21952 
   21953   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   21954   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   21955 
   21956   if (Subtarget.is64Bit()) {
   21957     SDValue OutChains[6];
   21958 
   21959     // Large code-model.
   21960     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
   21961     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
   21962 
   21963     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
   21964     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
   21965 
   21966     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
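             // Byte layout of the 64-bit trampoline emitted below (offsets in bytes):
             //    0: 49 BB <imm64 FPtr>    movabsq $FPtr, %r11
             //   10: 49 BA <imm64 Nest>    movabsq $Nest, %r10
             //   20: 49 FF E3              jmpq    *%r11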
   21967 
   21968     // Load the pointer to the nested function into R11.
   21969     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
   21970     SDValue Addr = Trmp;
   21971     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
   21972                                 Addr, MachinePointerInfo(TrmpAddr));
   21973 
   21974     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   21975                        DAG.getConstant(2, dl, MVT::i64));
   21976     OutChains[1] =
   21977         DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
   21978                      /* Alignment = */ 2);
   21979 
   21980     // Load the 'nest' parameter value into R10.
   21981     // R10 is specified in X86CallingConv.td
   21982     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
   21983     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   21984                        DAG.getConstant(10, dl, MVT::i64));
   21985     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
   21986                                 Addr, MachinePointerInfo(TrmpAddr, 10));
   21987 
   21988     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   21989                        DAG.getConstant(12, dl, MVT::i64));
   21990     OutChains[3] =
   21991         DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
   21992                      /* Alignment = */ 2);
   21993 
   21994     // Jump to the nested function.
   21995     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
   21996     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   21997                        DAG.getConstant(20, dl, MVT::i64));
   21998     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
   21999                                 Addr, MachinePointerInfo(TrmpAddr, 20));
   22000 
   22001     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
   22002     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
   22003                        DAG.getConstant(22, dl, MVT::i64));
   22004     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
   22005                                 Addr, MachinePointerInfo(TrmpAddr, 22));
   22006 
   22007     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   22008   } else {
   22009     const Function *Func =
   22010       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
   22011     CallingConv::ID CC = Func->getCallingConv();
   22012     unsigned NestReg;
   22013 
   22014     switch (CC) {
   22015     default:
   22016       llvm_unreachable("Unsupported calling convention");
   22017     case CallingConv::C:
   22018     case CallingConv::X86_StdCall: {
   22019       // Pass 'nest' parameter in ECX.
   22020       // Must be kept in sync with X86CallingConv.td
   22021       NestReg = X86::ECX;
   22022 
   22023       // Check that ECX wasn't needed by an 'inreg' parameter.
   22024       FunctionType *FTy = Func->getFunctionType();
   22025       const AttributeList &Attrs = Func->getAttributes();
   22026 
   22027       if (!Attrs.isEmpty() && !Func->isVarArg()) {
   22028         unsigned InRegCount = 0;
   22029         unsigned Idx = 1;
   22030 
   22031         for (FunctionType::param_iterator I = FTy->param_begin(),
   22032              E = FTy->param_end(); I != E; ++I, ++Idx)
   22033           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
   22034             auto &DL = DAG.getDataLayout();
   22035             // FIXME: should only count parameters that are lowered to integers.
   22036             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
   22037           }
   22038 
   22039         if (InRegCount > 2) {
   22040           report_fatal_error("Nest register in use - reduce number of inreg"
   22041                              " parameters!");
   22042         }
   22043       }
   22044       break;
   22045     }
   22046     case CallingConv::X86_FastCall:
   22047     case CallingConv::X86_ThisCall:
   22048     case CallingConv::Fast:
   22049       // Pass 'nest' parameter in EAX.
   22050       // Must be kept in sync with X86CallingConv.td
   22051       NestReg = X86::EAX;
   22052       break;
   22053     }
   22054 
   22055     SDValue OutChains[4];
   22056     SDValue Addr, Disp;
   22057 
   22058     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   22059                        DAG.getConstant(10, dl, MVT::i32));
   22060     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
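             // Byte layout of the 32-bit trampoline emitted below (offsets in bytes):
             //   0: B8+reg <imm32 Nest>    movl $Nest, %ecx (or %eax)
             //   5: E9 <imm32 Disp>        jmp  FPtr
             // Disp is relative to Trmp+10, the address just past the jmp instruction.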
   22061 
   22062     // This is storing the opcode for MOV32ri.
   22063     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
   22064     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
   22065     OutChains[0] =
   22066         DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
   22067                      Trmp, MachinePointerInfo(TrmpAddr));
   22068 
   22069     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   22070                        DAG.getConstant(1, dl, MVT::i32));
   22071     OutChains[1] =
   22072         DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
   22073                      /* Alignment = */ 1);
   22074 
   22075     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
   22076     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   22077                        DAG.getConstant(5, dl, MVT::i32));
   22078     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
   22079                                 Addr, MachinePointerInfo(TrmpAddr, 5),
   22080                                 /* Alignment = */ 1);
   22081 
   22082     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
   22083                        DAG.getConstant(6, dl, MVT::i32));
   22084     OutChains[3] =
   22085         DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
   22086                      /* Alignment = */ 1);
   22087 
   22088     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   22089   }
   22090 }
   22091 
   22092 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   22093                                             SelectionDAG &DAG) const {
   22094   /*
   22095    The rounding mode is in bits 11:10 of FPSR, and has the following
   22096    settings:
   22097      00 Round to nearest
   22098      01 Round to -inf
   22099      10 Round to +inf
   22100      11 Round to 0
   22101 
   22102   FLT_ROUNDS, on the other hand, expects the following:
   22103     -1 Undefined
   22104      0 Round to 0
   22105      1 Round to nearest
   22106      2 Round to +inf
   22107      3 Round to -inf
   22108 
   22109   To perform the conversion, we do:
   22110     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
   22111   */
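           // For example, FPSR bits 11:10 = 0b10 (round to +inf) gives
           // ((FPSR & 0x800) >> 11) = 1 and ((FPSR & 0x400) >> 9) = 0, so
           // ((1 | 0) + 1) & 3 = 2, matching the FLT_ROUNDS encoding above.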
   22112 
   22113   MachineFunction &MF = DAG.getMachineFunction();
   22114   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
   22115   unsigned StackAlignment = TFI.getStackAlignment();
   22116   MVT VT = Op.getSimpleValueType();
   22117   SDLoc DL(Op);
   22118 
   22119   // Save FP Control Word to stack slot
   22120   int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
   22121   SDValue StackSlot =
   22122       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
   22123 
   22124   MachineMemOperand *MMO =
   22125       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
   22126                               MachineMemOperand::MOStore, 2, 2);
   22127 
   22128   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
   22129   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
   22130                                           DAG.getVTList(MVT::Other),
   22131                                           Ops, MVT::i16, MMO);
   22132 
   22133   // Load FP Control Word from stack slot
   22134   SDValue CWD =
   22135       DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
   22136 
   22137   // Transform as necessary
   22138   SDValue CWD1 =
   22139     DAG.getNode(ISD::SRL, DL, MVT::i16,
   22140                 DAG.getNode(ISD::AND, DL, MVT::i16,
   22141                             CWD, DAG.getConstant(0x800, DL, MVT::i16)),
   22142                 DAG.getConstant(11, DL, MVT::i8));
   22143   SDValue CWD2 =
   22144     DAG.getNode(ISD::SRL, DL, MVT::i16,
   22145                 DAG.getNode(ISD::AND, DL, MVT::i16,
   22146                             CWD, DAG.getConstant(0x400, DL, MVT::i16)),
   22147                 DAG.getConstant(9, DL, MVT::i8));
   22148 
   22149   SDValue RetVal =
   22150     DAG.getNode(ISD::AND, DL, MVT::i16,
   22151                 DAG.getNode(ISD::ADD, DL, MVT::i16,
   22152                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
   22153                             DAG.getConstant(1, DL, MVT::i16)),
   22154                 DAG.getConstant(3, DL, MVT::i16));
   22155 
   22156   return DAG.getNode((VT.getSizeInBits() < 16 ?
   22157                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
   22158 }
   22159 
   22160 // Split a unary integer op into 2 half-sized ops.
   22161 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
   22162   MVT VT = Op.getSimpleValueType();
   22163   unsigned NumElems = VT.getVectorNumElements();
   22164   unsigned SizeInBits = VT.getSizeInBits();
   22165   MVT EltVT = VT.getVectorElementType();
   22166   SDValue Src = Op.getOperand(0);
   22167   assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
   22168          "Src and Op should have the same element type!");
   22169 
   22170   // Extract the Lo/Hi vectors
   22171   SDLoc dl(Op);
   22172   SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
   22173   SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
   22174 
   22175   MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
   22176   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   22177                      DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
   22178                      DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
   22179 }
   22180 
   22181 // Decompose 256-bit ops into smaller 128-bit ops.
   22182 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
   22183   assert(Op.getSimpleValueType().is256BitVector() &&
   22184          Op.getSimpleValueType().isInteger() &&
   22185          "Only handle AVX 256-bit vector integer operation");
   22186   return LowerVectorIntUnary(Op, DAG);
   22187 }
   22188 
   22189 // Decompose 512-bit ops into smaller 256-bit ops.
   22190 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
   22191   assert(Op.getSimpleValueType().is512BitVector() &&
   22192          Op.getSimpleValueType().isInteger() &&
   22193          "Only handle AVX 512-bit vector integer operation");
   22194   return LowerVectorIntUnary(Op, DAG);
   22195 }
   22196 
   22197 /// Lower a vector CTLZ using native supported vector CTLZ instruction.
   22198 //
   22199 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
   22200 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
   22201 // split the vector, perform the operation on its Lo and Hi parts and
   22202 // concatenate the results.
   22203 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
   22204                                          const X86Subtarget &Subtarget) {
   22205   assert(Op.getOpcode() == ISD::CTLZ);
   22206   SDLoc dl(Op);
   22207   MVT VT = Op.getSimpleValueType();
   22208   MVT EltVT = VT.getVectorElementType();
   22209   unsigned NumElems = VT.getVectorNumElements();
   22210 
   22211   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
   22212           "Unsupported element type");
   22213 
   22214   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
   22215   if (NumElems > 16 ||
   22216       (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
   22217     return LowerVectorIntUnary(Op, DAG);
   22218 
   22219   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
   22220   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
   22221           "Unsupported value type for operation");
   22222 
   22223   // Use native supported vector instruction vplzcntd.
   22224   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
   22225   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
   22226   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
   22227   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
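           // e.g. for i8 elements ctlz32(zext32(x)) = ctlz8(x) + 24, so subtracting
           // Delta = 32 - 8 = 24 recovers the i8 leading-zero count.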
   22228 
   22229   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
   22230 }
   22231 
   22232 // Lower CTLZ using a PSHUFB lookup table implementation.
   22233 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
   22234                                        const X86Subtarget &Subtarget,
   22235                                        SelectionDAG &DAG) {
   22236   MVT VT = Op.getSimpleValueType();
   22237   int NumElts = VT.getVectorNumElements();
   22238   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
   22239   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
   22240 
   22241   // Per-nibble leading zero PSHUFB lookup table.
   22242   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
   22243                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
   22244                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
   22245                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
   22246 
   22247   SmallVector<SDValue, 64> LUTVec;
   22248   for (int i = 0; i < NumBytes; ++i)
   22249     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
   22250   SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
   22251 
   22252   // Begin by bitcasting the input to a byte vector, then split those bytes
   22253   // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
   22254   // If the hi input nibble is zero then we add both results together, otherwise
   22255   // we just take the hi result (by masking the lo result to zero before the
   22256   // add).
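           // For example, for the byte 0x1A the hi nibble is 0x1 and the lo nibble is
           // 0xA: LUT[0x1] = 3, and since the hi nibble is nonzero the lo result is
           // masked to zero, giving ctlz8(0x1A) = 3.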
   22257   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
   22258   SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
   22259 
   22260   SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
   22261   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
   22262   SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
   22263   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
   22264   SDValue HiZ;
   22265   if (CurrVT.is512BitVector()) {
   22266     MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
   22267     HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
   22268     HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
   22269   } else {
   22270     HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
   22271   }
   22272 
   22273   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
   22274   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
   22275   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
   22276   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
   22277 
   22278   // Merge result back from vXi8 back to VT, working on the lo/hi halves
   22279   // of the current vector width in the same way we did for the nibbles.
   22280   // If the upper half of the input element is zero then add the halves'
   22281   // leading zero counts together, otherwise just use the upper half's.
   22282   // Double the width of the result until we are at target width.
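           // e.g. when merging two byte counts into an i16 count: if the upper byte is
           // zero the result is 8 + ctlz8(lo byte), otherwise it is just ctlz8(hi byte).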
   22283   while (CurrVT != VT) {
   22284     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
   22285     int CurrNumElts = CurrVT.getVectorNumElements();
   22286     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
   22287     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
   22288     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
   22289 
   22290     // Check if the upper half of the input element is zero.
   22291     if (CurrVT.is512BitVector()) {
   22292       MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
   22293       HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
   22294                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
   22295       HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
   22296     } else {
   22297       HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
   22298                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
   22299     }
   22300     HiZ = DAG.getBitcast(NextVT, HiZ);
   22301 
   22302     // Move the upper/lower halves to the lower bits as we'll be extending to
   22303     // NextVT. Mask the lower result to zero if HiZ is true and add the results
   22304     // together.
   22305     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
   22306     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
   22307     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
   22308     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
   22309     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
   22310     CurrVT = NextVT;
   22311   }
   22312 
   22313   return Res;
   22314 }
   22315 
   22316 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
   22317                                const X86Subtarget &Subtarget,
   22318                                SelectionDAG &DAG) {
   22319   MVT VT = Op.getSimpleValueType();
   22320 
   22321   if (Subtarget.hasCDI() &&
   22322       // vXi8 vectors need to be promoted to 512-bits for vXi32.
   22323       (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
   22324     return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
   22325 
   22326   // Decompose 256-bit ops into smaller 128-bit ops.
   22327   if (VT.is256BitVector() && !Subtarget.hasInt256())
   22328     return Lower256IntUnary(Op, DAG);
   22329 
   22330   // Decompose 512-bit ops into smaller 256-bit ops.
   22331   if (VT.is512BitVector() && !Subtarget.hasBWI())
   22332     return Lower512IntUnary(Op, DAG);
   22333 
   22334   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
   22335   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
   22336 }
   22337 
   22338 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
   22339                          SelectionDAG &DAG) {
   22340   MVT VT = Op.getSimpleValueType();
   22341   MVT OpVT = VT;
   22342   unsigned NumBits = VT.getSizeInBits();
   22343   SDLoc dl(Op);
   22344   unsigned Opc = Op.getOpcode();
   22345 
   22346   if (VT.isVector())
   22347     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
   22348 
   22349   Op = Op.getOperand(0);
   22350   if (VT == MVT::i8) {
   22351     // Zero extend to i32 since there is not an i8 bsr.
   22352     OpVT = MVT::i32;
   22353     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
   22354   }
   22355 
   22356   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
   22357   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
   22358   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
   22359 
   22360   if (Opc == ISD::CTLZ) {
   22361     // If src is zero (i.e. bsr sets ZF), returns NumBits.
   22362     SDValue Ops[] = {
   22363       Op,
   22364       DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
   22365       DAG.getConstant(X86::COND_E, dl, MVT::i8),
   22366       Op.getValue(1)
   22367     };
   22368     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
   22369   }
   22370 
   22371   // Finally xor with NumBits-1.
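           // BSR returns the index of the highest set bit, which is always < NumBits,
           // and NumBits is a power of two, so XOR with NumBits-1 equals
           // (NumBits-1) - index, i.e. the leading-zero count. For i32, x = 0x00010000
           // gives BSR = 16 and 31 ^ 16 = 15 = ctlz(x).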
   22372   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
   22373                    DAG.getConstant(NumBits - 1, dl, OpVT));
   22374 
   22375   if (VT == MVT::i8)
   22376     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
   22377   return Op;
   22378 }
   22379 
   22380 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
   22381   MVT VT = Op.getSimpleValueType();
   22382   unsigned NumBits = VT.getScalarSizeInBits();
   22383   SDLoc dl(Op);
   22384 
   22385   if (VT.isVector()) {
   22386     SDValue N0 = Op.getOperand(0);
   22387     SDValue Zero = DAG.getConstant(0, dl, VT);
   22388 
   22389     // lsb(x) = (x & -x)
   22390     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
   22391                               DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
   22392 
   22393     // cttz_undef(x) = (width - 1) - ctlz(lsb)
   22394     if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
   22395       SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
   22396       return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
   22397                          DAG.getNode(ISD::CTLZ, dl, VT, LSB));
   22398     }
   22399 
   22400     // cttz(x) = ctpop(lsb - 1)
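             // e.g. x = 0b01001000: lsb = 0b00001000, lsb - 1 = 0b00000111, and
             // ctpop(0b00000111) = 3 = cttz(x).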
   22401     SDValue One = DAG.getConstant(1, dl, VT);
   22402     return DAG.getNode(ISD::CTPOP, dl, VT,
   22403                        DAG.getNode(ISD::SUB, dl, VT, LSB, One));
   22404   }
   22405 
   22406   assert(Op.getOpcode() == ISD::CTTZ &&
   22407          "Only scalar CTTZ requires custom lowering");
   22408 
   22409   // Issue a bsf (scan bits forward) which also sets EFLAGS.
   22410   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   22411   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
   22412 
   22413   // If src is zero (i.e. bsf sets ZF), returns NumBits.
   22414   SDValue Ops[] = {
   22415     Op,
   22416     DAG.getConstant(NumBits, dl, VT),
   22417     DAG.getConstant(X86::COND_E, dl, MVT::i8),
   22418     Op.getValue(1)
   22419   };
   22420   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
   22421 }
   22422 
   22423 /// Break a 256-bit integer operation into two new 128-bit ones and then
   22424 /// concatenate the result back.
   22425 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
   22426   MVT VT = Op.getSimpleValueType();
   22427 
   22428   assert(VT.is256BitVector() && VT.isInteger() &&
   22429          "Unsupported value type for operation");
   22430 
   22431   unsigned NumElems = VT.getVectorNumElements();
   22432   SDLoc dl(Op);
   22433 
   22434   // Extract the LHS vectors
   22435   SDValue LHS = Op.getOperand(0);
   22436   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
   22437   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
   22438 
   22439   // Extract the RHS vectors
   22440   SDValue RHS = Op.getOperand(1);
   22441   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
   22442   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
   22443 
   22444   MVT EltVT = VT.getVectorElementType();
   22445   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   22446 
   22447   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   22448                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
   22449                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
   22450 }
   22451 
   22452 /// Break a 512-bit integer operation into two new 256-bit ones and then
   22453 /// concatenate the result back.
   22454 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
   22455   MVT VT = Op.getSimpleValueType();
   22456 
   22457   assert(VT.is512BitVector() && VT.isInteger() &&
   22458          "Unsupported value type for operation");
   22459 
   22460   unsigned NumElems = VT.getVectorNumElements();
   22461   SDLoc dl(Op);
   22462 
   22463   // Extract the LHS vectors
   22464   SDValue LHS = Op.getOperand(0);
   22465   SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
   22466   SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
   22467 
   22468   // Extract the RHS vectors
   22469   SDValue RHS = Op.getOperand(1);
   22470   SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
   22471   SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
   22472 
   22473   MVT EltVT = VT.getVectorElementType();
   22474   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
   22475 
   22476   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
   22477                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
   22478                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
   22479 }
   22480 
   22481 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
   22482   MVT VT = Op.getSimpleValueType();
   22483   if (VT.getScalarType() == MVT::i1)
   22484     return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
   22485                        Op.getOperand(0), Op.getOperand(1));
   22486   assert(Op.getSimpleValueType().is256BitVector() &&
   22487          Op.getSimpleValueType().isInteger() &&
   22488          "Only handle AVX 256-bit vector integer operation");
   22489   return Lower256IntArith(Op, DAG);
   22490 }
   22491 
   22492 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
   22493   MVT VT = Op.getSimpleValueType();
   22494   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
   22495     // Since X86 does not have CMOV for 8-bit integer, we don't convert
   22496     // 8-bit integer abs to NEG and CMOV.
   22497     SDLoc DL(Op);
   22498     SDValue N0 = Op.getOperand(0);
   22499     SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
   22500                               DAG.getConstant(0, DL, VT), N0);
   22501     SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
   22502                      SDValue(Neg.getNode(), 1)};
   22503     return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
   22504   }
   22505 
   22506   assert(Op.getSimpleValueType().is256BitVector() &&
   22507          Op.getSimpleValueType().isInteger() &&
   22508          "Only handle AVX 256-bit vector integer operation");
   22509   return Lower256IntUnary(Op, DAG);
   22510 }
   22511 
   22512 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
   22513   MVT VT = Op.getSimpleValueType();
   22514 
   22515   // For AVX1 cases, split to use legal ops (everything but v4i64).
   22516   if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
   22517     return Lower256IntArith(Op, DAG);
   22518 
   22519   SDLoc DL(Op);
   22520   unsigned Opcode = Op.getOpcode();
   22521   SDValue N0 = Op.getOperand(0);
   22522   SDValue N1 = Op.getOperand(1);
   22523 
   22524   // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
   22525   // using the SMIN/SMAX instructions and flipping the signbit back.
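           // e.g. umax(0xFFFF, 0x0001): flipping the sign bits gives 0x7FFF (32767)
           // and 0x8001 (-32767); smax picks 0x7FFF, and flipping back yields 0xFFFF.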
   22526   if (VT == MVT::v8i16) {
   22527     assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
   22528            "Unexpected MIN/MAX opcode");
   22529     SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
   22530     N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
   22531     N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
   22532     Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
   22533     SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
   22534     return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
   22535   }
   22536 
   22537   // Else, expand to a compare/select.
   22538   ISD::CondCode CC;
   22539   switch (Opcode) {
   22540   case ISD::SMIN: CC = ISD::CondCode::SETLT;  break;
   22541   case ISD::SMAX: CC = ISD::CondCode::SETGT;  break;
   22542   case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
   22543   case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
   22544   default: llvm_unreachable("Unknown MINMAX opcode");
   22545   }
   22546 
   22547   SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
   22548   return DAG.getSelect(DL, VT, Cond, N0, N1);
   22549 }
   22550 
   22551 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
   22552                         SelectionDAG &DAG) {
   22553   SDLoc dl(Op);
   22554   MVT VT = Op.getSimpleValueType();
   22555 
   22556   if (VT.getScalarType() == MVT::i1)
   22557     return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
   22558 
   22559   // Decompose 256-bit ops into smaller 128-bit ops.
   22560   if (VT.is256BitVector() && !Subtarget.hasInt256())
   22561     return Lower256IntArith(Op, DAG);
   22562 
   22563   SDValue A = Op.getOperand(0);
   22564   SDValue B = Op.getOperand(1);
   22565 
   22566   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
   22567   // vector pairs, multiply and truncate.
   22568   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
   22569     if (Subtarget.hasInt256()) {
   22570       // For 512-bit vectors, split into 256-bit vectors to allow the
   22571       // sign-extension to occur.
   22572       if (VT == MVT::v64i8)
   22573         return Lower512IntArith(Op, DAG);
   22574 
   22575       // For 256-bit vectors, split into 128-bit vectors to allow the
   22576       // sign-extension to occur. We don't need this on AVX512BW as we can
   22577       // safely sign-extend to v32i16.
   22578       if (VT == MVT::v32i8 && !Subtarget.hasBWI())
   22579         return Lower256IntArith(Op, DAG);
   22580 
   22581       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
   22582       return DAG.getNode(
   22583           ISD::TRUNCATE, dl, VT,
   22584           DAG.getNode(ISD::MUL, dl, ExVT,
   22585                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
   22586                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
   22587     }
   22588 
   22589     assert(VT == MVT::v16i8 &&
   22590            "Pre-AVX2 support only supports v16i8 multiplication");
   22591     MVT ExVT = MVT::v8i16;
   22592 
   22593     // Extract the lo parts and sign extend to i16
   22594     // We're going to mask off the low byte of each result element of the
   22595     // pmullw, so it doesn't matter what's in the high byte of each 16-bit
   22596     // element.
   22597     const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
   22598                               4, -1, 5, -1, 6, -1, 7, -1};
   22599     SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask);
   22600     SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask);
   22601     ALo = DAG.getBitcast(ExVT, ALo);
   22602     BLo = DAG.getBitcast(ExVT, BLo);
   22603 
   22604     // Extract the hi parts and sign extend to i16
   22605     // We're going to mask off the low byte of each result element of the
   22606     // pmullw, so it doesn't matter what's in the high byte of each 16-bit
   22607     // element.
   22608     const int HiShufMask[] = {8,  -1, 9,  -1, 10, -1, 11, -1,
   22609                               12, -1, 13, -1, 14, -1, 15, -1};
   22610     SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask);
   22611     SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask);
   22612     AHi = DAG.getBitcast(ExVT, AHi);
   22613     BHi = DAG.getBitcast(ExVT, BHi);
   22614 
   22615     // Multiply, mask the lower 8 bits of the lo/hi results and pack
   22616     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
   22617     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
   22618     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
   22619     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
   22620     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
   22621   }
   22622 
   22623   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
   22624   if (VT == MVT::v4i32) {
   22625     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
   22626            "Should not custom lower when pmulld is available!");
   22627 
   22628     // Extract the odd parts.
   22629     static const int UnpackMask[] = { 1, -1, 3, -1 };
   22630     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
   22631     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
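             // The shuffle turns <a|b|c|d> into <b|undef|d|undef>, so each PMULUDQ
             // input has the value of interest in the low half of a 64-bit lane.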
   22632 
   22633     // Multiply the even parts.
   22634     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
   22635                                 DAG.getBitcast(MVT::v2i64, A),
   22636                                 DAG.getBitcast(MVT::v2i64, B));
   22637     // Now multiply odd parts.
   22638     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
   22639                                DAG.getBitcast(MVT::v2i64, Aodds),
   22640                                DAG.getBitcast(MVT::v2i64, Bodds));
   22641 
   22642     Evens = DAG.getBitcast(VT, Evens);
   22643     Odds = DAG.getBitcast(VT, Odds);
   22644 
   22645     // Merge the two vectors back together with a shuffle. This expands into 2
   22646     // shuffles.
   22647     static const int ShufMask[] = { 0, 4, 2, 6 };
   22648     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
   22649   }
   22650 
   22651   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
   22652          "Only know how to lower V2I64/V4I64/V8I64 multiply");
   22653   assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
   22654 
   22655   //  Ahi = psrlqi(a, 32);
   22656   //  Bhi = psrlqi(b, 32);
   22657   //
   22658   //  AloBlo = pmuludq(a, b);
   22659   //  AloBhi = pmuludq(a, Bhi);
   22660   //  AhiBlo = pmuludq(Ahi, b);
   22661   //
   22662   //  Hi = psllqi(AloBhi + AhiBlo, 32);
   22663   //  return AloBlo + Hi;
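           //
           // This is 64-bit multiplication by 32-bit halves, reduced mod 2^64:
           //   a * b = (aLo + 2^32*aHi) * (bLo + 2^32*bHi)
           //         = aLo*bLo + 2^32*(aLo*bHi + aHi*bLo)   (the aHi*bHi term overflows away).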
   22664   KnownBits AKnown, BKnown;
   22665   DAG.computeKnownBits(A, AKnown);
   22666   DAG.computeKnownBits(B, BKnown);
   22667 
   22668   APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
   22669   bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
   22670   bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
   22671 
   22672   APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
   22673   bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
   22674   bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
   22675 
   22676   SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
   22677 
   22678   // Only multiply lo/hi halves that aren't known to be zero.
   22679   SDValue AloBlo = Zero;
   22680   if (!ALoIsZero && !BLoIsZero)
   22681     AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
   22682 
   22683   SDValue AloBhi = Zero;
   22684   if (!ALoIsZero && !BHiIsZero) {
   22685     SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
   22686     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
   22687   }
   22688 
   22689   SDValue AhiBlo = Zero;
   22690   if (!AHiIsZero && !BLoIsZero) {
   22691     SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
   22692     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
   22693   }
   22694 
   22695   SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
   22696   Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
   22697 
   22698   return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
   22699 }
   22700 
   22701 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
   22702                          SelectionDAG &DAG) {
   22703   SDLoc dl(Op);
   22704   MVT VT = Op.getSimpleValueType();
   22705 
   22706   // Decompose 256-bit ops into smaller 128-bit ops.
   22707   if (VT.is256BitVector() && !Subtarget.hasInt256())
   22708     return Lower256IntArith(Op, DAG);
   22709 
   22710   // Only i8 vectors should need custom lowering after this.
   22711   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
   22712          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
   22713          "Unsupported vector type");
   22714 
   22715   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
   22716   // logical shift down the upper half and pack back to i8.
   22717   SDValue A = Op.getOperand(0);
   22718   SDValue B = Op.getOperand(1);
   22719 
   22720   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
   22721   // and then ashr/lshr the upper bits down to the lower bits before multiply.
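           // e.g. for the lo half without SSE41: shuffling A with undef as
           // <undef,a0,undef,a1,...> puts each byte in the high half of an i16 lane,
           // and shifting right by 8 (arithmetic for MULHS, logical for MULHU) yields
           // the sign- or zero-extended value.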
   22722   unsigned Opcode = Op.getOpcode();
   22723   unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
   22724   unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
   22725 
   22726   // For 512-bit vectors, split into 256-bit vectors to allow the
   22727   // sign-extension to occur.
   22728   if (VT == MVT::v64i8)
   22729     return Lower512IntArith(Op, DAG);
   22730 
   22731   // AVX2 implementations - extend xmm subvectors to ymm.
   22732   if (Subtarget.hasInt256()) {
   22733     unsigned NumElems = VT.getVectorNumElements();
   22734     SDValue Lo = DAG.getIntPtrConstant(0, dl);
   22735     SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
   22736 
   22737     if (VT == MVT::v32i8) {
   22738       if (Subtarget.canExtendTo512BW()) {
   22739         SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
   22740         SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
   22741         SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
   22742         Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
   22743                           DAG.getConstant(8, dl, MVT::v32i16));
   22744         return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
   22745       }
   22746       SDValue ALo = extract128BitVector(A, 0, DAG, dl);
   22747       SDValue BLo = extract128BitVector(B, 0, DAG, dl);
   22748       SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
   22749       SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
   22750       ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
   22751       BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
   22752       AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
   22753       BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
   22754       Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
   22755                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
   22756                        DAG.getConstant(8, dl, MVT::v16i16));
   22757       Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
   22758                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
   22759                        DAG.getConstant(8, dl, MVT::v16i16));
   22760       // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
   22761       // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
   22762       const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
   22763                             16, 17, 18, 19, 20, 21, 22, 23};
   22764       const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
   22765                             24, 25, 26, 27, 28, 29, 30, 31};
   22766       return DAG.getNode(X86ISD::PACKUS, dl, VT,
   22767                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
   22768                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
   22769     }
   22770 
   22771     assert(VT == MVT::v16i8 && "Unexpected VT");
   22772 
   22773     SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
   22774     SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
   22775     SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
   22776     Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
   22777                       DAG.getConstant(8, dl, MVT::v16i16));
   22778     // If we have BWI we can use truncate instruction.
   22779     if (Subtarget.hasBWI())
   22780       return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
   22781     Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
   22782     Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
   22783     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
   22784   }
   22785 
   22786   assert(VT == MVT::v16i8 &&
   22787          "Pre-AVX2 support only supports v16i8 multiplication");
   22788   MVT ExVT = MVT::v8i16;
   22789   unsigned ExSSE41 = ISD::MULHU == Opcode ? ISD::ZERO_EXTEND_VECTOR_INREG
   22790                                           : ISD::SIGN_EXTEND_VECTOR_INREG;
   22791 
   22792   // Extract the lo parts and zero/sign extend to i16.
   22793   SDValue ALo, BLo;
   22794   if (Subtarget.hasSSE41()) {
   22795     ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
   22796     BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
   22797   } else {
   22798     const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
   22799                             -1, 4, -1, 5, -1, 6, -1, 7};
   22800     ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   22801     BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   22802     ALo = DAG.getBitcast(ExVT, ALo);
   22803     BLo = DAG.getBitcast(ExVT, BLo);
   22804     ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
   22805     BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
   22806   }
   22807 
   22808   // Extract the hi parts and zero/sign extend to i16.
   22809   SDValue AHi, BHi;
   22810   if (Subtarget.hasSSE41()) {
   22811     const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
   22812                             -1, -1, -1, -1, -1, -1, -1, -1};
   22813     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   22814     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   22815     AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
   22816     BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
   22817   } else {
   22818     const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
   22819                             -1, 12, -1, 13, -1, 14, -1, 15};
   22820     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
   22821     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
   22822     AHi = DAG.getBitcast(ExVT, AHi);
   22823     BHi = DAG.getBitcast(ExVT, BHi);
   22824     AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
   22825     BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
   22826   }
   22827 
   22828   // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
   22829   // pack back to v16i8.
   22830   SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
   22831   SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
   22832   RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
   22833   RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
   22834   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
   22835 }
   22836 
   22837 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
   22838   assert(Subtarget.isTargetWin64() && "Unexpected target");
   22839   EVT VT = Op.getValueType();
   22840   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
   22841          "Unexpected return type for lowering");
   22842 
   22843   RTLIB::Libcall LC;
   22844   bool isSigned;
   22845   switch (Op->getOpcode()) {
   22846   default: llvm_unreachable("Unexpected request for libcall!");
   22847   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
   22848   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
   22849   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
   22850   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
   22851   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
   22852   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
   22853   }
   22854 
   22855   SDLoc dl(Op);
   22856   SDValue InChain = DAG.getEntryNode();
   22857 
   22858   TargetLowering::ArgListTy Args;
   22859   TargetLowering::ArgListEntry Entry;
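           // Win64 passes 128-bit integer arguments by reference, so each operand is
           // spilled to a 16-byte-aligned stack slot and its address is passed instead.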
   22860   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
   22861     EVT ArgVT = Op->getOperand(i).getValueType();
   22862     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
   22863            "Unexpected argument type for lowering");
   22864     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
   22865     Entry.Node = StackPtr;
   22866     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
   22867                            MachinePointerInfo(), /* Alignment = */ 16);
   22868     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   22869     Entry.Ty = PointerType::get(ArgTy,0);
   22870     Entry.IsSExt = false;
   22871     Entry.IsZExt = false;
   22872     Args.push_back(Entry);
   22873   }
   22874 
   22875   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
   22876                                          getPointerTy(DAG.getDataLayout()));
   22877 
   22878   TargetLowering::CallLoweringInfo CLI(DAG);
   22879   CLI.setDebugLoc(dl)
   22880       .setChain(InChain)
   22881       .setLibCallee(
   22882           getLibcallCallingConv(LC),
   22883           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
   22884           std::move(Args))
   22885       .setInRegister()
   22886       .setSExtResult(isSigned)
   22887       .setZExtResult(!isSigned);
   22888 
   22889   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
   22890   return DAG.getBitcast(VT, CallInfo.first);
   22891 }
   22892 
   22893 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
   22894                              SelectionDAG &DAG) {
   22895   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
   22896   MVT VT = Op0.getSimpleValueType();
   22897   SDLoc dl(Op);
   22898 
   22899   // Decompose 256-bit ops into smaller 128-bit ops.
   22900   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
   22901     unsigned Opcode = Op.getOpcode();
   22902     unsigned NumElems = VT.getVectorNumElements();
   22903     MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
   22904     SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
   22905     SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
   22906     SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
   22907     SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
   22908     SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
   22909     SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
   22910     SDValue Ops[] = {
   22911       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
   22912       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
   22913     };
   22914     return DAG.getMergeValues(Ops, dl);
   22915   }
   22916 
   22917   assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
   22918          (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
   22919          (VT == MVT::v16i32 && Subtarget.hasAVX512()));
   22920 
   22921   int NumElts = VT.getVectorNumElements();
   22922 
   22923   // PMULxD operations multiply each even value (starting at 0) of LHS with
    22924   // the corresponding value of RHS and produce a widened result.
   22925   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
   22926   // => <2 x i64> <ae|cg>
   22927   //
    22928   // In other words, to have all the results, we need to perform two PMULxD:
   22929   // 1. one with the even values.
   22930   // 2. one with the odd values.
    22931   // To achieve #2, we need to place the odd values at an even position.
   22932   //
    22933   // Place the odd values at even positions (basically, shift all values one
    22934   // step to the left):
   22935   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
   22936   // <a|b|c|d> => <b|undef|d|undef>
   22937   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
   22938                                       makeArrayRef(&Mask[0], NumElts));
   22939   // <e|f|g|h> => <f|undef|h|undef>
   22940   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
   22941                                       makeArrayRef(&Mask[0], NumElts));
   22942 
    22943   // Emit two multiplies, one for the even-indexed elements and one for the
    22944   // odd-indexed elements.
   22945   MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
   22946   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
   22947   unsigned Opcode =
   22948       (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
   22949   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
   22950   // => <2 x i64> <ae|cg>
   22951   SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
   22952                                                 DAG.getBitcast(MulVT, Op0),
   22953                                                 DAG.getBitcast(MulVT, Op1)));
   22954   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
   22955   // => <2 x i64> <bf|dh>
   22956   SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
   22957                                                 DAG.getBitcast(MulVT, Odd0),
   22958                                                 DAG.getBitcast(MulVT, Odd1)));
   22959 
   22960   // Shuffle it back into the right order.
   22961   SmallVector<int, 16> HighMask(NumElts);
   22962   SmallVector<int, 16> LowMask(NumElts);
   22963   for (int i = 0; i != NumElts; ++i) {
   22964     HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
   22965     LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
   22966   }
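            // E.g. for v4i32 (NumElts == 4):
            //   Mul1 = <ae_lo|ae_hi|cg_lo|cg_hi>, Mul2 = <bf_lo|bf_hi|dh_lo|dh_hi>
            //   LowMask  = {0, 4, 2, 6} -> Lows  = <ae_lo|bf_lo|cg_lo|dh_lo>
            //   HighMask = {1, 5, 3, 7} -> Highs = <ae_hi|bf_hi|cg_hi|dh_hi>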
   22967 
   22968   SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
   22969   SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   22970 
    22971   // If we have a signed multiply but no PMULDQ, fix up the high parts of an
    22972   // unsigned multiply.
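            // For signed inputs the unsigned high part Hu relates to the signed high
            // part Hs by: Hs = Hu - (Op0 < 0 ? Op1 : 0) - (Op1 < 0 ? Op0 : 0).
            // sra(X, 31) is all-ones exactly when X is negative, so the two AND
            // terms below compute those conditional subtrahends.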
   22973   if (IsSigned && !Subtarget.hasSSE41()) {
   22974     SDValue ShAmt = DAG.getConstant(
   22975         31, dl,
   22976         DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
   22977     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
   22978                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
   22979     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
   22980                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
   22981 
   22982     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
   22983     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
   22984   }
   22985 
   22986   // The first result of MUL_LOHI is actually the low value, followed by the
   22987   // high value.
   22988   SDValue Ops[] = {Lows, Highs};
   22989   return DAG.getMergeValues(Ops, dl);
   22990 }
   22991 
   22992 // Return true if the required (according to Opcode) shift-imm form is natively
   22993 // supported by the Subtarget
   22994 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
   22995                                         unsigned Opcode) {
   22996   if (VT.getScalarSizeInBits() < 16)
   22997     return false;
   22998 
   22999   if (VT.is512BitVector() && Subtarget.hasAVX512() &&
   23000       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
   23001     return true;
   23002 
   23003   bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
   23004                 (VT.is256BitVector() && Subtarget.hasInt256());
   23005 
   23006   bool AShift = LShift && (Subtarget.hasAVX512() ||
   23007                            (VT != MVT::v2i64 && VT != MVT::v4i64));
   23008   return (Opcode == ISD::SRA) ? AShift : LShift;
   23009 }
   23010 
   23011 // The shift amount is a variable, but it is the same for all vector lanes.
   23012 // These instructions are defined together with shift-immediate.
   23013 static
   23014 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
   23015                                       unsigned Opcode) {
   23016   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
   23017 }
   23018 
   23019 // Return true if the required (according to Opcode) variable-shift form is
   23020 // natively supported by the Subtarget
   23021 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
   23022                                     unsigned Opcode) {
   23023 
   23024   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
   23025     return false;
   23026 
    23027   // vXi16 is supported only with AVX-512 BWI.
   23028   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
   23029     return false;
   23030 
   23031   if (Subtarget.hasAVX512())
   23032     return true;
   23033 
   23034   bool LShift = VT.is128BitVector() || VT.is256BitVector();
    23035   bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
   23036   return (Opcode == ISD::SRA) ? AShift : LShift;
   23037 }
   23038 
   23039 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   23040                                          const X86Subtarget &Subtarget) {
   23041   MVT VT = Op.getSimpleValueType();
   23042   SDLoc dl(Op);
   23043   SDValue R = Op.getOperand(0);
   23044   SDValue Amt = Op.getOperand(1);
   23045 
   23046   unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
   23047     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
   23048 
   23049   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
   23050     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
   23051     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
   23052     SDValue Ex = DAG.getBitcast(ExVT, R);
   23053 
   23054     // ashr(R, 63) === cmp_slt(R, 0)
   23055     if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
   23056       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
   23057              "Unsupported PCMPGT op");
   23058       return DAG.getNode(X86ISD::PCMPGT, dl, VT,
   23059                          getZeroVector(VT, Subtarget, DAG, dl), R);
   23060     }
   23061 
   23062     if (ShiftAmt >= 32) {
   23063       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
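                // E.g. for v2i64 with ShiftAmt >= 32, each i64 <lo|hi> becomes
                // <hi >> (ShiftAmt - 32) | hi >> 31>, which the shuffle mask
                // {5, 1, 7, 3} assembles from the Lower and Upper halves.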
   23064       SDValue Upper =
   23065           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
   23066       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
   23067                                                  ShiftAmt - 32, DAG);
   23068       if (VT == MVT::v2i64)
   23069         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
   23070       if (VT == MVT::v4i64)
   23071         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
   23072                                   {9, 1, 11, 3, 13, 5, 15, 7});
   23073     } else {
   23074       // SRA upper i32, SHL whole i64 and select lower i32.
   23075       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
   23076                                                  ShiftAmt, DAG);
   23077       SDValue Lower =
   23078           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
   23079       Lower = DAG.getBitcast(ExVT, Lower);
   23080       if (VT == MVT::v2i64)
   23081         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
   23082       if (VT == MVT::v4i64)
   23083         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
   23084                                   {8, 1, 10, 3, 12, 5, 14, 7});
   23085     }
   23086     return DAG.getBitcast(VT, Ex);
   23087   };
   23088 
   23089   // Optimize shl/srl/sra with constant shift amount.
   23090   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
   23091     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
   23092       uint64_t ShiftAmt = ShiftConst->getZExtValue();
   23093 
   23094       if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
   23095         return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
   23096 
   23097       // i64 SRA needs to be performed as partial shifts.
   23098       if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
   23099            (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
   23100           Op.getOpcode() == ISD::SRA)
   23101         return ArithmeticShiftRight64(ShiftAmt);
   23102 
   23103       if (VT == MVT::v16i8 ||
   23104           (Subtarget.hasInt256() && VT == MVT::v32i8) ||
   23105           VT == MVT::v64i8) {
   23106         unsigned NumElts = VT.getVectorNumElements();
   23107         MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
   23108 
   23109         // Simple i8 add case
   23110         if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
   23111           return DAG.getNode(ISD::ADD, dl, VT, R, R);
   23112 
   23113         // ashr(R, 7)  === cmp_slt(R, 0)
   23114         if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
   23115           SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
   23116           if (VT.is512BitVector()) {
   23117             assert(VT == MVT::v64i8 && "Unexpected element type!");
   23118             SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
   23119                                        ISD::SETGT);
   23120             return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
   23121           }
   23122           return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
   23123         }
   23124 
   23125         // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
   23126         if (VT == MVT::v16i8 && Subtarget.hasXOP())
   23127           return SDValue();
   23128 
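                  // There are no byte-sized vector shifts, so shift the vector as
                  // vXi16 and then mask away the bits that crossed into the
                  // neighbouring byte. E.g. for a left shift by 3: (bitcast to
                  // vXi16) << 3, then AND each byte with (0xFF << 3) == 0xF8.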
   23129         if (Op.getOpcode() == ISD::SHL) {
   23130           // Make a large shift.
   23131           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
   23132                                                    R, ShiftAmt, DAG);
   23133           SHL = DAG.getBitcast(VT, SHL);
   23134           // Zero out the rightmost bits.
   23135           return DAG.getNode(ISD::AND, dl, VT, SHL,
   23136                              DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
   23137         }
   23138         if (Op.getOpcode() == ISD::SRL) {
   23139           // Make a large shift.
   23140           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
   23141                                                    R, ShiftAmt, DAG);
   23142           SRL = DAG.getBitcast(VT, SRL);
   23143           // Zero out the leftmost bits.
   23144           return DAG.getNode(ISD::AND, dl, VT, SRL,
   23145                              DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
   23146         }
   23147         if (Op.getOpcode() == ISD::SRA) {
   23148           // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
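                    // E.g. for ShiftAmt == 3, Mask == 0x10: R == 0xF0 (-16) gives
                    // lshr -> 0x1E, xor -> 0x0E, sub -> 0xFE (-2) == -16 >> 3.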
   23149           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   23150 
   23151           SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
   23152           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
   23153           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
   23154           return Res;
   23155         }
   23156         llvm_unreachable("Unknown shift opcode.");
   23157       }
   23158     }
   23159   }
   23160 
   23161   // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
   23162   // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
   23163   if (!Subtarget.hasXOP() &&
   23164       (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
   23165        (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
   23166 
    23167     // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
   23168     unsigned SubVectorScale = 1;
   23169     if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
   23170       SubVectorScale =
   23171           Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
   23172       Amt = Amt.getOperand(0);
   23173     }
   23174 
   23175     // Peek through any splat that was introduced for i64 shift vectorization.
   23176     int SplatIndex = -1;
   23177     if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
   23178       if (SVN->isSplat()) {
   23179         SplatIndex = SVN->getSplatIndex();
   23180         Amt = Amt.getOperand(0);
   23181         assert(SplatIndex < (int)VT.getVectorNumElements() &&
   23182                "Splat shuffle referencing second operand");
   23183       }
   23184 
   23185     if (Amt.getOpcode() != ISD::BITCAST ||
   23186         Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
   23187       return SDValue();
   23188 
   23189     Amt = Amt.getOperand(0);
   23190     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
   23191                      (SubVectorScale * VT.getVectorNumElements());
   23192     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
   23193     uint64_t ShiftAmt = 0;
   23194     unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
   23195     for (unsigned i = 0; i != Ratio; ++i) {
   23196       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
   23197       if (!C)
   23198         return SDValue();
   23199       // 6 == Log2(64)
   23200       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
   23201     }
   23202 
   23203     // Check remaining shift amounts (if not a splat).
   23204     if (SplatIndex < 0) {
   23205       for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
   23206         uint64_t ShAmt = 0;
   23207         for (unsigned j = 0; j != Ratio; ++j) {
   23208           ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
   23209           if (!C)
   23210             return SDValue();
   23211           // 6 == Log2(64)
   23212           ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
   23213         }
   23214         if (ShAmt != ShiftAmt)
   23215           return SDValue();
   23216       }
   23217     }
   23218 
   23219     if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
   23220       return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
   23221 
   23222     if (Op.getOpcode() == ISD::SRA)
   23223       return ArithmeticShiftRight64(ShiftAmt);
   23224   }
   23225 
   23226   return SDValue();
   23227 }
   23228 
   23229 // Determine if V is a splat value, and return the scalar.
   23230 static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
   23231                             SelectionDAG &DAG, const X86Subtarget &Subtarget,
   23232                             unsigned Opcode) {
    23233   V = peekThroughEXTRACT_SUBVECTORs(V);
   23234 
   23235   // Check if this is a splat build_vector node.
   23236   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
   23237     SDValue SplatAmt = BV->getSplatValue();
   23238     if (SplatAmt && SplatAmt.isUndef())
   23239       return SDValue();
   23240     return SplatAmt;
   23241   }
   23242 
   23243   // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns.
   23244   if (V.getOpcode() == ISD::SUB &&
   23245       !SupportedVectorVarShift(VT, Subtarget, Opcode)) {
   23246     SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0));
   23247     SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1));
   23248 
   23249     // Ensure that the corresponding splat BV element is not UNDEF.
   23250     BitVector UndefElts;
   23251     BuildVectorSDNode *BV0 = dyn_cast<BuildVectorSDNode>(LHS);
   23252     ShuffleVectorSDNode *SVN1 = dyn_cast<ShuffleVectorSDNode>(RHS);
   23253     if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) {
   23254       unsigned SplatIdx = (unsigned)SVN1->getSplatIndex();
   23255       if (!UndefElts[SplatIdx])
   23256         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   23257                            VT.getVectorElementType(), V,
   23258                            DAG.getIntPtrConstant(SplatIdx, dl));
   23259     }
   23260   }
   23261 
   23262   // Check if this is a shuffle node doing a splat.
   23263   ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V);
   23264   if (!SVN || !SVN->isSplat())
   23265     return SDValue();
   23266 
   23267   unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
   23268   SDValue InVec = V.getOperand(0);
   23269   if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   23270     assert((SplatIdx < VT.getVectorNumElements()) &&
   23271            "Unexpected shuffle index found!");
   23272     return InVec.getOperand(SplatIdx);
   23273   } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
   23274     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
   23275       if (C->getZExtValue() == SplatIdx)
   23276         return InVec.getOperand(1);
   23277   }
   23278 
   23279   // Avoid introducing an extract element from a shuffle.
   23280   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   23281                      VT.getVectorElementType(), InVec,
   23282                      DAG.getIntPtrConstant(SplatIdx, dl));
   23283 }
   23284 
   23285 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   23286                                         const X86Subtarget &Subtarget) {
   23287   MVT VT = Op.getSimpleValueType();
   23288   SDLoc dl(Op);
   23289   SDValue R = Op.getOperand(0);
   23290   SDValue Amt = Op.getOperand(1);
   23291   unsigned Opcode = Op.getOpcode();
   23292 
   23293   unsigned X86OpcI = (Opcode == ISD::SHL) ? X86ISD::VSHLI :
   23294     (Opcode == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
   23295 
   23296   unsigned X86OpcV = (Opcode == ISD::SHL) ? X86ISD::VSHL :
   23297     (Opcode == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
   23298 
   23299   Amt = peekThroughEXTRACT_SUBVECTORs(Amt);
   23300 
   23301   if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
   23302     if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) {
   23303       MVT EltVT = VT.getVectorElementType();
   23304       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
   23305       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
   23306         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
   23307       else if (EltVT.bitsLT(MVT::i32))
   23308         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
   23309 
   23310       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
   23311     }
   23312   }
   23313 
   23314   // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
   23315   if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
   23316       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   23317     Amt = Amt.getOperand(0);
   23318     unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
   23319     std::vector<SDValue> Vals(Ratio);
   23320     for (unsigned i = 0; i != Ratio; ++i)
   23321       Vals[i] = Amt.getOperand(i);
   23322     for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
   23323       for (unsigned j = 0; j != Ratio; ++j)
   23324         if (Vals[j] != Amt.getOperand(i + j))
   23325           return SDValue();
   23326     }
   23327 
   23328     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
   23329       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
   23330   }
   23331   return SDValue();
   23332 }
   23333 
   23334 // Convert a shift/rotate left amount to a multiplication scale factor.
   23335 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
   23336                                        const X86Subtarget &Subtarget,
   23337                                        SelectionDAG &DAG) {
   23338   MVT VT = Amt.getSimpleValueType();
   23339   if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
   23340         (Subtarget.hasInt256() && VT == MVT::v16i16) ||
   23341         (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
   23342     return SDValue();
   23343 
   23344   if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
   23345     SmallVector<SDValue, 8> Elts;
   23346     MVT SVT = VT.getVectorElementType();
   23347     unsigned SVTBits = SVT.getSizeInBits();
   23348     APInt One(SVTBits, 1);
   23349     unsigned NumElems = VT.getVectorNumElements();
   23350 
   23351     for (unsigned i = 0; i != NumElems; ++i) {
   23352       SDValue Op = Amt->getOperand(i);
   23353       if (Op->isUndef()) {
   23354         Elts.push_back(Op);
   23355         continue;
   23356       }
   23357 
   23358       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
   23359       APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
   23360       uint64_t ShAmt = C.getZExtValue();
   23361       if (ShAmt >= SVTBits) {
   23362         Elts.push_back(DAG.getUNDEF(SVT));
   23363         continue;
   23364       }
   23365       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
   23366     }
   23367     return DAG.getBuildVector(VT, dl, Elts);
   23368   }
   23369 
   23370   // If the target doesn't support variable shifts, use either FP conversion
   23371   // or integer multiplication to avoid shifting each element individually.
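            // The v4i32 case builds the float 2^Amt directly: shifting Amt into the
            // exponent field (bit 23) and adding the bias encoding of 1.0f
            // (0x3f800000) yields 2^Amt, which FP_TO_SINT turns back into an
            // integer power of two. E.g. Amt == 5: 0x02800000 + 0x3f800000 ==
            // 0x42000000 == 32.0f -> 32.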
   23372   if (VT == MVT::v4i32) {
   23373     Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
   23374     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
   23375                       DAG.getConstant(0x3f800000U, dl, VT));
   23376     Amt = DAG.getBitcast(MVT::v4f32, Amt);
   23377     return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
   23378   }
   23379 
   23380   // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
   23381   if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
   23382     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
   23383     SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
   23384     SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
   23385     Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
   23386     Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
   23387     if (Subtarget.hasSSE41())
   23388       return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
   23389 
   23390     return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
   23391                                         DAG.getBitcast(VT, Hi),
   23392                                         {0, 2, 4, 6, 8, 10, 12, 14});
   23393   }
   23394 
   23395   return SDValue();
   23396 }
   23397 
   23398 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   23399                           SelectionDAG &DAG) {
   23400   MVT VT = Op.getSimpleValueType();
   23401   SDLoc dl(Op);
   23402   SDValue R = Op.getOperand(0);
   23403   SDValue Amt = Op.getOperand(1);
   23404   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
   23405 
   23406   assert(VT.isVector() && "Custom lowering only for vector shifts!");
   23407   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
   23408 
   23409   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
   23410     return V;
   23411 
   23412   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
   23413     return V;
   23414 
   23415   if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
   23416     return Op;
   23417 
   23418   // XOP has 128-bit variable logical/arithmetic shifts.
   23419   // +ve/-ve Amt = shift left/right.
   23420   if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
   23421                              VT == MVT::v8i16 || VT == MVT::v16i8)) {
   23422     if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
   23423       SDValue Zero = DAG.getConstant(0, dl, VT);
   23424       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
   23425     }
   23426     if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
   23427       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
   23428     if (Op.getOpcode() == ISD::SRA)
   23429       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
   23430   }
   23431 
    23432   // v2i64 vector logical shifts can efficiently avoid scalarization - do the
   23433   // shifts per-lane and then shuffle the partial results back together.
   23434   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
   23435     // Splat the shift amounts so the scalar shifts above will catch it.
   23436     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
   23437     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
   23438     SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
   23439     SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
   23440     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
   23441   }
   23442 
   23443   // i64 vector arithmetic shift can be emulated with the transform:
   23444   // M = lshr(SIGN_MASK, Amt)
   23445   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
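            // E.g. for Amt == 4 and R == -32 (0xFFFFFFFFFFFFFFE0):
            //   M          == 0x0800000000000000
            //   lshr(R, 4) == 0x0FFFFFFFFFFFFFFE
            //   xor, sub M -> 0xFFFFFFFFFFFFFFFE == -2 == -32 >> 4 (arithmetic).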
   23446   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
   23447       Op.getOpcode() == ISD::SRA) {
   23448     SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
   23449     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
   23450     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
   23451     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
   23452     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
   23453     return R;
   23454   }
   23455 
   23456   // If possible, lower this shift as a sequence of two shifts by
   23457   // constant plus a BLENDing shuffle instead of scalarizing it.
   23458   // Example:
   23459   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
   23460   //
   23461   // Could be rewritten as:
   23462   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
   23463   //
   23464   // The advantage is that the two shifts from the example would be
   23465   // lowered as X86ISD::VSRLI nodes in parallel before blending.
   23466   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
   23467                       (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
   23468     SDValue Amt1, Amt2;
   23469     unsigned NumElts = VT.getVectorNumElements();
   23470     SmallVector<int, 8> ShuffleMask;
   23471     for (unsigned i = 0; i != NumElts; ++i) {
   23472       SDValue A = Amt->getOperand(i);
   23473       if (A.isUndef()) {
   23474         ShuffleMask.push_back(SM_SentinelUndef);
   23475         continue;
   23476       }
   23477       if (!Amt1 || Amt1 == A) {
   23478         ShuffleMask.push_back(i);
   23479         Amt1 = A;
   23480         continue;
   23481       }
   23482       if (!Amt2 || Amt2 == A) {
   23483         ShuffleMask.push_back(i + NumElts);
   23484         Amt2 = A;
   23485         continue;
   23486       }
   23487       break;
   23488     }
   23489 
   23490     // Only perform this blend if we can perform it without loading a mask.
   23491     if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
   23492         isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
   23493         (VT != MVT::v16i16 ||
   23494          is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
   23495         (VT == MVT::v4i32 || Subtarget.hasSSE41() ||
   23496          Op.getOpcode() != ISD::SHL || canWidenShuffleElements(ShuffleMask))) {
   23497       SDValue Splat1 =
   23498           DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
   23499       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
   23500       SDValue Splat2 =
   23501           DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
   23502       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
   23503       return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
   23504     }
   23505   }
   23506 
   23507   // If possible, lower this packed shift into a vector multiply instead of
   23508   // expanding it into a sequence of scalar shifts.
   23509   if (Op.getOpcode() == ISD::SHL)
   23510     if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
   23511       return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
   23512 
   23513   // Constant ISD::SRL can be performed efficiently on vXi8/vXi16 vectors as we
    23514   // can replace with ISD::MULHU, creating a scale factor from (NumEltBits - Amt).
   23515   // TODO: Improve support for the shift by zero special case.
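            // E.g. for v8i16: X >> 3 == mulhu(X, 1 << (16 - 3)) == mulhu(X, 0x2000),
            // since (X * 2^13) >> 16 == X >> 3. Amt == 0 would need a 2^16 scale,
            // hence the known-never-zero / select handling below.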
   23516   if (Op.getOpcode() == ISD::SRL && ConstantAmt &&
   23517       ((Subtarget.hasSSE41() && VT == MVT::v8i16) ||
   23518        DAG.isKnownNeverZero(Amt)) &&
   23519       (VT == MVT::v16i8 || VT == MVT::v8i16 ||
   23520        ((VT == MVT::v32i8 || VT == MVT::v16i16) && Subtarget.hasInt256()))) {
   23521     SDValue EltBits = DAG.getConstant(VT.getScalarSizeInBits(), dl, VT);
   23522     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
   23523     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
   23524       SDValue Zero = DAG.getConstant(0, dl, VT);
   23525       SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
   23526       SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
   23527       return DAG.getSelect(dl, VT, ZAmt, R, Res);
   23528     }
   23529   }
   23530 
    23531   // v4i32 non-uniform shifts.
   23532   // If the shift amount is constant we can shift each lane using the SSE2
   23533   // immediate shifts, else we need to zero-extend each lane to the lower i64
   23534   // and shift using the SSE2 variable shifts.
   23535   // The separate results can then be blended together.
   23536   if (VT == MVT::v4i32) {
   23537     unsigned Opc = Op.getOpcode();
   23538     SDValue Amt0, Amt1, Amt2, Amt3;
   23539     if (ConstantAmt) {
   23540       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
   23541       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
   23542       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
   23543       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
   23544     } else {
   23545       // ISD::SHL is handled above but we include it here for completeness.
   23546       switch (Opc) {
   23547       default:
   23548         llvm_unreachable("Unknown target vector shift node");
   23549       case ISD::SHL:
   23550         Opc = X86ISD::VSHL;
   23551         break;
   23552       case ISD::SRL:
   23553         Opc = X86ISD::VSRL;
   23554         break;
   23555       case ISD::SRA:
   23556         Opc = X86ISD::VSRA;
   23557         break;
   23558       }
   23559       // The SSE2 shifts use the lower i64 as the same shift amount for
   23560       // all lanes and the upper i64 is ignored. On AVX we're better off
   23561       // just zero-extending, but for SSE just duplicating the top 16-bits is
   23562       // cheaper and has the same effect for out of range values.
   23563       if (Subtarget.hasAVX()) {
   23564         SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
   23565         Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
   23566         Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
   23567         Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
   23568         Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
   23569       } else {
   23570         SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
   23571         SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
   23572                                              {4, 5, 6, 7, -1, -1, -1, -1});
   23573         Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
   23574                                     {0, 1, 1, 1, -1, -1, -1, -1});
   23575         Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
   23576                                     {2, 3, 3, 3, -1, -1, -1, -1});
   23577         Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
   23578                                     {0, 1, 1, 1, -1, -1, -1, -1});
   23579         Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
   23580                                     {2, 3, 3, 3, -1, -1, -1, -1});
   23581       }
   23582     }
   23583 
   23584     SDValue R0 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt0));
   23585     SDValue R1 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt1));
   23586     SDValue R2 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt2));
   23587     SDValue R3 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt3));
   23588 
   23589     // Merge the shifted lane results optimally with/without PBLENDW.
   23590     // TODO - ideally shuffle combining would handle this.
   23591     if (Subtarget.hasSSE41()) {
   23592       SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
   23593       SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
   23594       return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
   23595     }
   23596     SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
   23597     SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
   23598     return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
   23599   }
   23600 
   23601   // It's worth extending once and using the vXi16/vXi32 shifts for smaller
   23602   // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
   23603   // make the existing SSE solution better.
    23604   // NOTE: We honor the preferred vector width before promoting to 512 bits.
   23605   if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
   23606       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
   23607       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
   23608       (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
   23609       (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
   23610     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
   23611            "Unexpected vector type");
   23612     MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
   23613     MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
   23614     unsigned ExtOpc =
   23615         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
   23616     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
   23617     Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
   23618     return DAG.getNode(ISD::TRUNCATE, dl, VT,
   23619                        DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
   23620   }
   23621 
   23622   if (VT == MVT::v16i8 ||
   23623       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
   23624       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
   23625     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
   23626     unsigned ShiftOpcode = Op->getOpcode();
   23627 
   23628     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
   23629       if (VT.is512BitVector()) {
   23630         // On AVX512BW targets we make use of the fact that VSELECT lowers
   23631         // to a masked blend which selects bytes based just on the sign bit
   23632         // extracted to a mask.
   23633         MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
   23634         V0 = DAG.getBitcast(VT, V0);
   23635         V1 = DAG.getBitcast(VT, V1);
   23636         Sel = DAG.getBitcast(VT, Sel);
   23637         Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
   23638                            ISD::SETGT);
   23639         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
   23640       } else if (Subtarget.hasSSE41()) {
   23641         // On SSE41 targets we make use of the fact that VSELECT lowers
   23642         // to PBLENDVB which selects bytes based just on the sign bit.
   23643         V0 = DAG.getBitcast(VT, V0);
   23644         V1 = DAG.getBitcast(VT, V1);
   23645         Sel = DAG.getBitcast(VT, Sel);
   23646         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
   23647       }
   23648       // On pre-SSE41 targets we test for the sign bit by comparing to
   23649       // zero - a negative value will set all bits of the lanes to true
   23650       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
   23651       SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
   23652       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
   23653       return DAG.getSelect(dl, SelVT, C, V0, V1);
   23654     };
   23655 
   23656     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
   23657     // We can safely do this using i16 shifts as we're only interested in
   23658     // the 3 lower bits of each byte.
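              // After the shift the amount's bit 2 sits in each byte's sign bit, so
              // the three VSELECTs below conditionally apply shifts of 4, 2 and 1:
              // e.g. an amount of 5 (0b101) takes the shift-by-4 and shift-by-1
              // stages for a total shift of 5.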
   23659     Amt = DAG.getBitcast(ExtVT, Amt);
   23660     Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
   23661     Amt = DAG.getBitcast(VT, Amt);
   23662 
   23663     if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
   23664       // r = VSELECT(r, shift(r, 4), a);
   23665       SDValue M =
   23666           DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
   23667       R = SignBitSelect(VT, Amt, M, R);
   23668 
   23669       // a += a
   23670       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   23671 
   23672       // r = VSELECT(r, shift(r, 2), a);
   23673       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
   23674       R = SignBitSelect(VT, Amt, M, R);
   23675 
   23676       // a += a
   23677       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   23678 
   23679       // return VSELECT(r, shift(r, 1), a);
   23680       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
   23681       R = SignBitSelect(VT, Amt, M, R);
   23682       return R;
   23683     }
   23684 
   23685     if (Op->getOpcode() == ISD::SRA) {
   23686       // For SRA we need to unpack each byte to the higher byte of a i16 vector
   23687       // so we can correctly sign extend. We don't care what happens to the
   23688       // lower byte.
   23689       SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
   23690       SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
   23691       SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
   23692       SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
   23693       ALo = DAG.getBitcast(ExtVT, ALo);
   23694       AHi = DAG.getBitcast(ExtVT, AHi);
   23695       RLo = DAG.getBitcast(ExtVT, RLo);
   23696       RHi = DAG.getBitcast(ExtVT, RHi);
   23697 
   23698       // r = VSELECT(r, shift(r, 4), a);
   23699       SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
   23700                                 DAG.getConstant(4, dl, ExtVT));
   23701       SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
   23702                                 DAG.getConstant(4, dl, ExtVT));
   23703       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
   23704       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
   23705 
   23706       // a += a
   23707       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
   23708       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
   23709 
   23710       // r = VSELECT(r, shift(r, 2), a);
   23711       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
   23712                         DAG.getConstant(2, dl, ExtVT));
   23713       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
   23714                         DAG.getConstant(2, dl, ExtVT));
   23715       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
   23716       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
   23717 
   23718       // a += a
   23719       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
   23720       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
   23721 
   23722       // r = VSELECT(r, shift(r, 1), a);
   23723       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
   23724                         DAG.getConstant(1, dl, ExtVT));
   23725       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
   23726                         DAG.getConstant(1, dl, ExtVT));
   23727       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
   23728       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
   23729 
    23730       // Logical shift the result back to the lower byte, leaving a zero
    23731       // upper byte, which means that we can safely pack the results back
    23732       // together with PACKUSWB.
   23733       RLo =
   23734           DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
   23735       RHi =
   23736           DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
   23737       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
   23738     }
   23739   }
   23740 
   23741   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
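              // Unpack the i16 lanes into the high halves of i32 lanes (with the
              // amounts zero-extended into the low halves), perform the shift as
              // v8i32, then shift the results back down by 16 and re-pack. Placing
              // R in the high half keeps SRA's sign bit and lets SRL shift in zeros.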
   23742     MVT ExtVT = MVT::v8i32;
   23743     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
   23744     SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
   23745     SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
   23746     SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
   23747     SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
   23748     ALo = DAG.getBitcast(ExtVT, ALo);
   23749     AHi = DAG.getBitcast(ExtVT, AHi);
   23750     RLo = DAG.getBitcast(ExtVT, RLo);
   23751     RHi = DAG.getBitcast(ExtVT, RHi);
   23752     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
   23753     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
   23754     Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
   23755     Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
   23756     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
   23757   }
   23758 
   23759   if (VT == MVT::v8i16) {
   23760     unsigned ShiftOpcode = Op->getOpcode();
   23761 
   23762     // If we have a constant shift amount, the non-SSE41 path is best as
    23763     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
   23764     bool UseSSE41 = Subtarget.hasSSE41() &&
   23765                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
   23766 
   23767     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
   23768       // On SSE41 targets we make use of the fact that VSELECT lowers
   23769       // to PBLENDVB which selects bytes based just on the sign bit.
   23770       if (UseSSE41) {
   23771         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
   23772         V0 = DAG.getBitcast(ExtVT, V0);
   23773         V1 = DAG.getBitcast(ExtVT, V1);
   23774         Sel = DAG.getBitcast(ExtVT, Sel);
   23775         return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
   23776       }
   23777       // On pre-SSE41 targets we splat the sign bit - a negative value will
   23778       // set all bits of the lanes to true and VSELECT uses that in
   23779       // its OR(AND(V0,C),AND(V1,~C)) lowering.
   23780       SDValue C =
   23781           DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
   23782       return DAG.getSelect(dl, VT, C, V0, V1);
   23783     };
   23784 
   23785     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
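              // Shifting left by 12 puts bit 3 of the 4-bit amount in the sign bit,
              // so the four VSELECT stages below conditionally shift by 8, 4, 2 and
              // 1, doubling 'a' between stages to expose the next amount bit.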
   23786     if (UseSSE41) {
   23787       // On SSE41 targets we need to replicate the shift mask in both
   23788       // bytes for PBLENDVB.
   23789       Amt = DAG.getNode(
   23790           ISD::OR, dl, VT,
   23791           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
   23792           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
   23793     } else {
   23794       Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
   23795     }
   23796 
   23797     // r = VSELECT(r, shift(r, 8), a);
   23798     SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
   23799     R = SignBitSelect(Amt, M, R);
   23800 
   23801     // a += a
   23802     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   23803 
   23804     // r = VSELECT(r, shift(r, 4), a);
   23805     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
   23806     R = SignBitSelect(Amt, M, R);
   23807 
   23808     // a += a
   23809     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   23810 
   23811     // r = VSELECT(r, shift(r, 2), a);
   23812     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
   23813     R = SignBitSelect(Amt, M, R);
   23814 
   23815     // a += a
   23816     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
   23817 
   23818     // return VSELECT(r, shift(r, 1), a);
   23819     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
   23820     R = SignBitSelect(Amt, M, R);
   23821     return R;
   23822   }
   23823 
   23824   // Decompose 256-bit shifts into smaller 128-bit shifts.
   23825   if (VT.is256BitVector())
   23826     return Lower256IntArith(Op, DAG);
   23827 
   23828   return SDValue();
   23829 }
   23830 
   23831 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
   23832                            SelectionDAG &DAG) {
   23833   MVT VT = Op.getSimpleValueType();
   23834   assert(VT.isVector() && "Custom lowering only for vector rotates!");
   23835 
   23836   SDLoc DL(Op);
   23837   SDValue R = Op.getOperand(0);
   23838   SDValue Amt = Op.getOperand(1);
   23839   unsigned Opcode = Op.getOpcode();
   23840   unsigned EltSizeInBits = VT.getScalarSizeInBits();
   23841 
   23842   if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
   23843     // Attempt to rotate by immediate.
   23844     APInt UndefElts;
   23845     SmallVector<APInt, 16> EltBits;
   23846     if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
   23847       if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
   23848             return EltBits[0] == V;
   23849           })) {
   23850         unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
   23851         uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
   23852         return DAG.getNode(Op, DL, VT, R,
   23853                            DAG.getConstant(RotateAmt, DL, MVT::i8));
   23854       }
   23855     }
   23856 
   23857     // Else, fall-back on VPROLV/VPRORV.
   23858     return Op;
   23859   }
   23860 
   23861   assert((Opcode == ISD::ROTL) && "Only ROTL supported");
   23862 
   23863   // XOP has 128-bit vector variable + immediate rotates.
   23864   // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
   23865   if (Subtarget.hasXOP()) {
   23866     // Split 256-bit integers.
   23867     if (VT.is256BitVector())
   23868       return Lower256IntArith(Op, DAG);
   23869     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
   23870 
   23871     // Attempt to rotate by immediate.
   23872     if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
   23873       if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
   23874         uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
   23875         assert(RotateAmt < EltSizeInBits && "Rotation out of range");
   23876         return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
   23877                            DAG.getConstant(RotateAmt, DL, MVT::i8));
   23878       }
   23879     }
   23880 
   23881     // Use general rotate by variable (per-element).
   23882     return Op;
   23883   }
   23884 
   23885   // Split 256-bit integers on pre-AVX2 targets.
   23886   if (VT.is256BitVector() && !Subtarget.hasAVX2())
   23887     return Lower256IntArith(Op, DAG);
   23888 
   23889   assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
   23890           ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
   23891            Subtarget.hasAVX2())) &&
   23892          "Only vXi32/vXi16/vXi8 vector rotates supported");
   23893 
    23894   // Rotate by a uniform constant - expand back to shifts.
   23895   // TODO - legalizers should be able to handle this.
   23896   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
   23897     if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
   23898       uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
   23899       assert(RotateAmt < EltSizeInBits && "Rotation out of range");
   23900       if (RotateAmt == 0)
   23901         return R;
   23902 
   23903       SDValue AmtR = DAG.getConstant(EltSizeInBits - RotateAmt, DL, VT);
   23904       SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
   23905       SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
   23906       return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
   23907     }
   23908   }
   23909 
   23910   // Rotate by splat - expand back to shifts.
   23911   // TODO - legalizers should be able to handle this.
   23912   if ((EltSizeInBits >= 16 || Subtarget.hasBWI()) &&
   23913       IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
   23914     SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
   23915     AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
   23916     SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
   23917     SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
   23918     return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
   23919   }
   23920 
    23921   // v16i8/v32i8: Split the rotation into rot4/rot2/rot1 stages and select
    23922   // each stage based on the corresponding bit of the rotation amount.
   23923   if (EltSizeInBits == 8) {
   23924     if (Subtarget.hasBWI()) {
   23925       SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
   23926       AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
   23927       SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
   23928       SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
   23929       return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
   23930     }
   23931 
   23932     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
   23933 
   23934     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
   23935       if (Subtarget.hasSSE41()) {
   23936         // On SSE41 targets we make use of the fact that VSELECT lowers
   23937         // to PBLENDVB which selects bytes based just on the sign bit.
   23938         V0 = DAG.getBitcast(VT, V0);
   23939         V1 = DAG.getBitcast(VT, V1);
   23940         Sel = DAG.getBitcast(VT, Sel);
   23941         return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
   23942       }
   23943       // On pre-SSE41 targets we test for the sign bit by comparing to
   23944       // zero - a negative value will set all bits of the lanes to true
   23945       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
   23946       SDValue Z = getZeroVector(SelVT, Subtarget, DAG, DL);
   23947       SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
   23948       return DAG.getSelect(DL, SelVT, C, V0, V1);
   23949     };
   23950 
   23951     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
   23952     // We can safely do this using i16 shifts as we're only interested in
   23953     // the 3 lower bits of each byte.
   23954     Amt = DAG.getBitcast(ExtVT, Amt);
   23955     Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
   23956     Amt = DAG.getBitcast(VT, Amt);
   23957 
   23958     // r = VSELECT(r, rot(r, 4), a);
   23959     SDValue M;
   23960     M = DAG.getNode(
   23961         ISD::OR, DL, VT,
   23962         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
   23963         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
   23964     R = SignBitSelect(VT, Amt, M, R);
   23965 
   23966     // a += a
   23967     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
   23968 
   23969     // r = VSELECT(r, rot(r, 2), a);
   23970     M = DAG.getNode(
   23971         ISD::OR, DL, VT,
   23972         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
   23973         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
   23974     R = SignBitSelect(VT, Amt, M, R);
   23975 
   23976     // a += a
   23977     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
   23978 
   23979     // return VSELECT(r, rot(r, 1), a);
   23980     M = DAG.getNode(
   23981         ISD::OR, DL, VT,
   23982         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
   23983         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
   23984     return SignBitSelect(VT, Amt, M, R);
   23985   }
   23986 
   23987   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
   23988   bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
   23989                         SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
   23990 
    23991   // Best to fall back for all supported variable shifts.
    23992   // AVX2 - best to fall back for non-constants as well.
   23993   // TODO - legalizers should be able to handle this.
   23994   if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
   23995     SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
   23996     AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
   23997     SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
   23998     SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
   23999     return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
   24000   }
   24001 
   24002   // As with shifts, convert the rotation amount to a multiplication factor.
   24003   SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
   24004   assert(Scale && "Failed to convert ROTL amount to scale");
   24005 
   24006   // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
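            // MUL gives the low half (R << Amt) and MULHU gives the bits that
            // wrapped around (R >> (16 - Amt)), so OR'ing them is a rotate left.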
   24007   if (EltSizeInBits == 16) {
   24008     SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
   24009     SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
   24010     return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
   24011   }
   24012 
   24013   // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
   24014   // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
   24015   // that can then be OR'd with the lower 32-bits.
   24016   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
   24017   static const int OddMask[] = {1, -1, 3, -1};
   24018   SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
   24019   SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
   24020 
   24021   SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
   24022                               DAG.getBitcast(MVT::v2i64, R),
   24023                               DAG.getBitcast(MVT::v2i64, Scale));
   24024   SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
   24025                               DAG.getBitcast(MVT::v2i64, R13),
   24026                               DAG.getBitcast(MVT::v2i64, Scale13));
   24027   Res02 = DAG.getBitcast(VT, Res02);
   24028   Res13 = DAG.getBitcast(VT, Res13);
   24029 
   24030   return DAG.getNode(ISD::OR, DL, VT,
   24031                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
   24032                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
   24033 }
   24034 
   24035 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
    24036   // Lower the "add/sub/mul with overflow" instruction into a regular instruction
    24037   // plus a "setcc" instruction that checks the overflow flag. The "brcond"
    24038   // lowering looks for this combo and may remove the "setcc" instruction if the
    24039   // "setcc" has only one use.
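            // For example, (i32, i1) = uaddo a, b becomes an X86ISD::ADD that also
            // produces EFLAGS, followed by a SETcc on COND_B that materializes the
            // carry as the overflow result.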
   24040   SDNode *N = Op.getNode();
   24041   SDValue LHS = N->getOperand(0);
   24042   SDValue RHS = N->getOperand(1);
   24043   unsigned BaseOp = 0;
   24044   X86::CondCode Cond;
   24045   SDLoc DL(Op);
   24046   switch (Op.getOpcode()) {
   24047   default: llvm_unreachable("Unknown ovf instruction!");
   24048   case ISD::SADDO:
    24049     // An add of one will be selected as an INC. Note that INC doesn't
    24050     // set CF, so we can't do this for UADDO.
   24051     if (isOneConstant(RHS)) {
   24052       BaseOp = X86ISD::INC;
   24053       Cond = X86::COND_O;
   24054       break;
   24055     }
   24056     BaseOp = X86ISD::ADD;
   24057     Cond = X86::COND_O;
   24058     break;
   24059   case ISD::UADDO:
   24060     BaseOp = X86ISD::ADD;
   24061     Cond = X86::COND_B;
   24062     break;
   24063   case ISD::SSUBO:
   24064     // A subtract of one will be selected as a DEC. Note that DEC doesn't
   24065     // set CF, so we can't do this for USUBO.
   24066     if (isOneConstant(RHS)) {
   24067       BaseOp = X86ISD::DEC;
   24068       Cond = X86::COND_O;
   24069       break;
   24070     }
   24071     BaseOp = X86ISD::SUB;
   24072     Cond = X86::COND_O;
   24073     break;
   24074   case ISD::USUBO:
   24075     BaseOp = X86ISD::SUB;
   24076     Cond = X86::COND_B;
   24077     break;
   24078   case ISD::SMULO:
   24079     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
   24080     Cond = X86::COND_O;
   24081     break;
   24082   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
   24083     if (N->getValueType(0) == MVT::i8) {
   24084       BaseOp = X86ISD::UMUL8;
   24085       Cond = X86::COND_O;
   24086       break;
   24087     }
   24088     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
   24089                                  MVT::i32);
   24090     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
   24091 
   24092     SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
   24093 
   24094     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   24095   }
   24096   }
   24097 
   24098   // Also sets EFLAGS.
   24099   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
   24100   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
   24101 
   24102   SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
   24103 
   24104   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   24105 }
   24106 
   24107 /// Returns true if the operand type is exactly twice the native width, and
   24108 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
   24109 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
   24110 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
   24111 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
   24112   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
   24113 
   24114   if (OpWidth == 64)
   24115     return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
   24116   else if (OpWidth == 128)
   24117     return Subtarget.hasCmpxchg16b();
   24118   else
   24119     return false;
   24120 }
   24121 
   24122 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   24123   return needsCmpXchgNb(SI->getValueOperand()->getType());
   24124 }
   24125 
   24126 // Note: this turns large loads into lock cmpxchg8b/16b.
   24127 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
   24128 TargetLowering::AtomicExpansionKind
   24129 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   24130   auto PTy = cast<PointerType>(LI->getPointerOperandType());
   24131   return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
   24132                                                : AtomicExpansionKind::None;
   24133 }
   24134 
   24135 TargetLowering::AtomicExpansionKind
   24136 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   24137   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
   24138   Type *MemType = AI->getType();
   24139 
   24140   // If the operand is too big, we must see if cmpxchg8/16b is available
   24141   // and default to library calls otherwise.
   24142   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
   24143     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
   24144                                    : AtomicExpansionKind::None;
   24145   }
   24146 
   24147   AtomicRMWInst::BinOp Op = AI->getOperation();
   24148   switch (Op) {
   24149   default:
   24150     llvm_unreachable("Unknown atomic operation");
   24151   case AtomicRMWInst::Xchg:
   24152   case AtomicRMWInst::Add:
   24153   case AtomicRMWInst::Sub:
   24154     // It's better to use xadd, xsub or xchg for these in all cases.
   24155     return AtomicExpansionKind::None;
   24156   case AtomicRMWInst::Or:
   24157   case AtomicRMWInst::And:
   24158   case AtomicRMWInst::Xor:
   24159     // If the atomicrmw's result isn't actually used, we can just add a "lock"
   24160     // prefix to a normal instruction for these operations.
   24161     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
   24162                             : AtomicExpansionKind::None;
   24163   case AtomicRMWInst::Nand:
   24164   case AtomicRMWInst::Max:
   24165   case AtomicRMWInst::Min:
   24166   case AtomicRMWInst::UMax:
   24167   case AtomicRMWInst::UMin:
   24168     // These always require a non-trivial set of data operations on x86. We must
   24169     // use a cmpxchg loop.
   24170     return AtomicExpansionKind::CmpXChg;
   24171   }
   24172 }
   24173 
   24174 LoadInst *
   24175 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   24176   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
   24177   Type *MemType = AI->getType();
   24178   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
   24179   // there is no benefit in turning such RMWs into loads, and it is actually
   24180   // harmful as it introduces a mfence.
   24181   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
   24182     return nullptr;
   24183 
   24184   auto Builder = IRBuilder<>(AI);
   24185   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   24186   auto SSID = AI->getSyncScopeID();
   24187   // We must restrict the ordering to avoid generating loads with Release or
   24188   // ReleaseAcquire orderings.
   24189   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
   24190   auto Ptr = AI->getPointerOperand();
   24191 
   24192   // Before the load we need a fence. Here is an example lifted from
   24193   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
   24194   // is required:
   24195   // Thread 0:
   24196   //   x.store(1, relaxed);
   24197   //   r1 = y.fetch_add(0, release);
   24198   // Thread 1:
   24199   //   y.fetch_add(42, acquire);
   24200   //   r2 = x.load(relaxed);
   24201   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
    24202   // lowered to just a load without a fence. An mfence flushes the store buffer,
    24203   // making the optimization clearly correct.
    24204   // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
    24205   // clear whether it is needed otherwise; we might be able to be more aggressive
    24206   // on relaxed idempotent rmw. In practice, such cases do not look useful, so we
    24207   // don't try to be especially clever.
   24208   if (SSID == SyncScope::SingleThread)
   24209     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
   24210     // the IR level, so we must wrap it in an intrinsic.
   24211     return nullptr;
   24212 
   24213   if (!Subtarget.hasMFence())
   24214     // FIXME: it might make sense to use a locked operation here but on a
   24215     // different cache-line to prevent cache-line bouncing. In practice it
   24216     // is probably a small win, and x86 processors without mfence are rare
   24217     // enough that we do not bother.
   24218     return nullptr;
   24219 
   24220   Function *MFence =
   24221       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
   24222   Builder.CreateCall(MFence, {});
   24223 
   24224   // Finally we can emit the atomic load.
   24225   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
   24226           AI->getType()->getPrimitiveSizeInBits());
   24227   Loaded->setAtomic(Order, SSID);
   24228   AI->replaceAllUsesWith(Loaded);
   24229   AI->eraseFromParent();
   24230   return Loaded;
   24231 }
   24232 
   24233 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
   24234                                  SelectionDAG &DAG) {
   24235   SDLoc dl(Op);
   24236   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
   24237     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
   24238   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
   24239     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
   24240 
   24241   // The only fence that needs an instruction is a sequentially-consistent
   24242   // cross-thread fence.
   24243   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
   24244       FenceSSID == SyncScope::System) {
   24245     if (Subtarget.hasMFence())
   24246       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
   24247 
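              // Without MFENCE, emit `lock or dword ptr [esp], 0`; a locked
              // read-modify-write of the stack slot acts as a full memory barrier.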
   24248     SDValue Chain = Op.getOperand(0);
   24249     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
   24250     SDValue Ops[] = {
   24251       DAG.getRegister(X86::ESP, MVT::i32),     // Base
   24252       DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
   24253       DAG.getRegister(0, MVT::i32),            // Index
   24254       DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
   24255       DAG.getRegister(0, MVT::i32),            // Segment.
   24256       Zero,
   24257       Chain
   24258     };
   24259     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
   24260     return SDValue(Res, 0);
   24261   }
   24262 
   24263   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
   24264   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
   24265 }
   24266 
   24267 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
   24268                              SelectionDAG &DAG) {
   24269   MVT T = Op.getSimpleValueType();
   24270   SDLoc DL(Op);
   24271   unsigned Reg = 0;
   24272   unsigned size = 0;
   24273   switch(T.SimpleTy) {
   24274   default: llvm_unreachable("Invalid value type!");
   24275   case MVT::i8:  Reg = X86::AL;  size = 1; break;
   24276   case MVT::i16: Reg = X86::AX;  size = 2; break;
   24277   case MVT::i32: Reg = X86::EAX; size = 4; break;
   24278   case MVT::i64:
   24279     assert(Subtarget.is64Bit() && "Node not type legal!");
   24280     Reg = X86::RAX; size = 8;
   24281     break;
   24282   }
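            // CMPXCHG implicitly uses rAX for the expected value and sets ZF on success,
            // so copy the comparand into Reg, emit the locked compare-and-swap, and read
            // the old value plus a SETE of EFLAGS back out below.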
   24283   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
   24284                                   Op.getOperand(2), SDValue());
   24285   SDValue Ops[] = { cpIn.getValue(0),
   24286                     Op.getOperand(1),
   24287                     Op.getOperand(3),
   24288                     DAG.getTargetConstant(size, DL, MVT::i8),
   24289                     cpIn.getValue(1) };
   24290   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   24291   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
   24292   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
   24293                                            Ops, T, MMO);
   24294 
   24295   SDValue cpOut =
   24296     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
   24297   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
   24298                                       MVT::i32, cpOut.getValue(2));
   24299   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
   24300 
   24301   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
   24302   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
   24303   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
   24304   return SDValue();
   24305 }
   24306 
   24307 // Create MOVMSKB, taking into account whether we need to split for AVX1.
   24308 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
   24309                            const X86Subtarget &Subtarget) {
   24310   MVT InVT = V.getSimpleValueType();
   24311 
   24312   if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
   24313     SDValue Lo, Hi;
   24314     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
   24315     Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
   24316     Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
   24317     Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
   24318                      DAG.getConstant(16, DL, MVT::i8));
   24319     return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
   24320   }
   24321 
   24322   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
   24323 }
   24324 
   24325 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
   24326                             SelectionDAG &DAG) {
   24327   SDValue Src = Op.getOperand(0);
   24328   MVT SrcVT = Src.getSimpleValueType();
   24329   MVT DstVT = Op.getSimpleValueType();
   24330 
   24331   // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
   24332   // half to v32i1 and concatenating the result.
   24333   if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
   24334     assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
   24335     assert(Subtarget.hasBWI() && "Expected BWI target");
   24336     SDLoc dl(Op);
   24337     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
   24338                              DAG.getIntPtrConstant(0, dl));
   24339     Lo = DAG.getBitcast(MVT::v32i1, Lo);
   24340     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
   24341                              DAG.getIntPtrConstant(1, dl));
   24342     Hi = DAG.getBitcast(MVT::v32i1, Hi);
   24343     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
   24344   }
   24345 
   24346   // Custom splitting for BWI types when AVX512F is available but BWI isn't.
   24347   if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
   24348     DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
   24349     SDLoc dl(Op);
   24350     SDValue Lo, Hi;
   24351     std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
   24352     EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
   24353                                   DstVT.getVectorNumElements() / 2);
   24354     Lo = DAG.getBitcast(CastVT, Lo);
   24355     Hi = DAG.getBitcast(CastVT, Hi);
   24356     return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
   24357   }
   24358 
   24359   // Use MOVMSK for vector to scalar conversion to prevent scalarization.
   24360   if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
   24361     assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
   24362     MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
   24363     SDLoc DL(Op);
   24364     SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
   24365     V = getPMOVMSKB(DL, V, DAG, Subtarget);
   24366     return DAG.getZExtOrTrunc(V, DL, DstVT);
   24367   }
   24368 
   24369   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
   24370       SrcVT == MVT::i64) {
   24371     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   24372     if (DstVT != MVT::f64)
   24373       // This conversion needs to be expanded.
   24374       return SDValue();
   24375 
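              // Collect the source elements into a 128-bit build_vector (padding the
              // upper half with undef), bitcast it to v2f64 and extract the low element.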
   24376     SmallVector<SDValue, 16> Elts;
   24377     SDLoc dl(Op);
   24378     unsigned NumElts;
   24379     MVT SVT;
   24380     if (SrcVT.isVector()) {
   24381       NumElts = SrcVT.getVectorNumElements();
   24382       SVT = SrcVT.getVectorElementType();
   24383 
    24384       // Widen the input vector in the case of MVT::v2i32.
   24385       // Example: from MVT::v2i32 to MVT::v4i32.
   24386       for (unsigned i = 0, e = NumElts; i != e; ++i)
   24387         Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
   24388                                    DAG.getIntPtrConstant(i, dl)));
   24389     } else {
   24390       assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
   24391              "Unexpected source type in LowerBITCAST");
   24392       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
   24393                                  DAG.getIntPtrConstant(0, dl)));
   24394       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
   24395                                  DAG.getIntPtrConstant(1, dl)));
   24396       NumElts = 2;
   24397       SVT = MVT::i32;
   24398     }
   24399     // Explicitly mark the extra elements as Undef.
   24400     Elts.append(NumElts, DAG.getUNDEF(SVT));
   24401 
   24402     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   24403     SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
   24404     SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
   24405     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
   24406                        DAG.getIntPtrConstant(0, dl));
   24407   }
   24408 
   24409   assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
   24410          Subtarget.hasMMX() && "Unexpected custom BITCAST");
   24411   assert((DstVT == MVT::i64 ||
   24412           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
   24413          "Unexpected custom BITCAST");
   24414   // i64 <=> MMX conversions are Legal.
   24415   if (SrcVT==MVT::i64 && DstVT.isVector())
   24416     return Op;
   24417   if (DstVT==MVT::i64 && SrcVT.isVector())
   24418     return Op;
   24419   // MMX <=> MMX conversions are Legal.
   24420   if (SrcVT.isVector() && DstVT.isVector())
   24421     return Op;
   24422   // All other conversions need to be expanded.
   24423   return SDValue();
   24424 }
   24425 
   24426 /// Compute the horizontal sum of bytes in V for the elements of VT.
   24427 ///
   24428 /// Requires V to be a byte vector and VT to be an integer vector type with
   24429 /// wider elements than V's type. The width of the elements of VT determines
   24430 /// how many bytes of V are summed horizontally to produce each element of the
   24431 /// result.
   24432 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
   24433                                       const X86Subtarget &Subtarget,
   24434                                       SelectionDAG &DAG) {
   24435   SDLoc DL(V);
   24436   MVT ByteVecVT = V.getSimpleValueType();
   24437   MVT EltVT = VT.getVectorElementType();
   24438   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
   24439          "Expected value to have byte element type.");
   24440   assert(EltVT != MVT::i8 &&
   24441          "Horizontal byte sum only makes sense for wider elements!");
   24442   unsigned VecSize = VT.getSizeInBits();
   24443   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
   24444 
    24445   // The PSADBW instruction horizontally adds all bytes and leaves the result in
    24446   // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
   24447   if (EltVT == MVT::i64) {
   24448     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
   24449     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
   24450     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
   24451     return DAG.getBitcast(VT, V);
   24452   }
   24453 
   24454   if (EltVT == MVT::i32) {
   24455     // We unpack the low half and high half into i32s interleaved with zeros so
   24456     // that we can use PSADBW to horizontally sum them. The most useful part of
   24457     // this is that it lines up the results of two PSADBW instructions to be
   24458     // two v2i64 vectors which concatenated are the 4 population counts. We can
   24459     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
   24460     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
   24461     SDValue V32 = DAG.getBitcast(VT, V);
   24462     SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
   24463     SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
   24464 
   24465     // Do the horizontal sums into two v2i64s.
   24466     Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
   24467     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
   24468     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
   24469                       DAG.getBitcast(ByteVecVT, Low), Zeros);
   24470     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
   24471                        DAG.getBitcast(ByteVecVT, High), Zeros);
   24472 
   24473     // Merge them together.
   24474     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
   24475     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
   24476                     DAG.getBitcast(ShortVecVT, Low),
   24477                     DAG.getBitcast(ShortVecVT, High));
   24478 
   24479     return DAG.getBitcast(VT, V);
   24480   }
   24481 
   24482   // The only element type left is i16.
   24483   assert(EltVT == MVT::i16 && "Unknown how to handle type");
   24484 
   24485   // To obtain pop count for each i16 element starting from the pop count for
   24486   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
   24487   // right by 8. It is important to shift as i16s as i8 vector shift isn't
   24488   // directly supported.
   24489   SDValue ShifterV = DAG.getConstant(8, DL, VT);
   24490   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
   24491   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
   24492                   DAG.getBitcast(ByteVecVT, V));
   24493   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
   24494 }
   24495 
   24496 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
   24497                                         const X86Subtarget &Subtarget,
   24498                                         SelectionDAG &DAG) {
   24499   MVT VT = Op.getSimpleValueType();
   24500   MVT EltVT = VT.getVectorElementType();
   24501   unsigned VecSize = VT.getSizeInBits();
   24502 
   24503   // Implement a lookup table in register by using an algorithm based on:
   24504   // http://wm.ite.pl/articles/sse-popcount.html
   24505   //
    24506   // The general idea is that every nibble of every byte in the input vector is
    24507   // an index into an in-register pre-computed pop count table. We then split up
    24508   // the input vector into two new ones: (1) a vector with only the shifted-right
    24509   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
    24510   // masked out higher ones) for each byte. PSHUFB is used separately with both
    24511   // to index the in-register table. Next, both are added and the result is an
    24512   // i8 vector where each element contains the pop count for its input byte.
   24513   //
   24514   // To obtain the pop count for elements != i8, we follow up with the same
   24515   // approach and use additional tricks as described below.
   24516   //
   24517   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
   24518                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
   24519                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
   24520                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
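            // Each table entry is the population count of its 4-bit index, e.g.
            // LUT[0xb] = 3 since 0b1011 has three bits set.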
   24521 
   24522   int NumByteElts = VecSize / 8;
   24523   MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
   24524   SDValue In = DAG.getBitcast(ByteVecVT, Op);
   24525   SmallVector<SDValue, 64> LUTVec;
   24526   for (int i = 0; i < NumByteElts; ++i)
   24527     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
   24528   SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
   24529   SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
   24530 
   24531   // High nibbles
   24532   SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
   24533   SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
   24534 
   24535   // Low nibbles
   24536   SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
   24537 
    24538   // The nibble vectors are used as the shuffle masks that index elements in the
    24539   // LUT. After counting low and high nibbles, add the two results to obtain the
    24540   // final pop count per i8 element.
   24541   SDValue HighPopCnt =
   24542       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
   24543   SDValue LowPopCnt =
   24544       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
   24545   SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
   24546 
   24547   if (EltVT == MVT::i8)
   24548     return PopCnt;
   24549 
   24550   return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
   24551 }
   24552 
   24553 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
   24554                                        const X86Subtarget &Subtarget,
   24555                                        SelectionDAG &DAG) {
   24556   MVT VT = Op.getSimpleValueType();
   24557   assert(VT.is128BitVector() &&
   24558          "Only 128-bit vector bitmath lowering supported.");
   24559 
   24560   int VecSize = VT.getSizeInBits();
   24561   MVT EltVT = VT.getVectorElementType();
   24562   int Len = EltVT.getSizeInBits();
   24563 
   24564   // This is the vectorized version of the "best" algorithm from
   24565   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   24566   // with a minor tweak to use a series of adds + shifts instead of vector
   24567   // multiplications. Implemented for all integer vector types. We only use
   24568   // this when we don't have SSSE3 which allows a LUT-based lowering that is
   24569   // much faster, even faster than using native popcnt instructions.
   24570 
   24571   auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
   24572     MVT VT = V.getSimpleValueType();
   24573     SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
   24574     return DAG.getNode(OpCode, DL, VT, V, ShifterV);
   24575   };
   24576   auto GetMask = [&](SDValue V, APInt Mask) {
   24577     MVT VT = V.getSimpleValueType();
   24578     SDValue MaskV = DAG.getConstant(Mask, DL, VT);
   24579     return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
   24580   };
   24581 
   24582   // We don't want to incur the implicit masks required to SRL vNi8 vectors on
   24583   // x86, so set the SRL type to have elements at least i16 wide. This is
    24584   // correct because all of our SRLs are followed immediately by a mask anyway
   24585   // that handles any bits that sneak into the high bits of the byte elements.
   24586   MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
   24587 
   24588   SDValue V = Op;
   24589 
   24590   // v = v - ((v >> 1) & 0x55555555...)
   24591   SDValue Srl =
   24592       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
   24593   SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
   24594   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
   24595 
   24596   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
   24597   SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
   24598   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
   24599   SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
   24600   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
   24601 
   24602   // v = (v + (v >> 4)) & 0x0F0F0F0F...
   24603   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
   24604   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
   24605   V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
   24606 
   24607   // At this point, V contains the byte-wise population count, and we are
   24608   // merely doing a horizontal sum if necessary to get the wider element
   24609   // counts.
   24610   if (EltVT == MVT::i8)
   24611     return V;
   24612 
   24613   return LowerHorizontalByteSum(
   24614       DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
   24615       DAG);
   24616 }
   24617 
   24618 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
   24619 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
   24620 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
   24621                                 SelectionDAG &DAG) {
   24622   MVT VT = Op.getSimpleValueType();
   24623   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
   24624          "Unknown CTPOP type to handle");
   24625   SDLoc DL(Op.getNode());
   24626   SDValue Op0 = Op.getOperand(0);
   24627 
   24628   // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
   24629   if (Subtarget.hasVPOPCNTDQ()) {
   24630     unsigned NumElems = VT.getVectorNumElements();
   24631     assert((VT.getVectorElementType() == MVT::i8 ||
   24632             VT.getVectorElementType() == MVT::i16) && "Unexpected type");
   24633     if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
   24634       MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
   24635       Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
   24636       Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
   24637       return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
   24638     }
   24639   }
   24640 
   24641   if (!Subtarget.hasSSSE3()) {
   24642     // We can't use the fast LUT approach, so fall back on vectorized bitmath.
   24643     assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
   24644     return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
   24645   }
   24646 
   24647   // Decompose 256-bit ops into smaller 128-bit ops.
   24648   if (VT.is256BitVector() && !Subtarget.hasInt256())
   24649     return Lower256IntUnary(Op, DAG);
   24650 
   24651   // Decompose 512-bit ops into smaller 256-bit ops.
   24652   if (VT.is512BitVector() && !Subtarget.hasBWI())
   24653     return Lower512IntUnary(Op, DAG);
   24654 
   24655   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
   24656 }
   24657 
   24658 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
   24659                           SelectionDAG &DAG) {
   24660   assert(Op.getSimpleValueType().isVector() &&
   24661          "We only do custom lowering for vector population count.");
   24662   return LowerVectorCTPOP(Op, Subtarget, DAG);
   24663 }
   24664 
   24665 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
   24666   MVT VT = Op.getSimpleValueType();
   24667   SDValue In = Op.getOperand(0);
   24668   SDLoc DL(Op);
   24669 
    24670   // For scalars, it's still beneficial to transfer to/from the SIMD unit to
   24671   // perform the BITREVERSE.
   24672   if (!VT.isVector()) {
   24673     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
   24674     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
   24675     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
   24676     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
   24677                        DAG.getIntPtrConstant(0, DL));
   24678   }
   24679 
   24680   int NumElts = VT.getVectorNumElements();
   24681   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
   24682 
   24683   // Decompose 256-bit ops into smaller 128-bit ops.
   24684   if (VT.is256BitVector())
   24685     return Lower256IntUnary(Op, DAG);
   24686 
   24687   assert(VT.is128BitVector() &&
   24688          "Only 128-bit vector bitreverse lowering supported.");
   24689 
   24690   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
   24691   // perform the BSWAP in the shuffle.
    24692   // It's best to shuffle using the second operand as this will implicitly allow
   24693   // memory folding for multiple vectors.
   24694   SmallVector<SDValue, 16> MaskElts;
   24695   for (int i = 0; i != NumElts; ++i) {
   24696     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
   24697       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
   24698       int PermuteByte = SourceByte | (2 << 5);
   24699       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
   24700     }
   24701   }
   24702 
   24703   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
   24704   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
   24705   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
   24706                     Res, Mask);
   24707   return DAG.getBitcast(VT, Res);
   24708 }
   24709 
   24710 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   24711                                SelectionDAG &DAG) {
   24712   MVT VT = Op.getSimpleValueType();
   24713 
   24714   if (Subtarget.hasXOP() && !VT.is512BitVector())
   24715     return LowerBITREVERSE_XOP(Op, DAG);
   24716 
   24717   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
   24718 
   24719   SDValue In = Op.getOperand(0);
   24720   SDLoc DL(Op);
   24721 
   24722   unsigned NumElts = VT.getVectorNumElements();
   24723   assert(VT.getScalarType() == MVT::i8 &&
   24724          "Only byte vector BITREVERSE supported");
   24725 
   24726   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
   24727   if (VT.is256BitVector() && !Subtarget.hasInt256())
   24728     return Lower256IntUnary(Op, DAG);
   24729 
   24730   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
   24731   // two nibbles and a PSHUFB lookup to find the bitreverse of each
   24732   // 0-15 value (moved to the other nibble).
   24733   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
   24734   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
   24735   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
   24736 
   24737   const int LoLUT[16] = {
   24738       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
   24739       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
   24740       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
   24741       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
   24742   const int HiLUT[16] = {
   24743       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
   24744       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
   24745       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
   24746       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
   24747 
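            // LoLUT maps each low nibble to its bit-reverse placed in the high nibble,
            // and HiLUT maps each (shifted-down) high nibble to its bit-reverse in the
            // low nibble, so OR-ing the two PSHUFB results gives the reversed byte.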
   24748   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
   24749   for (unsigned i = 0; i < NumElts; ++i) {
   24750     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
   24751     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
   24752   }
   24753 
   24754   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
   24755   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
   24756   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
   24757   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
   24758   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
   24759 }
   24760 
   24761 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
   24762                                         const X86Subtarget &Subtarget,
   24763                                         bool AllowIncDec = true) {
   24764   unsigned NewOpc = 0;
   24765   switch (N->getOpcode()) {
   24766   case ISD::ATOMIC_LOAD_ADD:
   24767     NewOpc = X86ISD::LADD;
   24768     break;
   24769   case ISD::ATOMIC_LOAD_SUB:
   24770     NewOpc = X86ISD::LSUB;
   24771     break;
   24772   case ISD::ATOMIC_LOAD_OR:
   24773     NewOpc = X86ISD::LOR;
   24774     break;
   24775   case ISD::ATOMIC_LOAD_XOR:
   24776     NewOpc = X86ISD::LXOR;
   24777     break;
   24778   case ISD::ATOMIC_LOAD_AND:
   24779     NewOpc = X86ISD::LAND;
   24780     break;
   24781   default:
   24782     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
   24783   }
   24784 
   24785   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
   24786 
   24787   if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
   24788     // Convert to inc/dec if they aren't slow or we are optimizing for size.
   24789     if (AllowIncDec && (!Subtarget.slowIncDec() ||
   24790                         DAG.getMachineFunction().getFunction().optForSize())) {
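                // An add of +1 or a subtract of -1 becomes LINC; a subtract of +1 or an
                // add of -1 becomes LDEC.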
   24791       if ((NewOpc == X86ISD::LADD && C->isOne()) ||
   24792           (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
   24793         return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
   24794                                        DAG.getVTList(MVT::i32, MVT::Other),
   24795                                        {N->getOperand(0), N->getOperand(1)},
   24796                                        /*MemVT=*/N->getSimpleValueType(0), MMO);
   24797       if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
   24798           (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
   24799         return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
   24800                                        DAG.getVTList(MVT::i32, MVT::Other),
   24801                                        {N->getOperand(0), N->getOperand(1)},
   24802                                        /*MemVT=*/N->getSimpleValueType(0), MMO);
   24803     }
   24804   }
   24805 
   24806   return DAG.getMemIntrinsicNode(
   24807       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
   24808       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
   24809       /*MemVT=*/N->getSimpleValueType(0), MMO);
   24810 }
   24811 
   24812 /// Lower atomic_load_ops into LOCK-prefixed operations.
   24813 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
   24814                                 const X86Subtarget &Subtarget) {
   24815   SDValue Chain = N->getOperand(0);
   24816   SDValue LHS = N->getOperand(1);
   24817   SDValue RHS = N->getOperand(2);
   24818   unsigned Opc = N->getOpcode();
   24819   MVT VT = N->getSimpleValueType(0);
   24820   SDLoc DL(N);
   24821 
   24822   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
   24823   // can only be lowered when the result is unused.  They should have already
   24824   // been transformed into a cmpxchg loop in AtomicExpand.
   24825   if (N->hasAnyUseOfValue(0)) {
   24826     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
   24827     // select LXADD if LOCK_SUB can't be selected.
   24828     if (Opc == ISD::ATOMIC_LOAD_SUB) {
   24829       AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
   24830       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
   24831       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
   24832                            RHS, AN->getMemOperand());
   24833     }
   24834     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
   24835            "Used AtomicRMW ops other than Add should have been expanded!");
   24836     return N;
   24837   }
   24838 
   24839   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   24840   // RAUW the chain, but don't worry about the result, as it's unused.
   24841   assert(!N->hasAnyUseOfValue(0));
   24842   DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
   24843   return SDValue();
   24844 }
   24845 
   24846 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
   24847   SDNode *Node = Op.getNode();
   24848   SDLoc dl(Node);
   24849   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
   24850 
   24851   // Convert seq_cst store -> xchg
   24852   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
   24853   // FIXME: On 32-bit, store -> fist or movq would be more efficient
   24854   //        (The only way to get a 16-byte store is cmpxchg16b)
   24855   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
   24856   if (cast<AtomicSDNode>(Node)->getOrdering() ==
   24857           AtomicOrdering::SequentiallyConsistent ||
   24858       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
   24859     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
   24860                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
   24861                                  Node->getOperand(0),
   24862                                  Node->getOperand(1), Node->getOperand(2),
   24863                                  cast<AtomicSDNode>(Node)->getMemOperand());
   24864     return Swap.getValue(1);
   24865   }
   24866   // Other atomic stores have a simple pattern.
   24867   return Op;
   24868 }
   24869 
   24870 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
   24871   SDNode *N = Op.getNode();
   24872   MVT VT = N->getSimpleValueType(0);
   24873 
   24874   // Let legalize expand this if it isn't a legal type yet.
   24875   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   24876     return SDValue();
   24877 
   24878   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   24879   SDLoc DL(N);
   24880 
   24881   // Set the carry flag.
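            // Adding all-ones to the incoming carry value sets CF exactly when that
            // value is nonzero; the ADC/SBB below then consumes the resulting flag.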
   24882   SDValue Carry = Op.getOperand(2);
   24883   EVT CarryVT = Carry.getValueType();
   24884   APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
   24885   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
   24886                       Carry, DAG.getConstant(NegOne, DL, CarryVT));
   24887 
   24888   unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
   24889   SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
   24890                             Op.getOperand(1), Carry.getValue(1));
   24891 
   24892   SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
   24893   if (N->getValueType(1) == MVT::i1)
   24894     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
   24895 
   24896   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
   24897 }
   24898 
   24899 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
   24900                             SelectionDAG &DAG) {
   24901   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
   24902 
   24903   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
   24904   // which returns the values as { float, float } (in XMM0) or
   24905   // { double, double } (which is returned in XMM0, XMM1).
   24906   SDLoc dl(Op);
   24907   SDValue Arg = Op.getOperand(0);
   24908   EVT ArgVT = Arg.getValueType();
   24909   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
   24910 
   24911   TargetLowering::ArgListTy Args;
   24912   TargetLowering::ArgListEntry Entry;
   24913 
   24914   Entry.Node = Arg;
   24915   Entry.Ty = ArgTy;
   24916   Entry.IsSExt = false;
   24917   Entry.IsZExt = false;
   24918   Args.push_back(Entry);
   24919 
   24920   bool isF64 = ArgVT == MVT::f64;
   24921   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
   24922   // the small struct {f32, f32} is returned in (eax, edx). For f64,
   24923   // the results are returned via SRet in memory.
   24924   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   24925   RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
   24926   const char *LibcallName = TLI.getLibcallName(LC);
   24927   SDValue Callee =
   24928       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
   24929 
   24930   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
   24931                       : (Type *)VectorType::get(ArgTy, 4);
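            // For f32 the two results come back packed in xmm0, hence the 4-element
            // vector return type and the lane 0/1 extracts below; for f64 the
            // { double, double } struct is returned in xmm0/xmm1 directly.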
   24932 
   24933   TargetLowering::CallLoweringInfo CLI(DAG);
   24934   CLI.setDebugLoc(dl)
   24935       .setChain(DAG.getEntryNode())
   24936       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
   24937 
   24938   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
   24939 
   24940   if (isF64)
   24941     // Returned in xmm0 and xmm1.
   24942     return CallResult.first;
   24943 
    24944   // Returned in bits 0:31 and 32:63 of xmm0.
   24945   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   24946                                CallResult.first, DAG.getIntPtrConstant(0, dl));
   24947   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
   24948                                CallResult.first, DAG.getIntPtrConstant(1, dl));
   24949   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
   24950   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
   24951 }
   24952 
   24953 /// Widen a vector input to a vector of NVT.  The
   24954 /// input vector must have the same element type as NVT.
   24955 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
   24956                             bool FillWithZeroes = false) {
   24957   // Check if InOp already has the right width.
   24958   MVT InVT = InOp.getSimpleValueType();
   24959   if (InVT == NVT)
   24960     return InOp;
   24961 
   24962   if (InOp.isUndef())
   24963     return DAG.getUNDEF(NVT);
   24964 
   24965   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
   24966          "input and widen element type must match");
   24967 
   24968   unsigned InNumElts = InVT.getVectorNumElements();
   24969   unsigned WidenNumElts = NVT.getVectorNumElements();
   24970   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
   24971          "Unexpected request for vector widening");
   24972 
   24973   SDLoc dl(InOp);
   24974   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
   24975       InOp.getNumOperands() == 2) {
   24976     SDValue N1 = InOp.getOperand(1);
   24977     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
   24978         N1.isUndef()) {
   24979       InOp = InOp.getOperand(0);
   24980       InVT = InOp.getSimpleValueType();
   24981       InNumElts = InVT.getVectorNumElements();
   24982     }
   24983   }
   24984   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
   24985       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
   24986     SmallVector<SDValue, 16> Ops;
   24987     for (unsigned i = 0; i < InNumElts; ++i)
   24988       Ops.push_back(InOp.getOperand(i));
   24989 
   24990     EVT EltVT = InOp.getOperand(0).getValueType();
   24991 
   24992     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
   24993       DAG.getUNDEF(EltVT);
   24994     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
   24995       Ops.push_back(FillVal);
   24996     return DAG.getBuildVector(NVT, dl, Ops);
   24997   }
   24998   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
   24999     DAG.getUNDEF(NVT);
   25000   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
   25001                      InOp, DAG.getIntPtrConstant(0, dl));
   25002 }
   25003 
   25004 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
   25005                              SelectionDAG &DAG) {
   25006   assert(Subtarget.hasAVX512() &&
   25007          "MGATHER/MSCATTER are supported on AVX-512 arch only");
   25008 
   25009   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
   25010   SDValue Src = N->getValue();
   25011   MVT VT = Src.getSimpleValueType();
   25012   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
   25013   SDLoc dl(Op);
   25014 
   25015   SDValue Scale = N->getScale();
   25016   SDValue Index = N->getIndex();
   25017   SDValue Mask = N->getMask();
   25018   SDValue Chain = N->getChain();
   25019   SDValue BasePtr = N->getBasePtr();
   25020 
   25021   if (VT == MVT::v2f32) {
   25022     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
   25023     // If the index is v2i64 and we have VLX we can use xmm for data and index.
   25024     if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
   25025       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
   25026                         DAG.getUNDEF(MVT::v2f32));
   25027       SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
   25028       SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
   25029       SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
   25030           VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
   25031       DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
   25032       return SDValue(NewScatter.getNode(), 1);
   25033     }
   25034     return SDValue();
   25035   }
   25036 
   25037   if (VT == MVT::v2i32) {
   25038     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
   25039     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
   25040                       DAG.getUNDEF(MVT::v2i32));
   25041     // If the index is v2i64 and we have VLX we can use xmm for data and index.
   25042     if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
   25043       SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
   25044       SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
   25045       SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
   25046           VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
   25047       DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
   25048       return SDValue(NewScatter.getNode(), 1);
   25049     }
   25050     // Custom widen all the operands to avoid promotion.
   25051     EVT NewIndexVT = EVT::getVectorVT(
   25052         *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
   25053     Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
   25054                         DAG.getUNDEF(Index.getValueType()));
   25055     Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
   25056                        DAG.getConstant(0, dl, MVT::v2i1));
   25057     SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
   25058     return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
   25059                                 Ops, N->getMemOperand());
   25060   }
   25061 
   25062   MVT IndexVT = Index.getSimpleValueType();
   25063   MVT MaskVT = Mask.getSimpleValueType();
   25064 
   25065   // If the index is v2i32, we're being called by type legalization and we
   25066   // should just let the default handling take care of it.
   25067   if (IndexVT == MVT::v2i32)
   25068     return SDValue();
   25069 
    25070   // If we don't have VLX and neither the data nor the index is 512 bits wide,
    25071   // we need to widen until one is.
   25072   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
   25073       !Index.getSimpleValueType().is512BitVector()) {
   25074     // Determine how much we need to widen by to get a 512-bit type.
   25075     unsigned Factor = std::min(512/VT.getSizeInBits(),
   25076                                512/IndexVT.getSizeInBits());
   25077     unsigned NumElts = VT.getVectorNumElements() * Factor;
   25078 
   25079     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
   25080     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
   25081     MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
   25082 
   25083     Src = ExtendToType(Src, VT, DAG);
   25084     Index = ExtendToType(Index, IndexVT, DAG);
   25085     Mask = ExtendToType(Mask, MaskVT, DAG, true);
   25086   }
   25087 
   25088   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   25089   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
   25090   SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
   25091       VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
   25092   DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
   25093   return SDValue(NewScatter.getNode(), 1);
   25094 }
   25095 
   25096 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
   25097                           SelectionDAG &DAG) {
   25098 
   25099   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
   25100   MVT VT = Op.getSimpleValueType();
   25101   MVT ScalarVT = VT.getScalarType();
   25102   SDValue Mask = N->getMask();
   25103   SDLoc dl(Op);
   25104 
   25105   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
   25106          "Expanding masked load is supported on AVX-512 target only!");
   25107 
   25108   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
   25109          "Expanding masked load is supported for 32 and 64-bit types only!");
   25110 
   25111   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
   25112          "Cannot lower masked load op.");
   25113 
   25114   assert((ScalarVT.getSizeInBits() >= 32 ||
   25115           (Subtarget.hasBWI() &&
   25116               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
   25117          "Unsupported masked load op.");
   25118 
   25119   // This operation is legal for targets with VLX, but without
    25120   // VLX the vector should be widened to 512 bits.
   25121   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
   25122   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
   25123   SDValue Src0 = N->getSrc0();
   25124   Src0 = ExtendToType(Src0, WideDataVT, DAG);
   25125 
   25126   // Mask element has to be i1.
   25127   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
   25128          "Unexpected mask type");
   25129 
   25130   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
   25131 
   25132   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
   25133   SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
   25134                                       N->getBasePtr(), Mask, Src0,
   25135                                       N->getMemoryVT(), N->getMemOperand(),
   25136                                       N->getExtensionType(),
   25137                                       N->isExpandingLoad());
   25138 
   25139   SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
   25140                                NewLoad.getValue(0),
   25141                                DAG.getIntPtrConstant(0, dl));
   25142   SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
   25143   return DAG.getMergeValues(RetOps, dl);
   25144 }
   25145 
   25146 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
   25147                            SelectionDAG &DAG) {
   25148   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
   25149   SDValue DataToStore = N->getValue();
   25150   MVT VT = DataToStore.getSimpleValueType();
   25151   MVT ScalarVT = VT.getScalarType();
   25152   SDValue Mask = N->getMask();
   25153   SDLoc dl(Op);
   25154 
   25155   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
   25156          "Expanding masked load is supported on AVX-512 target only!");
   25157 
   25158   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
   25159          "Expanding masked load is supported for 32 and 64-bit types only!");
   25160 
   25161   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
   25162          "Cannot lower masked store op.");
   25163 
   25164   assert((ScalarVT.getSizeInBits() >= 32 ||
   25165           (Subtarget.hasBWI() &&
   25166               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
   25167           "Unsupported masked store op.");
   25168 
   25169   // This operation is legal for targets with VLX, but without
    25170   // VLX the vector should be widened to 512 bits.
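            // For example (illustrative), a v4f64 compressing store would be widened
            // here to a v8f64 store with a v8i1 mask before being emitted below.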
   25171   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
   25172   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
   25173 
   25174   // Mask element has to be i1.
   25175   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
   25176          "Unexpected mask type");
   25177 
   25178   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
   25179 
   25180   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
   25181   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
   25182   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
   25183                             Mask, N->getMemoryVT(), N->getMemOperand(),
   25184                             N->isTruncatingStore(), N->isCompressingStore());
   25185 }
   25186 
   25187 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
   25188                             SelectionDAG &DAG) {
   25189   assert(Subtarget.hasAVX2() &&
   25190          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
   25191 
   25192   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
   25193   SDLoc dl(Op);
   25194   MVT VT = Op.getSimpleValueType();
   25195   SDValue Index = N->getIndex();
   25196   SDValue Mask = N->getMask();
   25197   SDValue Src0 = N->getValue();
   25198   MVT IndexVT = Index.getSimpleValueType();
   25199   MVT MaskVT = Mask.getSimpleValueType();
   25200 
   25201   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
   25202 
   25203   // If the index is v2i32, we're being called by type legalization.
   25204   if (IndexVT == MVT::v2i32)
   25205     return SDValue();
   25206 
    25207   // If we don't have VLX and neither the passthru nor the index is 512 bits,
    25208   // we need to widen until one of them is.
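            // For example (illustrative), gathering v8f32 with a v8i64 index needs no
            // widening (the index is already 512 bits), whereas v4f32 with a v4i32
            // index is widened by a factor of 4 to v16f32/v16i32 with a v16i1 mask.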
   25209   MVT OrigVT = VT;
   25210   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
   25211       !IndexVT.is512BitVector()) {
   25212     // Determine how much we need to widen by to get a 512-bit type.
   25213     unsigned Factor = std::min(512/VT.getSizeInBits(),
   25214                                512/IndexVT.getSizeInBits());
   25215 
   25216     unsigned NumElts = VT.getVectorNumElements() * Factor;
   25217 
   25218     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
   25219     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
   25220     MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
   25221 
   25222     Src0 = ExtendToType(Src0, VT, DAG);
   25223     Index = ExtendToType(Index, IndexVT, DAG);
   25224     Mask = ExtendToType(Mask, MaskVT, DAG, true);
   25225   }
   25226 
   25227   SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
   25228                     N->getScale() };
   25229   SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
   25230       DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
   25231       N->getMemOperand());
   25232   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
   25233                                 NewGather, DAG.getIntPtrConstant(0, dl));
   25234   return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
   25235 }
   25236 
   25237 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
   25238                                                     SelectionDAG &DAG) const {
   25239   // TODO: Eventually, the lowering of these nodes should be informed by or
   25240   // deferred to the GC strategy for the function in which they appear. For
   25241   // now, however, they must be lowered to something. Since they are logically
   25242   // no-ops in the case of a null GC strategy (or a GC strategy which does not
   25243   // require special handling for these nodes), lower them as literal NOOPs for
   25244   // the time being.
   25245   SmallVector<SDValue, 2> Ops;
   25246 
   25247   Ops.push_back(Op.getOperand(0));
   25248   if (Op->getGluedNode())
   25249     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
   25250 
   25251   SDLoc OpDL(Op);
   25252   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
   25253   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
   25254 
   25255   return NOOP;
   25256 }
   25257 
   25258 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
   25259                                                   SelectionDAG &DAG) const {
   25260   // TODO: Eventually, the lowering of these nodes should be informed by or
   25261   // deferred to the GC strategy for the function in which they appear. For
   25262   // now, however, they must be lowered to something. Since they are logically
   25263   // no-ops in the case of a null GC strategy (or a GC strategy which does not
   25264   // require special handling for these nodes), lower them as literal NOOPs for
   25265   // the time being.
   25266   SmallVector<SDValue, 2> Ops;
   25267 
   25268   Ops.push_back(Op.getOperand(0));
   25269   if (Op->getGluedNode())
   25270     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
   25271 
   25272   SDLoc OpDL(Op);
   25273   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
   25274   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
   25275 
   25276   return NOOP;
   25277 }
   25278 
   25279 /// Provide custom lowering hooks for some operations.
   25280 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   25281   switch (Op.getOpcode()) {
   25282   default: llvm_unreachable("Should not custom lower this!");
   25283   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
   25284   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
   25285     return LowerCMP_SWAP(Op, Subtarget, DAG);
   25286   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
   25287   case ISD::ATOMIC_LOAD_ADD:
   25288   case ISD::ATOMIC_LOAD_SUB:
   25289   case ISD::ATOMIC_LOAD_OR:
   25290   case ISD::ATOMIC_LOAD_XOR:
   25291   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
   25292   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
   25293   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
   25294   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   25295   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
   25296   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
   25297   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   25298   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   25299   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   25300   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
   25301   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
   25302   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
   25303   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
   25304   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
   25305   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   25306   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
   25307   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   25308   case ISD::SHL_PARTS:
   25309   case ISD::SRA_PARTS:
   25310   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   25311   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   25312   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   25313   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
   25314   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
   25315   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
   25316   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
   25317   case ISD::ZERO_EXTEND_VECTOR_INREG:
   25318   case ISD::SIGN_EXTEND_VECTOR_INREG:
   25319     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
   25320   case ISD::FP_TO_SINT:
   25321   case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
   25322   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
   25323   case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
   25324   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
   25325   case ISD::FABS:
   25326   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
   25327   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   25328   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   25329   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   25330   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
   25331   case ISD::SELECT:             return LowerSELECT(Op, DAG);
   25332   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
   25333   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
   25334   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   25335   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   25336   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   25337   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   25338   case ISD::INTRINSIC_VOID:
   25339   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   25340   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   25341   case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
   25342   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   25343   case ISD::FRAME_TO_ARGS_OFFSET:
   25344                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
   25345   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   25346   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
   25347   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
   25348   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
   25349   case ISD::EH_SJLJ_SETUP_DISPATCH:
   25350     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
   25351   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   25352   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   25353   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
   25354   case ISD::CTLZ:
   25355   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
   25356   case ISD::CTTZ:
   25357   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
   25358   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   25359   case ISD::MULHS:
   25360   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
   25361   case ISD::UMUL_LOHI:
   25362   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
   25363   case ISD::ROTL:
   25364   case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
   25365   case ISD::SRA:
   25366   case ISD::SRL:
   25367   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
   25368   case ISD::SADDO:
   25369   case ISD::UADDO:
   25370   case ISD::SSUBO:
   25371   case ISD::USUBO:
   25372   case ISD::SMULO:
   25373   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   25374   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
   25375   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
   25376   case ISD::ADDCARRY:
   25377   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
   25378   case ISD::ADD:
   25379   case ISD::SUB:                return LowerADD_SUB(Op, DAG);
   25380   case ISD::SMAX:
   25381   case ISD::SMIN:
   25382   case ISD::UMAX:
   25383   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
   25384   case ISD::ABS:                return LowerABS(Op, DAG);
   25385   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   25386   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
   25387   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
   25388   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
   25389   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
   25390   case ISD::GC_TRANSITION_START:
   25391                                 return LowerGC_TRANSITION_START(Op, DAG);
   25392   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
   25393   }
   25394 }
   25395 
   25396 /// Places new result values for the node in Results (their number
   25397 /// and types must exactly match those of the original return values of
   25398 /// the node), or leaves Results empty, which indicates that the node is not
   25399 /// to be custom lowered after all.
   25400 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
   25401                                               SmallVectorImpl<SDValue> &Results,
   25402                                               SelectionDAG &DAG) const {
   25403   SDValue Res = LowerOperation(SDValue(N, 0), DAG);
   25404 
   25405   if (!Res.getNode())
   25406     return;
   25407 
   25408   assert((N->getNumValues() <= Res->getNumValues()) &&
   25409       "Lowering returned the wrong number of results!");
   25410 
    25411   // Place the new result values based on the result number of N.
    25412   // In some cases (LowerSINT_TO_FP, for example) Res has more result values
    25413   // than the original node, so the chain (the last value) should be dropped.
   25414   for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
   25415     Results.push_back(Res.getValue(I));
   25416 }
   25417 
   25418 /// Replace a node with an illegal result type with a new node built out of
   25419 /// custom code.
   25420 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   25421                                            SmallVectorImpl<SDValue>&Results,
   25422                                            SelectionDAG &DAG) const {
   25423   SDLoc dl(N);
   25424   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   25425   switch (N->getOpcode()) {
   25426   default:
   25427     llvm_unreachable("Do not know how to custom type legalize this operation!");
   25428   case X86ISD::AVG: {
   25429     // Legalize types for X86ISD::AVG by expanding vectors.
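              // For example (illustrative), a v4i8 AVG is concatenated with undef
              // operands up to v16i8, averaged at that width, and the low v4i8 lanes
              // are extracted back out unless the result is being widened anyway.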
   25430     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   25431 
   25432     auto InVT = N->getValueType(0);
   25433     assert(InVT.getSizeInBits() < 128);
   25434     assert(128 % InVT.getSizeInBits() == 0);
   25435     unsigned NumConcat = 128 / InVT.getSizeInBits();
   25436 
   25437     EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
   25438                                  InVT.getVectorElementType(),
   25439                                  NumConcat * InVT.getVectorNumElements());
   25440 
   25441     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
   25442     Ops[0] = N->getOperand(0);
   25443     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
   25444     Ops[0] = N->getOperand(1);
   25445     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
   25446 
   25447     SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
   25448     if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
   25449       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
   25450                         DAG.getIntPtrConstant(0, dl));
   25451     Results.push_back(Res);
   25452     return;
   25453   }
   25454   case ISD::SETCC: {
    25455     // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when the
    25456     // setcc result type is v2i1, because type legalization would otherwise end
    25457     // up with a v4i1 setcc plus an extend.
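              // The v2f32 operands are concatenated with undef up to v4f32, compared
              // as a v4i32 setcc, and the low v2i32 lanes are extracted if the result
              // is not being widened anyway.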
   25458     assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
   25459     if (N->getOperand(0).getValueType() != MVT::v2f32)
   25460       return;
   25461     SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
   25462     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   25463                               N->getOperand(0), UNDEF);
   25464     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   25465                               N->getOperand(1), UNDEF);
   25466     SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
   25467                               N->getOperand(2));
   25468     if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
   25469       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
   25470                         DAG.getIntPtrConstant(0, dl));
   25471     Results.push_back(Res);
   25472     return;
   25473   }
   25474   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   25475   case X86ISD::FMINC:
   25476   case X86ISD::FMIN:
   25477   case X86ISD::FMAXC:
   25478   case X86ISD::FMAX: {
   25479     EVT VT = N->getValueType(0);
   25480     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
   25481     SDValue UNDEF = DAG.getUNDEF(VT);
   25482     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   25483                               N->getOperand(0), UNDEF);
   25484     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   25485                               N->getOperand(1), UNDEF);
   25486     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
   25487     return;
   25488   }
   25489   case ISD::SDIV:
   25490   case ISD::UDIV:
   25491   case ISD::SREM:
   25492   case ISD::UREM:
   25493   case ISD::SDIVREM:
   25494   case ISD::UDIVREM: {
   25495     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
   25496     Results.push_back(V);
   25497     return;
   25498   }
   25499   case ISD::FP_TO_SINT:
   25500   case ISD::FP_TO_UINT: {
   25501     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
   25502     EVT VT = N->getValueType(0);
   25503     SDValue Src = N->getOperand(0);
   25504     EVT SrcVT = Src.getValueType();
   25505 
   25506     if (VT == MVT::v2i32) {
   25507       assert((IsSigned || Subtarget.hasAVX512()) &&
   25508              "Can only handle signed conversion without AVX512");
   25509       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   25510       if (Src.getValueType() == MVT::v2f64) {
   25511         MVT ResVT = MVT::v4i32;
   25512         unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
   25513         if (!IsSigned && !Subtarget.hasVLX()) {
   25514           // Widen to 512-bits.
   25515           ResVT = MVT::v8i32;
   25516           Opc = ISD::FP_TO_UINT;
   25517           Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
   25518                             DAG.getUNDEF(MVT::v8f64),
   25519                             Src, DAG.getIntPtrConstant(0, dl));
   25520         }
   25521         SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
   25522         bool WidenType = getTypeAction(*DAG.getContext(),
   25523                                        MVT::v2i32) == TypeWidenVector;
   25524         ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
   25525         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
   25526                           DAG.getIntPtrConstant(0, dl));
   25527         Results.push_back(Res);
   25528         return;
   25529       }
   25530       if (SrcVT == MVT::v2f32) {
   25531         SDValue Idx = DAG.getIntPtrConstant(0, dl);
   25532         SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
   25533                                   DAG.getUNDEF(MVT::v2f32));
   25534         Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
   25535                                    : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
   25536         if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
   25537           Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
   25538         Results.push_back(Res);
   25539         return;
   25540       }
   25541 
   25542       // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
   25543       // so early out here.
   25544       return;
   25545     }
   25546 
   25547     if (Subtarget.hasDQI() && VT == MVT::i64 &&
   25548         (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
   25549       assert(!Subtarget.is64Bit() && "i64 should be legal");
   25550       unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
   25551       // Using a 256-bit input here to guarantee 128-bit input for f32 case.
   25552       // TODO: Use 128-bit vectors for f64 case?
   25553       // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
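                // For example (illustrative), without VLX an f64 -> i64 fptosi is done
                // by inserting the scalar into lane 0 of a zero v8f64, converting the
                // whole vector to v8i64, and extracting lane 0 of the result.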
   25554       MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
   25555       MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
   25556 
   25557       SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
   25558       SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
   25559                                 DAG.getConstantFP(0.0, dl, VecInVT), Src,
   25560                                 ZeroIdx);
   25561       Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
   25562       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
   25563       Results.push_back(Res);
   25564       return;
   25565     }
   25566 
   25567     std::pair<SDValue,SDValue> Vals =
   25568         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
   25569     SDValue FIST = Vals.first, StackSlot = Vals.second;
   25570     if (FIST.getNode()) {
   25571       // Return a load from the stack slot.
   25572       if (StackSlot.getNode())
   25573         Results.push_back(
   25574             DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
   25575       else
   25576         Results.push_back(FIST);
   25577     }
   25578     return;
   25579   }
   25580   case ISD::SINT_TO_FP: {
   25581     assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
   25582     SDValue Src = N->getOperand(0);
   25583     if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
   25584       return;
   25585     Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
   25586     return;
   25587   }
   25588   case ISD::UINT_TO_FP: {
   25589     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   25590     EVT VT = N->getValueType(0);
   25591     if (VT != MVT::v2f32)
   25592       return;
   25593     SDValue Src = N->getOperand(0);
   25594     EVT SrcVT = Src.getValueType();
   25595     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
   25596       Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
   25597       return;
   25598     }
   25599     if (SrcVT != MVT::v2i32)
   25600       return;
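              // Convert the v2i32 input to v2f64 using the classic bias trick:
              // 0x4330000000000000 is the bit pattern of 2^52, so OR-ing a
              // zero-extended 32-bit value x into its low mantissa bits produces the
              // double 2^52 + x exactly, and subtracting 2^52 recovers x as an exact
              // double. The result is then narrowed to f32 by VFPROUND.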
   25601     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
   25602     SDValue VBias =
   25603         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
   25604     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
   25605                              DAG.getBitcast(MVT::v2i64, VBias));
   25606     Or = DAG.getBitcast(MVT::v2f64, Or);
   25607     // TODO: Are there any fast-math-flags to propagate here?
   25608     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
   25609     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
   25610     return;
   25611   }
   25612   case ISD::FP_ROUND: {
   25613     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
    25614       return;
   25615     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
   25616     Results.push_back(V);
   25617     return;
   25618   }
   25619   case ISD::FP_EXTEND: {
   25620     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
   25621     // No other ValueType for FP_EXTEND should reach this point.
   25622     assert(N->getValueType(0) == MVT::v2f32 &&
   25623            "Do not know how to legalize this Node");
   25624     return;
   25625   }
   25626   case ISD::INTRINSIC_W_CHAIN: {
   25627     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   25628     switch (IntNo) {
   25629     default : llvm_unreachable("Do not know how to custom type "
   25630                                "legalize this intrinsic operation!");
   25631     case Intrinsic::x86_rdtsc:
   25632       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   25633                                      Results);
   25634     case Intrinsic::x86_rdtscp:
   25635       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
   25636                                      Results);
   25637     case Intrinsic::x86_rdpmc:
   25638       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
   25639 
   25640     case Intrinsic::x86_xgetbv:
   25641       return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
   25642     }
   25643   }
   25644   case ISD::INTRINSIC_WO_CHAIN: {
   25645     if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
   25646       Results.push_back(V);
   25647     return;
   25648   }
   25649   case ISD::READCYCLECOUNTER: {
   25650     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
   25651                                    Results);
   25652   }
   25653   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
   25654     EVT T = N->getValueType(0);
   25655     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
   25656     bool Regs64bit = T == MVT::i128;
   25657     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
   25658     SDValue cpInL, cpInH;
   25659     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   25660                         DAG.getConstant(0, dl, HalfT));
   25661     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
   25662                         DAG.getConstant(1, dl, HalfT));
   25663     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
   25664                              Regs64bit ? X86::RAX : X86::EAX,
   25665                              cpInL, SDValue());
   25666     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
   25667                              Regs64bit ? X86::RDX : X86::EDX,
   25668                              cpInH, cpInL.getValue(1));
   25669     SDValue swapInL, swapInH;
   25670     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   25671                           DAG.getConstant(0, dl, HalfT));
   25672     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
   25673                           DAG.getConstant(1, dl, HalfT));
   25674     swapInH =
   25675         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
   25676                          swapInH, cpInH.getValue(1));
    25677     // If the current function needs the base pointer, RBX, we
    25678     // shouldn't use cmpxchg directly. The lowering of that
    25679     // instruction will clobber RBX, and since RBX will then be a
    25680     // reserved register, the register allocator will not make sure
    25681     // its value is properly saved and restored around this
    25682     // live range.
   25683     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
   25684     SDValue Result;
   25685     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   25686     unsigned BasePtr = TRI->getBaseRegister();
   25687     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
   25688     if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
   25689         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
    25690       // ISel prefers the LCMPXCHG64 variant.
    25691       // If the assert below breaks, that is no longer the case, and we
    25692       // need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just
    25693       // EBX. This is a matter of accepting an i64 input for that pseudo
    25694       // and restoring into a register of the right width when expanding
    25695       // the pseudo. Everything else should just work.
   25696       assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
   25697              "Saving only half of the RBX");
   25698       unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
   25699                                   : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
   25700       SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
   25701                                            Regs64bit ? X86::RBX : X86::EBX,
   25702                                            HalfT, swapInH.getValue(1));
   25703       SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
   25704                        RBXSave,
   25705                        /*Glue*/ RBXSave.getValue(2)};
   25706       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
   25707     } else {
   25708       unsigned Opcode =
   25709           Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
   25710       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
   25711                                  Regs64bit ? X86::RBX : X86::EBX, swapInL,
   25712                                  swapInH.getValue(1));
   25713       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
   25714                        swapInL.getValue(1)};
   25715       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
   25716     }
   25717     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
   25718                                         Regs64bit ? X86::RAX : X86::EAX,
   25719                                         HalfT, Result.getValue(1));
   25720     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
   25721                                         Regs64bit ? X86::RDX : X86::EDX,
   25722                                         HalfT, cpOutL.getValue(2));
   25723     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
   25724 
   25725     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
   25726                                         MVT::i32, cpOutH.getValue(2));
   25727     SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
   25728     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
   25729 
   25730     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
   25731     Results.push_back(Success);
   25732     Results.push_back(EFLAGS.getValue(1));
   25733     return;
   25734   }
   25735   case ISD::ATOMIC_SWAP:
   25736   case ISD::ATOMIC_LOAD_ADD:
   25737   case ISD::ATOMIC_LOAD_SUB:
   25738   case ISD::ATOMIC_LOAD_AND:
   25739   case ISD::ATOMIC_LOAD_OR:
   25740   case ISD::ATOMIC_LOAD_XOR:
   25741   case ISD::ATOMIC_LOAD_NAND:
   25742   case ISD::ATOMIC_LOAD_MIN:
   25743   case ISD::ATOMIC_LOAD_MAX:
   25744   case ISD::ATOMIC_LOAD_UMIN:
   25745   case ISD::ATOMIC_LOAD_UMAX:
   25746   case ISD::ATOMIC_LOAD: {
   25747     // Delegate to generic TypeLegalization. Situations we can really handle
   25748     // should have already been dealt with by AtomicExpandPass.cpp.
   25749     break;
   25750   }
   25751   case ISD::BITCAST: {
   25752     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   25753     EVT DstVT = N->getValueType(0);
   25754     EVT SrcVT = N->getOperand(0).getValueType();
   25755 
    25756     // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit
    25757     // target, we can split using the k-register rather than going through memory.
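              // (Illustrative) The value is split into two v32i1 halves, each half is
              // bitcast to i32, and the two i32s are paired back into an i64.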
   25758     if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
   25759       assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
   25760       SDValue Lo, Hi;
   25761       std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
   25762       Lo = DAG.getBitcast(MVT::i32, Lo);
   25763       Hi = DAG.getBitcast(MVT::i32, Hi);
   25764       SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
   25765       Results.push_back(Res);
   25766       return;
   25767     }
   25768 
   25769     // Custom splitting for BWI types when AVX512F is available but BWI isn't.
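              // For example (illustrative), a bitcast from a legal v8i64 to v32i16
              // without BWI is split into two v4i64 -> v16i16 bitcasts whose results
              // are concatenated back to v32i16.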
   25770     if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
   25771         SrcVT.isVector() && isTypeLegal(SrcVT)) {
   25772       SDValue Lo, Hi;
   25773       std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
   25774       MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
   25775       Lo = DAG.getBitcast(CastVT, Lo);
   25776       Hi = DAG.getBitcast(CastVT, Hi);
   25777       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
   25778       Results.push_back(Res);
   25779       return;
   25780     }
   25781 
   25782     if (SrcVT != MVT::f64 ||
   25783         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
   25784       return;
   25785 
   25786     unsigned NumElts = DstVT.getVectorNumElements();
   25787     EVT SVT = DstVT.getVectorElementType();
   25788     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
   25789     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
   25790                                    MVT::v2f64, N->getOperand(0));
   25791     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
   25792 
   25793     if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
   25794       // If we are legalizing vectors by widening, we already have the desired
   25795       // legal vector type, just return it.
   25796       Results.push_back(ToVecInt);
   25797       return;
   25798     }
   25799 
   25800     SmallVector<SDValue, 8> Elts;
   25801     for (unsigned i = 0, e = NumElts; i != e; ++i)
   25802       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
   25803                                    ToVecInt, DAG.getIntPtrConstant(i, dl)));
   25804 
   25805     Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
   25806     return;
   25807   }
   25808   case ISD::MGATHER: {
   25809     EVT VT = N->getValueType(0);
   25810     if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
   25811       auto *Gather = cast<MaskedGatherSDNode>(N);
   25812       SDValue Index = Gather->getIndex();
   25813       if (Index.getValueType() != MVT::v2i64)
   25814         return;
   25815       SDValue Mask = Gather->getMask();
   25816       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
   25817       SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
   25818                                  Gather->getValue(),
   25819                                  DAG.getUNDEF(MVT::v2f32));
   25820       if (!Subtarget.hasVLX()) {
   25821         // We need to widen the mask, but the instruction will only use 2
   25822         // of its elements. So we can use undef.
   25823         Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
   25824                            DAG.getUNDEF(MVT::v2i1));
   25825         Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
   25826       }
   25827       SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
   25828                         Index, Gather->getScale() };
   25829       SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
   25830         DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
   25831         Gather->getMemoryVT(), Gather->getMemOperand());
   25832       Results.push_back(Res);
   25833       Results.push_back(Res.getValue(2));
   25834       return;
   25835     }
   25836     if (VT == MVT::v2i32) {
   25837       auto *Gather = cast<MaskedGatherSDNode>(N);
   25838       SDValue Index = Gather->getIndex();
   25839       SDValue Mask = Gather->getMask();
   25840       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
   25841       SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
   25842                                  Gather->getValue(),
   25843                                  DAG.getUNDEF(MVT::v2i32));
   25844       // If the index is v2i64 we can use it directly.
   25845       if (Index.getValueType() == MVT::v2i64 &&
   25846           (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
   25847         if (!Subtarget.hasVLX()) {
   25848           // We need to widen the mask, but the instruction will only use 2
   25849           // of its elements. So we can use undef.
   25850           Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
   25851                              DAG.getUNDEF(MVT::v2i1));
   25852           Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
   25853         }
   25854         SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
   25855                           Index, Gather->getScale() };
   25856         SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
   25857           DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
   25858           Gather->getMemoryVT(), Gather->getMemOperand());
   25859         SDValue Chain = Res.getValue(2);
   25860         if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
   25861           Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
   25862                             DAG.getIntPtrConstant(0, dl));
   25863         Results.push_back(Res);
   25864         Results.push_back(Chain);
   25865         return;
   25866       }
   25867       EVT IndexVT = Index.getValueType();
   25868       EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
   25869                                         IndexVT.getScalarType(), 4);
   25870       // Otherwise we need to custom widen everything to avoid promotion.
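                // The index is widened with undef elements and the mask is padded with
                // zeros so the extra lanes stay inactive; the low v2i32 lanes of the
                // resulting v4i32 gather are extracted below if we are not widening anyway.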
   25871       Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
   25872                           DAG.getUNDEF(IndexVT));
   25873       Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
   25874                          DAG.getConstant(0, dl, MVT::v2i1));
   25875       SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
   25876                         Index, Gather->getScale() };
   25877       SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
   25878                                         Gather->getMemoryVT(), dl, Ops,
   25879                                         Gather->getMemOperand());
   25880       SDValue Chain = Res.getValue(1);
   25881       if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
   25882         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
   25883                           DAG.getIntPtrConstant(0, dl));
   25884       Results.push_back(Res);
   25885       Results.push_back(Chain);
   25886       return;
   25887     }
   25888     break;
   25889   }
   25890   }
   25891 }
   25892 
   25893 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   25894   switch ((X86ISD::NodeType)Opcode) {
   25895   case X86ISD::FIRST_NUMBER:       break;
   25896   case X86ISD::BSF:                return "X86ISD::BSF";
   25897   case X86ISD::BSR:                return "X86ISD::BSR";
   25898   case X86ISD::SHLD:               return "X86ISD::SHLD";
   25899   case X86ISD::SHRD:               return "X86ISD::SHRD";
   25900   case X86ISD::FAND:               return "X86ISD::FAND";
   25901   case X86ISD::FANDN:              return "X86ISD::FANDN";
   25902   case X86ISD::FOR:                return "X86ISD::FOR";
   25903   case X86ISD::FXOR:               return "X86ISD::FXOR";
   25904   case X86ISD::FILD:               return "X86ISD::FILD";
   25905   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
   25906   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
   25907   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
   25908   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
   25909   case X86ISD::FLD:                return "X86ISD::FLD";
   25910   case X86ISD::FST:                return "X86ISD::FST";
   25911   case X86ISD::CALL:               return "X86ISD::CALL";
   25912   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
   25913   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
   25914   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
   25915   case X86ISD::BT:                 return "X86ISD::BT";
   25916   case X86ISD::CMP:                return "X86ISD::CMP";
   25917   case X86ISD::COMI:               return "X86ISD::COMI";
   25918   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   25919   case X86ISD::CMPM:               return "X86ISD::CMPM";
   25920   case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
   25921   case X86ISD::SETCC:              return "X86ISD::SETCC";
   25922   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   25923   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
   25924   case X86ISD::FSETCCM:            return "X86ISD::FSETCCM";
   25925   case X86ISD::FSETCCM_RND:        return "X86ISD::FSETCCM_RND";
   25926   case X86ISD::CMOV:               return "X86ISD::CMOV";
   25927   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   25928   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
   25929   case X86ISD::IRET:               return "X86ISD::IRET";
   25930   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
   25931   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
   25932   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
   25933   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
   25934   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
   25935   case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
   25936   case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
   25937   case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
   25938   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
   25939   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
   25940   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
   25941   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   25942   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
   25943   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   25944   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   25945   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
   25946   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
   25947   case X86ISD::ADDUS:              return "X86ISD::ADDUS";
   25948   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   25949   case X86ISD::HADD:               return "X86ISD::HADD";
   25950   case X86ISD::HSUB:               return "X86ISD::HSUB";
   25951   case X86ISD::FHADD:              return "X86ISD::FHADD";
   25952   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
   25953   case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
   25954   case X86ISD::FMAX:               return "X86ISD::FMAX";
   25955   case X86ISD::FMAXS:              return "X86ISD::FMAXS";
   25956   case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
    25957   case X86ISD::FMAXS_RND:          return "X86ISD::FMAXS_RND";
   25958   case X86ISD::FMIN:               return "X86ISD::FMIN";
   25959   case X86ISD::FMINS:              return "X86ISD::FMINS";
   25960   case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
   25961   case X86ISD::FMINS_RND:          return "X86ISD::FMINS_RND";
   25962   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
   25963   case X86ISD::FMINC:              return "X86ISD::FMINC";
   25964   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
   25965   case X86ISD::FRCP:               return "X86ISD::FRCP";
   25966   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
   25967   case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
   25968   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   25969   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
   25970   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
   25971   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
   25972   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
   25973   case X86ISD::EH_SJLJ_SETUP_DISPATCH:
   25974     return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
   25975   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
   25976   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   25977   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
   25978   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
   25979   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
   25980   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
   25981   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
   25982   case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
   25983     return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
   25984   case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
   25985     return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
   25986   case X86ISD::LADD:               return "X86ISD::LADD";
   25987   case X86ISD::LSUB:               return "X86ISD::LSUB";
   25988   case X86ISD::LOR:                return "X86ISD::LOR";
   25989   case X86ISD::LXOR:               return "X86ISD::LXOR";
   25990   case X86ISD::LAND:               return "X86ISD::LAND";
   25991   case X86ISD::LINC:               return "X86ISD::LINC";
   25992   case X86ISD::LDEC:               return "X86ISD::LDEC";
   25993   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   25994   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   25995   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
   25996   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
   25997   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
   25998   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
   25999   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
   26000   case X86ISD::VTRUNCSTORES:       return "X86ISD::VTRUNCSTORES";
   26001   case X86ISD::VTRUNCSTOREUS:      return "X86ISD::VTRUNCSTOREUS";
   26002   case X86ISD::VMTRUNCSTORES:      return "X86ISD::VMTRUNCSTORES";
   26003   case X86ISD::VMTRUNCSTOREUS:     return "X86ISD::VMTRUNCSTOREUS";
   26004   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   26005   case X86ISD::VFPEXT_RND:         return "X86ISD::VFPEXT_RND";
   26006   case X86ISD::VFPEXTS_RND:        return "X86ISD::VFPEXTS_RND";
   26007   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   26008   case X86ISD::VFPROUND_RND:       return "X86ISD::VFPROUND_RND";
   26009   case X86ISD::VFPROUNDS_RND:      return "X86ISD::VFPROUNDS_RND";
   26010   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
   26011   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
   26012   case X86ISD::VSHL:               return "X86ISD::VSHL";
   26013   case X86ISD::VSRL:               return "X86ISD::VSRL";
   26014   case X86ISD::VSRA:               return "X86ISD::VSRA";
   26015   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
   26016   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
   26017   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
   26018   case X86ISD::VSRAV:              return "X86ISD::VSRAV";
   26019   case X86ISD::VROTLI:             return "X86ISD::VROTLI";
   26020   case X86ISD::VROTRI:             return "X86ISD::VROTRI";
   26021   case X86ISD::VPPERM:             return "X86ISD::VPPERM";
   26022   case X86ISD::CMPP:               return "X86ISD::CMPP";
   26023   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   26024   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
   26025   case X86ISD::PHMINPOS:           return "X86ISD::PHMINPOS";
   26026   case X86ISD::ADD:                return "X86ISD::ADD";
   26027   case X86ISD::SUB:                return "X86ISD::SUB";
   26028   case X86ISD::ADC:                return "X86ISD::ADC";
   26029   case X86ISD::SBB:                return "X86ISD::SBB";
   26030   case X86ISD::SMUL:               return "X86ISD::SMUL";
   26031   case X86ISD::UMUL:               return "X86ISD::UMUL";
   26032   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
   26033   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
   26034   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
   26035   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
   26036   case X86ISD::INC:                return "X86ISD::INC";
   26037   case X86ISD::DEC:                return "X86ISD::DEC";
   26038   case X86ISD::OR:                 return "X86ISD::OR";
   26039   case X86ISD::XOR:                return "X86ISD::XOR";
   26040   case X86ISD::AND:                return "X86ISD::AND";
   26041   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   26042   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   26043   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
   26044   case X86ISD::PTEST:              return "X86ISD::PTEST";
   26045   case X86ISD::TESTP:              return "X86ISD::TESTP";
   26046   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
   26047   case X86ISD::KTEST:              return "X86ISD::KTEST";
   26048   case X86ISD::KADD:               return "X86ISD::KADD";
   26049   case X86ISD::KSHIFTL:            return "X86ISD::KSHIFTL";
   26050   case X86ISD::KSHIFTR:            return "X86ISD::KSHIFTR";
   26051   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
   26052   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
   26053   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
   26054   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
   26055   case X86ISD::VSHLD:              return "X86ISD::VSHLD";
   26056   case X86ISD::VSHRD:              return "X86ISD::VSHRD";
   26057   case X86ISD::VSHLDV:             return "X86ISD::VSHLDV";
   26058   case X86ISD::VSHRDV:             return "X86ISD::VSHRDV";
   26059   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   26060   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
   26061   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
   26062   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
   26063   case X86ISD::SHUF128:            return "X86ISD::SHUF128";
   26064   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
   26065   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
   26066   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
   26067   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
   26068   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
   26069   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
   26070   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
   26071   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
   26072   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   26073   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
   26074   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
   26075   case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
   26076   case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
   26077   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
   26078   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   26079   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
   26080   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
   26081   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   26082   case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
   26083   case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
   26084   case X86ISD::VFIXUPIMMS:         return "X86ISD::VFIXUPIMMS";
   26085   case X86ISD::VRANGE:             return "X86ISD::VRANGE";
   26086   case X86ISD::VRANGE_RND:         return "X86ISD::VRANGE_RND";
   26087   case X86ISD::VRANGES:            return "X86ISD::VRANGES";
   26088   case X86ISD::VRANGES_RND:        return "X86ISD::VRANGES_RND";
   26089   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   26090   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
   26091   case X86ISD::PSADBW:             return "X86ISD::PSADBW";
   26092   case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
   26093   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   26094   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   26095   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
   26096   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
   26097   case X86ISD::MFENCE:             return "X86ISD::MFENCE";
   26098   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
   26099   case X86ISD::SAHF:               return "X86ISD::SAHF";
   26100   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
   26101   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
   26102   case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
   26103   case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
   26104   case X86ISD::VPSHA:              return "X86ISD::VPSHA";
   26105   case X86ISD::VPSHL:              return "X86ISD::VPSHL";
   26106   case X86ISD::VPCOM:              return "X86ISD::VPCOM";
   26107   case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
   26108   case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
   26109   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
   26110   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
   26111   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
   26112   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
   26113   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
   26114   case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
   26115   case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
   26116   case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
   26117   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
   26118   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
   26119   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
   26120   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
   26121   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
   26122   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
   26123   case X86ISD::VRNDSCALE_RND:      return "X86ISD::VRNDSCALE_RND";
   26124   case X86ISD::VRNDSCALES:         return "X86ISD::VRNDSCALES";
   26125   case X86ISD::VRNDSCALES_RND:     return "X86ISD::VRNDSCALES_RND";
   26126   case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
   26127   case X86ISD::VREDUCE_RND:        return "X86ISD::VREDUCE_RND";
   26128   case X86ISD::VREDUCES:           return "X86ISD::VREDUCES";
   26129   case X86ISD::VREDUCES_RND:       return "X86ISD::VREDUCES_RND";
   26130   case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
   26131   case X86ISD::VGETMANT_RND:       return "X86ISD::VGETMANT_RND";
   26132   case X86ISD::VGETMANTS:          return "X86ISD::VGETMANTS";
   26133   case X86ISD::VGETMANTS_RND:      return "X86ISD::VGETMANTS_RND";
   26134   case X86ISD::PCMPESTR:           return "X86ISD::PCMPESTR";
   26135   case X86ISD::PCMPISTR:           return "X86ISD::PCMPISTR";
   26136   case X86ISD::XTEST:              return "X86ISD::XTEST";
   26137   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
   26138   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
   26139   case X86ISD::SELECT:             return "X86ISD::SELECT";
   26140   case X86ISD::SELECTS:            return "X86ISD::SELECTS";
   26141   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
   26142   case X86ISD::RCP14:              return "X86ISD::RCP14";
   26143   case X86ISD::RCP14S:             return "X86ISD::RCP14S";
   26144   case X86ISD::RCP28:              return "X86ISD::RCP28";
   26145   case X86ISD::RCP28S:             return "X86ISD::RCP28S";
   26146   case X86ISD::EXP2:               return "X86ISD::EXP2";
   26147   case X86ISD::RSQRT14:            return "X86ISD::RSQRT14";
   26148   case X86ISD::RSQRT14S:           return "X86ISD::RSQRT14S";
   26149   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
   26150   case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
   26151   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
   26152   case X86ISD::FADDS_RND:          return "X86ISD::FADDS_RND";
   26153   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
   26154   case X86ISD::FSUBS_RND:          return "X86ISD::FSUBS_RND";
   26155   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
   26156   case X86ISD::FMULS_RND:          return "X86ISD::FMULS_RND";
   26157   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
   26158   case X86ISD::FDIVS_RND:          return "X86ISD::FDIVS_RND";
   26159   case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
   26160   case X86ISD::FSQRTS_RND:         return "X86ISD::FSQRTS_RND";
   26161   case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
   26162   case X86ISD::FGETEXPS_RND:       return "X86ISD::FGETEXPS_RND";
   26163   case X86ISD::SCALEF:             return "X86ISD::SCALEF";
   26164   case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
   26165   case X86ISD::ADDS:               return "X86ISD::ADDS";
   26166   case X86ISD::SUBS:               return "X86ISD::SUBS";
   26167   case X86ISD::AVG:                return "X86ISD::AVG";
   26168   case X86ISD::MULHRS:             return "X86ISD::MULHRS";
   26169   case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
   26170   case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
   26171   case X86ISD::CVTTP2SI:           return "X86ISD::CVTTP2SI";
   26172   case X86ISD::CVTTP2UI:           return "X86ISD::CVTTP2UI";
   26173   case X86ISD::CVTTP2SI_RND:       return "X86ISD::CVTTP2SI_RND";
   26174   case X86ISD::CVTTP2UI_RND:       return "X86ISD::CVTTP2UI_RND";
   26175   case X86ISD::CVTTS2SI_RND:       return "X86ISD::CVTTS2SI_RND";
   26176   case X86ISD::CVTTS2UI_RND:       return "X86ISD::CVTTS2UI_RND";
   26177   case X86ISD::CVTSI2P:            return "X86ISD::CVTSI2P";
   26178   case X86ISD::CVTUI2P:            return "X86ISD::CVTUI2P";
   26179   case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
   26180   case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
   26181   case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
   26182   case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
   26183   case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
   26184   case X86ISD::CVTPS2PH:           return "X86ISD::CVTPS2PH";
   26185   case X86ISD::CVTPH2PS:           return "X86ISD::CVTPH2PS";
   26186   case X86ISD::CVTPH2PS_RND:       return "X86ISD::CVTPH2PS_RND";
   26187   case X86ISD::CVTP2SI:            return "X86ISD::CVTP2SI";
   26188   case X86ISD::CVTP2UI:            return "X86ISD::CVTP2UI";
   26189   case X86ISD::CVTP2SI_RND:        return "X86ISD::CVTP2SI_RND";
   26190   case X86ISD::CVTP2UI_RND:        return "X86ISD::CVTP2UI_RND";
   26191   case X86ISD::CVTS2SI_RND:        return "X86ISD::CVTS2SI_RND";
   26192   case X86ISD::CVTS2UI_RND:        return "X86ISD::CVTS2UI_RND";
   26193   case X86ISD::LWPINS:             return "X86ISD::LWPINS";
   26194   case X86ISD::MGATHER:            return "X86ISD::MGATHER";
   26195   case X86ISD::MSCATTER:           return "X86ISD::MSCATTER";
   26196   case X86ISD::VPDPBUSD:           return "X86ISD::VPDPBUSD";
   26197   case X86ISD::VPDPBUSDS:          return "X86ISD::VPDPBUSDS";
   26198   case X86ISD::VPDPWSSD:           return "X86ISD::VPDPWSSD";
   26199   case X86ISD::VPDPWSSDS:          return "X86ISD::VPDPWSSDS";
   26200   case X86ISD::VPSHUFBITQMB:       return "X86ISD::VPSHUFBITQMB";
   26201   case X86ISD::GF2P8MULB:          return "X86ISD::GF2P8MULB";
   26202   case X86ISD::GF2P8AFFINEQB:      return "X86ISD::GF2P8AFFINEQB";
   26203   case X86ISD::GF2P8AFFINEINVQB:   return "X86ISD::GF2P8AFFINEINVQB";
   26204   case X86ISD::NT_CALL:            return "X86ISD::NT_CALL";
   26205   case X86ISD::NT_BRIND:           return "X86ISD::NT_BRIND";
   26206   case X86ISD::UMWAIT:             return "X86ISD::UMWAIT";
   26207   case X86ISD::TPAUSE:             return "X86ISD::TPAUSE";
   26208   }
   26209   return nullptr;
   26210 }
   26211 
   26212 /// Return true if the addressing mode represented by AM is legal for this
   26213 /// target, for a load/store of the specified type.
   26214 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
   26215                                               const AddrMode &AM, Type *Ty,
   26216                                               unsigned AS,
   26217                                               Instruction *I) const {
   26218   // X86 supports extremely general addressing modes.
   26219   CodeModel::Model M = getTargetMachine().getCodeModel();
   26220 
   26221   // X86 allows a sign-extended 32-bit immediate field as a displacement.
   26222   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
   26223     return false;
   26224 
   26225   if (AM.BaseGV) {
   26226     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
   26227 
   26228     // If a reference to this global requires an extra load, we can't fold it.
   26229     if (isGlobalStubReference(GVFlags))
   26230       return false;
   26231 
   26232     // If BaseGV requires a register for the PIC base, we cannot also have a
   26233     // BaseReg specified.
   26234     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
   26235       return false;
   26236 
   26237     // If lower 4G is not available, then we must use rip-relative addressing.
   26238     if ((M != CodeModel::Small || isPositionIndependent()) &&
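              // RIP-relative addressing ([rip + disp32]) cannot be combined with an
              // index register, and its 32-bit displacement field is already needed
              // for the symbol reference.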
   26239         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
   26240       return false;
   26241   }
   26242 
   26243   switch (AM.Scale) {
   26244   case 0:
   26245   case 1:
   26246   case 2:
   26247   case 4:
   26248   case 8:
   26249     // These scales always work.
   26250     break;
   26251   case 3:
   26252   case 5:
   26253   case 9:
   26254     // These scales are formed with basereg+scalereg.  Only accept if there is
   26255     // no basereg yet.
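            // For example, reg*3 is encoded as reg + reg*2, i.e. the register serves
            // as both the base and the scaled index, so no other base can be present.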
   26256     if (AM.HasBaseReg)
   26257       return false;
   26258     break;
   26259   default:  // Other stuff never works.
   26260     return false;
   26261   }
   26262 
   26263   return true;
   26264 }
   26265 
   26266 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
   26267   unsigned Bits = Ty->getScalarSizeInBits();
   26268 
   26269   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
   26270   // particularly cheaper than those without.
   26271   if (Bits == 8)
   26272     return false;
   26273 
   26274   // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
   26275   if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
   26276       (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
   26277     return false;
   26278 
   26279   // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
   26280   // shifts just as cheap as scalar ones.
   26281   if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
   26282     return false;
   26283 
   26284   // AVX512BW has shifts such as vpsllvw.
   26285   if (Subtarget.hasBWI() && Bits == 16)
   26286       return false;
    26287     return false;
   26288   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
   26289   // fully general vector.
   26290   return true;
   26291 }
   26292 
   26293 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   26294   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   26295     return false;
   26296   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
   26297   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
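            // Truncating a wider integer type to a narrower one is free on x86: the
            // narrower value is simply the low subregister of the wider one
            // (e.g. an i64 -> i32 truncate just reads EAX instead of RAX).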
   26298   return NumBits1 > NumBits2;
   26299 }
   26300 
   26301 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
   26302   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
   26303     return false;
   26304 
   26305   if (!isTypeLegal(EVT::getEVT(Ty1)))
   26306     return false;
   26307 
   26308   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
   26309 
   26310   // Assuming the caller doesn't have a zeroext or signext return parameter,
   26311   // truncation all the way down to i1 is valid.
   26312   return true;
   26313 }
   26314 
   26315 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   26316   return isInt<32>(Imm);
   26317 }
   26318 
   26319 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
   26320   // Can also use sub to handle negated immediates.
   26321   return isInt<32>(Imm);
   26322 }
   26323 
   26324 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   26325   if (!VT1.isInteger() || !VT2.isInteger())
   26326     return false;
   26327   unsigned NumBits1 = VT1.getSizeInBits();
   26328   unsigned NumBits2 = VT2.getSizeInBits();
   26329   return NumBits1 > NumBits2;
   26330 }
   26331 
   26332 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   26333   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
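            // For example, 'movl %ecx, %eax' also clears bits 63:32 of %rax, so the
            // i32 -> i64 zext costs no extra instruction.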
   26334   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
   26335 }
   26336 
   26337 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
   26338   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
   26339   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
   26340 }
   26341 
   26342 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   26343   EVT VT1 = Val.getValueType();
   26344   if (isZExtFree(VT1, VT2))
   26345     return true;
   26346 
   26347   if (Val.getOpcode() != ISD::LOAD)
   26348     return false;
   26349 
   26350   if (!VT1.isSimple() || !VT1.isInteger() ||
   26351       !VT2.isSimple() || !VT2.isInteger())
   26352     return false;
   26353 
   26354   switch (VT1.getSimpleVT().SimpleTy) {
   26355   default: break;
   26356   case MVT::i8:
   26357   case MVT::i16:
   26358   case MVT::i32:
   26359     // X86 has 8, 16, and 32-bit zero-extending loads.
   26360     return true;
   26361   }
   26362 
   26363   return false;
   26364 }
   26365 
   26366 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   26367   EVT SrcVT = ExtVal.getOperand(0).getValueType();
   26368 
   26369   // There is no extending load for vXi1.
   26370   if (SrcVT.getScalarType() == MVT::i1)
   26371     return false;
   26372 
   26373   return true;
   26374 }
   26375 
   26376 bool
   26377 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   26378   if (!Subtarget.hasAnyFMA())
   26379     return false;
   26380 
   26381   VT = VT.getScalarType();
   26382 
   26383   if (!VT.isSimple())
   26384     return false;
   26385 
   26386   switch (VT.getSimpleVT().SimpleTy) {
   26387   case MVT::f32:
   26388   case MVT::f64:
   26389     return true;
   26390   default:
   26391     break;
   26392   }
   26393 
   26394   return false;
   26395 }
   26396 
   26397 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
   26398   // i16 instructions are longer (0x66 prefix) and potentially slower.
   26399   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
   26400 }
   26401 
   26402 /// Targets can use this to indicate that they only support *some*
   26403 /// VECTOR_SHUFFLE operations, those with specific masks.
   26404 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
   26405 /// are assumed to be legal.
   26406 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   26407   if (!VT.isSimple())
   26408     return false;
   26409 
   26410   // Not for i1 vectors
   26411   if (VT.getSimpleVT().getScalarType() == MVT::i1)
   26412     return false;
   26413 
   26414   // Very little shuffling can be done for 64-bit vectors right now.
   26415   if (VT.getSimpleVT().getSizeInBits() == 64)
   26416     return false;
   26417 
   26418   // We only care that the types being shuffled are legal. The lowering can
   26419   // handle any possible shuffle mask that results.
   26420   return isTypeLegal(VT.getSimpleVT());
   26421 }
   26422 
   26423 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
   26424                                                EVT VT) const {
   26425   // Don't convert an 'and' into a shuffle that we don't directly support.
   26426   // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
   26427   if (!Subtarget.hasAVX2())
   26428     if (VT == MVT::v32i8 || VT == MVT::v16i16)
   26429       return false;
   26430 
   26431   // Just delegate to the generic legality, clear masks aren't special.
    26432   // Just delegate to the generic legality; clear masks aren't special.
   26433 }
   26434 
   26435 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
    26436   // If the subtarget is using retpolines, we must not generate jump tables.
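            // Jump tables are lowered to indirect branches, and with retpoline every
            // indirect branch must go through a thunk, so compare/branch trees are
            // preferable.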
   26437   if (Subtarget.useRetpoline())
   26438     return false;
   26439 
   26440   // Otherwise, fallback on the generic logic.
   26441   return TargetLowering::areJTsAllowed(Fn);
   26442 }
   26443 
   26444 //===----------------------------------------------------------------------===//
   26445 //                           X86 Scheduler Hooks
   26446 //===----------------------------------------------------------------------===//
   26447 
   26448 /// Utility function to emit xbegin specifying the start of an RTM region.
   26449 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
   26450                                      const TargetInstrInfo *TII) {
   26451   DebugLoc DL = MI.getDebugLoc();
   26452 
   26453   const BasicBlock *BB = MBB->getBasicBlock();
   26454   MachineFunction::iterator I = ++MBB->getIterator();
   26455 
   26456   // For the v = xbegin(), we generate
   26457   //
   26458   // thisMBB:
    26459   //  xbegin fallMBB
   26460   //
   26461   // mainMBB:
   26462   //  s0 = -1
   26463   //
    26464   // fallMBB:
   26465   //  eax = # XABORT_DEF
   26466   //  s1 = eax
   26467   //
   26468   // sinkMBB:
    26469   //  v = phi(s0/mainMBB, s1/fallMBB)
   26470 
   26471   MachineBasicBlock *thisMBB = MBB;
   26472   MachineFunction *MF = MBB->getParent();
   26473   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   26474   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
   26475   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   26476   MF->insert(I, mainMBB);
   26477   MF->insert(I, fallMBB);
   26478   MF->insert(I, sinkMBB);
   26479 
   26480   // Transfer the remainder of BB and its successor edges to sinkMBB.
   26481   sinkMBB->splice(sinkMBB->begin(), MBB,
   26482                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   26483   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   26484 
   26485   MachineRegisterInfo &MRI = MF->getRegInfo();
   26486   unsigned DstReg = MI.getOperand(0).getReg();
   26487   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
   26488   unsigned mainDstReg = MRI.createVirtualRegister(RC);
   26489   unsigned fallDstReg = MRI.createVirtualRegister(RC);
   26490 
   26491   // thisMBB:
   26492   //  xbegin fallMBB
   26493   //  # fallthrough to mainMBB
    26494   //  # on abort, control transfers to fallMBB
   26495   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
   26496   thisMBB->addSuccessor(mainMBB);
   26497   thisMBB->addSuccessor(fallMBB);
   26498 
   26499   // mainMBB:
   26500   //  mainDstReg := -1
   26501   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
   26502   BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
   26503   mainMBB->addSuccessor(sinkMBB);
   26504 
   26505   // fallMBB:
   26506   //  ; pseudo instruction to model hardware's definition from XABORT
   26507   //  EAX := XABORT_DEF
   26508   //  fallDstReg := EAX
   26509   BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
   26510   BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
   26511       .addReg(X86::EAX);
   26512   fallMBB->addSuccessor(sinkMBB);
   26513 
   26514   // sinkMBB:
    26515   //  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
   26516   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
   26517       .addReg(mainDstReg).addMBB(mainMBB)
   26518       .addReg(fallDstReg).addMBB(fallMBB);
   26519 
   26520   MI.eraseFromParent();
   26521   return sinkMBB;
   26522 }
   26523 
   26524 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
   26525                                      const X86Subtarget &Subtarget) {
   26526   DebugLoc dl = MI.getDebugLoc();
   26527   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   26528 
   26529   // insert input VAL into EAX
   26530   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
   26531       .addReg(MI.getOperand(0).getReg());
    26532   // insert zero into ECX
   26533   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
   26534 
    26535   // insert zero into EDX
   26536   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
   26537 
   26538   // insert WRPKRU instruction
   26539   BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
   26540 
   26541   MI.eraseFromParent(); // The pseudo is gone now.
   26542   return BB;
   26543 }
   26544 
   26545 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
   26546                                      const X86Subtarget &Subtarget) {
   26547   DebugLoc dl = MI.getDebugLoc();
   26548   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   26549 
    26550   // insert zero into ECX
   26551   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
   26552 
   26553   // insert RDPKRU instruction
   26554   BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
   26555   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
   26556       .addReg(X86::EAX);
   26557 
   26558   MI.eraseFromParent(); // The pseudo is gone now.
   26559   return BB;
   26560 }
   26561 
   26562 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
   26563                                       const X86Subtarget &Subtarget,
   26564                                       unsigned Opc) {
   26565   DebugLoc dl = MI.getDebugLoc();
   26566   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   26567   // Address into RAX/EAX, other two args into ECX, EDX.
   26568   unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
   26569   unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
   26570   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
   26571   for (int i = 0; i < X86::AddrNumOperands; ++i)
   26572     MIB.add(MI.getOperand(i));
   26573 
   26574   unsigned ValOps = X86::AddrNumOperands;
   26575   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
   26576       .addReg(MI.getOperand(ValOps).getReg());
   26577   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
   26578       .addReg(MI.getOperand(ValOps + 1).getReg());
   26579 
   26580   // The instruction doesn't actually take any operands though.
   26581   BuildMI(*BB, MI, dl, TII->get(Opc));
   26582 
   26583   MI.eraseFromParent(); // The pseudo is gone now.
   26584   return BB;
   26585 }
   26586 
   26587 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
   26588                                       const X86Subtarget &Subtarget) {
   26589   DebugLoc dl = MI->getDebugLoc();
   26590   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   26591   // Address into RAX/EAX
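            // CLZERO zeroes the entire cache line containing the byte addressed by
            // rAX.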
   26592   unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
   26593   unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
   26594   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
   26595   for (int i = 0; i < X86::AddrNumOperands; ++i)
   26596     MIB.add(MI->getOperand(i));
   26597 
   26598   // The instruction doesn't actually take any operands though.
   26599   BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
   26600 
   26601   MI->eraseFromParent(); // The pseudo is gone now.
   26602   return BB;
   26603 }
   26604 
   26605 
   26606 
   26607 MachineBasicBlock *
   26608 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
   26609                                                  MachineBasicBlock *MBB) const {
   26610   // Emit va_arg instruction on X86-64.
   26611 
   26612   // Operands to this pseudo-instruction:
   26613   // 0  ) Output        : destination address (reg)
   26614   // 1-5) Input         : va_list address (addr, i64mem)
   26615   // 6  ) ArgSize       : Size (in bytes) of vararg type
   26616   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
   26617   // 8  ) Align         : Alignment of type
   26618   // 9  ) EFLAGS (implicit-def)
   26619 
   26620   assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
   26621   static_assert(X86::AddrNumOperands == 5,
   26622                 "VAARG_64 assumes 5 address operands");
   26623 
   26624   unsigned DestReg = MI.getOperand(0).getReg();
   26625   MachineOperand &Base = MI.getOperand(1);
   26626   MachineOperand &Scale = MI.getOperand(2);
   26627   MachineOperand &Index = MI.getOperand(3);
   26628   MachineOperand &Disp = MI.getOperand(4);
   26629   MachineOperand &Segment = MI.getOperand(5);
   26630   unsigned ArgSize = MI.getOperand(6).getImm();
   26631   unsigned ArgMode = MI.getOperand(7).getImm();
   26632   unsigned Align = MI.getOperand(8).getImm();
   26633 
   26634   // Memory Reference
   26635   assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
   26636   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
   26637   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
   26638 
   26639   // Machine Information
   26640   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   26641   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   26642   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
   26643   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
   26644   DebugLoc DL = MI.getDebugLoc();
   26645 
   26646   // struct va_list {
   26647   //   i32   gp_offset
   26648   //   i32   fp_offset
   26649   //   i64   overflow_area (address)
   26650   //   i64   reg_save_area (address)
   26651   // }
   26652   // sizeof(va_list) = 24
   26653   // alignment(va_list) = 8
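            // Field byte offsets within va_list: gp_offset = 0, fp_offset = 4,
            // overflow_area = 8, reg_save_area = 16; these are the displacements
            // used below.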
   26654 
   26655   unsigned TotalNumIntRegs = 6;
   26656   unsigned TotalNumXMMRegs = 8;
   26657   bool UseGPOffset = (ArgMode == 1);
   26658   bool UseFPOffset = (ArgMode == 2);
   26659   unsigned MaxOffset = TotalNumIntRegs * 8 +
   26660                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
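            // In the SysV x86-64 ABI the reg_save_area is laid out as 6 x 8 bytes of
            // GP registers followed by 8 x 16 bytes of XMM registers, so MaxOffset is
            // 48 when using gp_offset and 176 when using fp_offset.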
   26661 
    26662   // Align ArgSize to a multiple of 8.
   26663   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
   26664   bool NeedsAlign = (Align > 8);
   26665 
   26666   MachineBasicBlock *thisMBB = MBB;
   26667   MachineBasicBlock *overflowMBB;
   26668   MachineBasicBlock *offsetMBB;
   26669   MachineBasicBlock *endMBB;
   26670 
   26671   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
   26672   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
   26673   unsigned OffsetReg = 0;
   26674 
   26675   if (!UseGPOffset && !UseFPOffset) {
   26676     // If we only pull from the overflow region, we don't create a branch.
   26677     // We don't need to alter control flow.
   26678     OffsetDestReg = 0; // unused
   26679     OverflowDestReg = DestReg;
   26680 
   26681     offsetMBB = nullptr;
   26682     overflowMBB = thisMBB;
   26683     endMBB = thisMBB;
   26684   } else {
   26685     // First emit code to check if gp_offset (or fp_offset) is below the bound.
   26686     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
   26687     // If not, pull from overflow_area. (branch to overflowMBB)
   26688     //
   26689     //       thisMBB
   26690     //         |     .
   26691     //         |        .
   26692     //     offsetMBB   overflowMBB
   26693     //         |        .
   26694     //         |     .
   26695     //        endMBB
   26696 
   26697     // Registers for the PHI in endMBB
   26698     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
   26699     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
   26700 
   26701     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   26702     MachineFunction *MF = MBB->getParent();
   26703     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   26704     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   26705     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   26706 
   26707     MachineFunction::iterator MBBIter = ++MBB->getIterator();
   26708 
   26709     // Insert the new basic blocks
   26710     MF->insert(MBBIter, offsetMBB);
   26711     MF->insert(MBBIter, overflowMBB);
   26712     MF->insert(MBBIter, endMBB);
   26713 
   26714     // Transfer the remainder of MBB and its successor edges to endMBB.
   26715     endMBB->splice(endMBB->begin(), thisMBB,
   26716                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
   26717     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
   26718 
   26719     // Make offsetMBB and overflowMBB successors of thisMBB
   26720     thisMBB->addSuccessor(offsetMBB);
   26721     thisMBB->addSuccessor(overflowMBB);
   26722 
   26723     // endMBB is a successor of both offsetMBB and overflowMBB
   26724     offsetMBB->addSuccessor(endMBB);
   26725     overflowMBB->addSuccessor(endMBB);
   26726 
   26727     // Load the offset value into a register
   26728     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   26729     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
   26730         .add(Base)
   26731         .add(Scale)
   26732         .add(Index)
   26733         .addDisp(Disp, UseFPOffset ? 4 : 0)
   26734         .add(Segment)
   26735         .setMemRefs(MMOBegin, MMOEnd);
   26736 
   26737     // Check if there is enough room left to pull this argument.
   26738     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
   26739       .addReg(OffsetReg)
   26740       .addImm(MaxOffset + 8 - ArgSizeA8);
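              // Since offsets advance in multiples of 8, "OffsetReg >= MaxOffset + 8
              // - ArgSizeA8" is equivalent to "OffsetReg + ArgSizeA8 > MaxOffset",
              // i.e. the argument no longer fits in the register save area.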
   26741 
   26742     // Branch to "overflowMBB" if offset >= max
   26743     // Fall through to "offsetMBB" otherwise
   26744     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
   26745       .addMBB(overflowMBB);
   26746   }
   26747 
   26748   // In offsetMBB, emit code to use the reg_save_area.
   26749   if (offsetMBB) {
   26750     assert(OffsetReg != 0);
   26751 
   26752     // Read the reg_save_area address.
   26753     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
   26754     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
   26755         .add(Base)
   26756         .add(Scale)
   26757         .add(Index)
   26758         .addDisp(Disp, 16)
   26759         .add(Segment)
   26760         .setMemRefs(MMOBegin, MMOEnd);
   26761 
   26762     // Zero-extend the offset
   26763     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    26764     BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
    26765         .addImm(0)
    26766         .addReg(OffsetReg)
    26767         .addImm(X86::sub_32bit);
   26768 
   26769     // Add the offset to the reg_save_area to get the final address.
   26770     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
   26771       .addReg(OffsetReg64)
   26772       .addReg(RegSaveReg);
   26773 
   26774     // Compute the offset for the next argument
   26775     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
   26776     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
   26777       .addReg(OffsetReg)
   26778       .addImm(UseFPOffset ? 16 : 8);
   26779 
   26780     // Store it back into the va_list.
   26781     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
   26782         .add(Base)
   26783         .add(Scale)
   26784         .add(Index)
   26785         .addDisp(Disp, UseFPOffset ? 4 : 0)
   26786         .add(Segment)
   26787         .addReg(NextOffsetReg)
   26788         .setMemRefs(MMOBegin, MMOEnd);
   26789 
   26790     // Jump to endMBB
   26791     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
   26792       .addMBB(endMBB);
   26793   }
   26794 
   26795   //
   26796   // Emit code to use overflow area
   26797   //
   26798 
   26799   // Load the overflow_area address into a register.
   26800   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
   26801   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
   26802       .add(Base)
   26803       .add(Scale)
   26804       .add(Index)
   26805       .addDisp(Disp, 8)
   26806       .add(Segment)
   26807       .setMemRefs(MMOBegin, MMOEnd);
   26808 
   26809   // If we need to align it, do so. Otherwise, just copy the address
   26810   // to OverflowDestReg.
   26811   if (NeedsAlign) {
   26812     // Align the overflow address
   26813     assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
   26814     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
   26815 
   26816     // aligned_addr = (addr + (align-1)) & ~(align-1)
   26817     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
   26818       .addReg(OverflowAddrReg)
   26819       .addImm(Align-1);
   26820 
   26821     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
   26822       .addReg(TmpReg)
   26823       .addImm(~(uint64_t)(Align-1));
   26824   } else {
   26825     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
   26826       .addReg(OverflowAddrReg);
   26827   }
   26828 
   26829   // Compute the next overflow address after this argument.
   26830   // (the overflow address should be kept 8-byte aligned)
   26831   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
   26832   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
   26833     .addReg(OverflowDestReg)
   26834     .addImm(ArgSizeA8);
   26835 
   26836   // Store the new overflow address.
   26837   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
   26838       .add(Base)
   26839       .add(Scale)
   26840       .add(Index)
   26841       .addDisp(Disp, 8)
   26842       .add(Segment)
   26843       .addReg(NextAddrReg)
   26844       .setMemRefs(MMOBegin, MMOEnd);
   26845 
   26846   // If we branched, emit the PHI to the front of endMBB.
   26847   if (offsetMBB) {
   26848     BuildMI(*endMBB, endMBB->begin(), DL,
   26849             TII->get(X86::PHI), DestReg)
   26850       .addReg(OffsetDestReg).addMBB(offsetMBB)
   26851       .addReg(OverflowDestReg).addMBB(overflowMBB);
   26852   }
   26853 
   26854   // Erase the pseudo instruction
   26855   MI.eraseFromParent();
   26856 
   26857   return endMBB;
   26858 }
   26859 
   26860 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
   26861     MachineInstr &MI, MachineBasicBlock *MBB) const {
   26862   // Emit code to save XMM registers to the stack. The ABI says that the
   26863   // number of registers to save is given in %al, so it's theoretically
    26864   // possible to do an indirect jump trick to avoid saving all of them;
    26865   // however, this code takes a simpler approach and just executes all
    26866   // of the stores if %al is non-zero. It's less code, it's probably
    26867   // easier on the hardware branch predictor, and stores aren't all that
    26868   // expensive anyway.
   26869 
   26870   // Create the new basic blocks. One block contains all the XMM stores,
   26871   // and one block is the final destination regardless of whether any
   26872   // stores were performed.
   26873   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   26874   MachineFunction *F = MBB->getParent();
   26875   MachineFunction::iterator MBBIter = ++MBB->getIterator();
   26876   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
   26877   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
   26878   F->insert(MBBIter, XMMSaveMBB);
   26879   F->insert(MBBIter, EndMBB);
   26880 
   26881   // Transfer the remainder of MBB and its successor edges to EndMBB.
   26882   EndMBB->splice(EndMBB->begin(), MBB,
   26883                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   26884   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
   26885 
   26886   // The original block will now fall through to the XMM save block.
   26887   MBB->addSuccessor(XMMSaveMBB);
   26888   // The XMMSaveMBB will fall through to the end block.
   26889   XMMSaveMBB->addSuccessor(EndMBB);
   26890 
   26891   // Now add the instructions.
   26892   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   26893   DebugLoc DL = MI.getDebugLoc();
   26894 
   26895   unsigned CountReg = MI.getOperand(0).getReg();
   26896   int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
   26897   int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
   26898 
   26899   if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
   26900     // If %al is 0, branch around the XMM save block.
   26901     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
   26902     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
   26903     MBB->addSuccessor(EndMBB);
   26904   }
   26905 
   26906   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
   26907   // that was just emitted, but clearly shouldn't be "saved".
   26908   assert((MI.getNumOperands() <= 3 ||
   26909           !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
   26910           MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
   26911          "Expected last argument to be EFLAGS");
   26912   unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
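            // The XMM register save area is 16-byte aligned (see the memory operand
            // below), so the aligned MOVAPS/VMOVAPS forms are safe here.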
   26913   // In the XMM save block, save all the XMM argument registers.
   26914   for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
   26915     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
   26916     MachineMemOperand *MMO = F->getMachineMemOperand(
   26917         MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
   26918         MachineMemOperand::MOStore,
   26919         /*Size=*/16, /*Align=*/16);
   26920     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
   26921         .addFrameIndex(RegSaveFrameIndex)
   26922         .addImm(/*Scale=*/1)
   26923         .addReg(/*IndexReg=*/0)
   26924         .addImm(/*Disp=*/Offset)
   26925         .addReg(/*Segment=*/0)
   26926         .addReg(MI.getOperand(i).getReg())
   26927         .addMemOperand(MMO);
   26928   }
   26929 
   26930   MI.eraseFromParent(); // The pseudo instruction is gone now.
   26931 
   26932   return EndMBB;
   26933 }
   26934 
   26935 // The EFLAGS operand of SelectItr might be missing a kill marker
   26936 // because there were multiple uses of EFLAGS, and ISel didn't know
   26937 // which to mark. Figure out whether SelectItr should have had a
   26938 // kill marker, and set it if it should. Returns the correct kill
   26939 // marker value.
   26940 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
   26941                                      MachineBasicBlock* BB,
   26942                                      const TargetRegisterInfo* TRI) {
   26943   // Scan forward through BB for a use/def of EFLAGS.
   26944   MachineBasicBlock::iterator miI(std::next(SelectItr));
   26945   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
   26946     const MachineInstr& mi = *miI;
   26947     if (mi.readsRegister(X86::EFLAGS))
   26948       return false;
   26949     if (mi.definesRegister(X86::EFLAGS))
   26950       break; // Should have kill-flag - update below.
   26951   }
   26952 
   26953   // If we hit the end of the block, check whether EFLAGS is live into a
   26954   // successor.
   26955   if (miI == BB->end()) {
   26956     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
   26957                                           sEnd = BB->succ_end();
   26958          sItr != sEnd; ++sItr) {
   26959       MachineBasicBlock* succ = *sItr;
   26960       if (succ->isLiveIn(X86::EFLAGS))
   26961         return false;
   26962     }
   26963   }
   26964 
   26965   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
   26966   // out. SelectMI should have a kill flag on EFLAGS.
   26967   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
   26968   return true;
   26969 }
   26970 
   26971 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
   26972 // together with other CMOV pseudo-opcodes into a single basic-block with
   26973 // conditional jump around it.
   26974 static bool isCMOVPseudo(MachineInstr &MI) {
   26975   switch (MI.getOpcode()) {
   26976   case X86::CMOV_FR32:
   26977   case X86::CMOV_FR64:
   26978   case X86::CMOV_GR8:
   26979   case X86::CMOV_GR16:
   26980   case X86::CMOV_GR32:
   26981   case X86::CMOV_RFP32:
   26982   case X86::CMOV_RFP64:
   26983   case X86::CMOV_RFP80:
   26984   case X86::CMOV_V2F64:
   26985   case X86::CMOV_V2I64:
   26986   case X86::CMOV_V4F32:
   26987   case X86::CMOV_V4F64:
   26988   case X86::CMOV_V4I64:
   26989   case X86::CMOV_V16F32:
   26990   case X86::CMOV_V8F32:
   26991   case X86::CMOV_V8F64:
   26992   case X86::CMOV_V8I64:
   26993   case X86::CMOV_V8I1:
   26994   case X86::CMOV_V16I1:
   26995   case X86::CMOV_V32I1:
   26996   case X86::CMOV_V64I1:
   26997     return true;
   26998 
   26999   default:
   27000     return false;
   27001   }
   27002 }
   27003 
   27004 // Helper function, which inserts PHI functions into SinkMBB:
   27005 //   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
   27006 // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
    27007 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
    27008 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
   27009 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
   27010     MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
   27011     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
   27012     MachineBasicBlock *SinkMBB) {
   27013   MachineFunction *MF = TrueMBB->getParent();
   27014   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   27015   DebugLoc DL = MIItBegin->getDebugLoc();
   27016 
   27017   X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
   27018   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
   27019 
   27020   MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
   27021 
   27022   // As we are creating the PHIs, we have to be careful if there is more than
   27023   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
   27024   // PHIs have to reference the individual true/false inputs from earlier PHIs.
   27025   // That also means that PHI construction must work forward from earlier to
    27026   // later, and that the code must maintain a mapping from each earlier PHI's
    27027   // destination register to the registers that went into that PHI.
   27028   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
   27029   MachineInstrBuilder MIB;
   27030 
   27031   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
   27032     unsigned DestReg = MIIt->getOperand(0).getReg();
   27033     unsigned Op1Reg = MIIt->getOperand(1).getReg();
   27034     unsigned Op2Reg = MIIt->getOperand(2).getReg();
   27035 
   27036     // If this CMOV we are generating is the opposite condition from
   27037     // the jump we generated, then we have to swap the operands for the
   27038     // PHI that is going to be generated.
   27039     if (MIIt->getOperand(3).getImm() == OppCC)
   27040       std::swap(Op1Reg, Op2Reg);
   27041 
   27042     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
   27043       Op1Reg = RegRewriteTable[Op1Reg].first;
   27044 
   27045     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
   27046       Op2Reg = RegRewriteTable[Op2Reg].second;
   27047 
   27048     MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
   27049               .addReg(Op1Reg)
   27050               .addMBB(FalseMBB)
   27051               .addReg(Op2Reg)
   27052               .addMBB(TrueMBB);
   27053 
   27054     // Add this PHI to the rewrite table.
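              // The pair is (incoming value from FalseMBB, incoming value from
              // TrueMBB); the lookups above substitute these when a later CMOV in the
              // group reads DestReg.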
   27055     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
   27056   }
   27057 
   27058   return MIB;
   27059 }
   27060 
    27061 // Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
   27062 MachineBasicBlock *
   27063 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
   27064                                              MachineInstr &SecondCascadedCMOV,
   27065                                              MachineBasicBlock *ThisMBB) const {
   27066   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   27067   DebugLoc DL = FirstCMOV.getDebugLoc();
   27068 
   27069   // We lower cascaded CMOVs such as
   27070   //
   27071   //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
   27072   //
   27073   // to two successive branches.
   27074   //
   27075   // Without this, we would add a PHI between the two jumps, which ends up
   27076   // creating a few copies all around. For instance, for
   27077   //
   27078   //    (sitofp (zext (fcmp une)))
   27079   //
   27080   // we would generate:
   27081   //
   27082   //         ucomiss %xmm1, %xmm0
   27083   //         movss  <1.0f>, %xmm0
   27084   //         movaps  %xmm0, %xmm1
   27085   //         jne     .LBB5_2
   27086   //         xorps   %xmm1, %xmm1
   27087   // .LBB5_2:
   27088   //         jp      .LBB5_4
   27089   //         movaps  %xmm1, %xmm0
   27090   // .LBB5_4:
   27091   //         retq
   27092   //
   27093   // because this custom-inserter would have generated:
   27094   //
   27095   //   A
   27096   //   | \
   27097   //   |  B
   27098   //   | /
   27099   //   C
   27100   //   | \
   27101   //   |  D
   27102   //   | /
   27103   //   E
   27104   //
   27105   // A: X = ...; Y = ...
   27106   // B: empty
   27107   // C: Z = PHI [X, A], [Y, B]
   27108   // D: empty
   27109   // E: PHI [X, C], [Z, D]
   27110   //
   27111   // If we lower both CMOVs in a single step, we can instead generate:
   27112   //
   27113   //   A
   27114   //   | \
   27115   //   |  C
   27116   //   | /|
   27117   //   |/ |
   27118   //   |  |
   27119   //   |  D
   27120   //   | /
   27121   //   E
   27122   //
   27123   // A: X = ...; Y = ...
   27124   // D: empty
   27125   // E: PHI [X, A], [X, C], [Y, D]
   27126   //
   27127   // Which, in our sitofp/fcmp example, gives us something like:
   27128   //
   27129   //         ucomiss %xmm1, %xmm0
   27130   //         movss  <1.0f>, %xmm0
   27131   //         jne     .LBB5_4
   27132   //         jp      .LBB5_4
   27133   //         xorps   %xmm0, %xmm0
   27134   // .LBB5_4:
   27135   //         retq
   27136   //
   27137 
   27138   // We lower cascaded CMOV into two successive branches to the same block.
   27139   // EFLAGS is used by both, so mark it as live in the second.
   27140   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
   27141   MachineFunction *F = ThisMBB->getParent();
   27142   MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
   27143   MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
   27144   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
   27145 
   27146   MachineFunction::iterator It = ++ThisMBB->getIterator();
   27147   F->insert(It, FirstInsertedMBB);
   27148   F->insert(It, SecondInsertedMBB);
   27149   F->insert(It, SinkMBB);
   27150 
   27151   // For a cascaded CMOV, we lower it to two successive branches to
   27152   // the same block (SinkMBB).  EFLAGS is used by both, so mark it as live in
   27153   // the FirstInsertedMBB.
   27154   FirstInsertedMBB->addLiveIn(X86::EFLAGS);
   27155 
   27156   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   27157   // live into the sink and copy blocks.
   27158   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   27159   if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
   27160       !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
   27161     SecondInsertedMBB->addLiveIn(X86::EFLAGS);
   27162     SinkMBB->addLiveIn(X86::EFLAGS);
   27163   }
   27164 
   27165   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
   27166   SinkMBB->splice(SinkMBB->begin(), ThisMBB,
   27167                   std::next(MachineBasicBlock::iterator(FirstCMOV)),
   27168                   ThisMBB->end());
   27169   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
   27170 
   27171   // Fallthrough block for ThisMBB.
   27172   ThisMBB->addSuccessor(FirstInsertedMBB);
   27173   // The true block target of the first branch is always SinkMBB.
   27174   ThisMBB->addSuccessor(SinkMBB);
   27175   // Fallthrough block for FirstInsertedMBB.
   27176   FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
   27177   // The true block for the branch of FirstInsertedMBB.
   27178   FirstInsertedMBB->addSuccessor(SinkMBB);
   27179   // This is fallthrough.
   27180   SecondInsertedMBB->addSuccessor(SinkMBB);
   27181 
   27182   // Create the conditional branch instructions.
   27183   X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
   27184   unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
   27185   BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
   27186 
   27187   X86::CondCode SecondCC =
   27188       X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
   27189   unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
   27190   BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
   27191 
   27192   //  SinkMBB:
   27193   //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
   27194   unsigned DestReg = FirstCMOV.getOperand(0).getReg();
   27195   unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
   27196   unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
   27197   MachineInstrBuilder MIB =
   27198       BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
   27199           .addReg(Op1Reg)
   27200           .addMBB(SecondInsertedMBB)
   27201           .addReg(Op2Reg)
   27202           .addMBB(ThisMBB);
   27203 
    27204   // SecondInsertedMBB provides the same incoming value as
    27205   // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
   27206   MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
   27207   // Copy the PHI result to the register defined by the second CMOV.
   27208   BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
   27209           TII->get(TargetOpcode::COPY),
   27210           SecondCascadedCMOV.getOperand(0).getReg())
   27211       .addReg(FirstCMOV.getOperand(0).getReg());
   27212 
   27213   // Now remove the CMOVs.
   27214   FirstCMOV.eraseFromParent();
   27215   SecondCascadedCMOV.eraseFromParent();
   27216 
   27217   return SinkMBB;
   27218 }
   27219 
   27220 MachineBasicBlock *
   27221 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
   27222                                      MachineBasicBlock *ThisMBB) const {
   27223   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   27224   DebugLoc DL = MI.getDebugLoc();
   27225 
   27226   // To "insert" a SELECT_CC instruction, we actually have to insert the
   27227   // diamond control-flow pattern.  The incoming instruction knows the
   27228   // destination vreg to set, the condition code register to branch on, the
   27229   // true/false values to select between and a branch opcode to use.
   27230 
   27231   //  ThisMBB:
   27232   //  ...
   27233   //   TrueVal = ...
   27234   //   cmpTY ccX, r1, r2
   27235   //   bCC copy1MBB
   27236   //   fallthrough --> FalseMBB
   27237 
   27238   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
   27239   // as described above, by inserting a BB, and then making a PHI at the join
   27240   // point to select the true and false operands of the CMOV in the PHI.
   27241   //
   27242   // The code also handles two different cases of multiple CMOV opcodes
   27243   // in a row.
   27244   //
   27245   // Case 1:
   27246   // In this case, there are multiple CMOVs in a row, all which are based on
   27247   // the same condition setting (or the exact opposite condition setting).
   27248   // In this case we can lower all the CMOVs using a single inserted BB, and
   27249   // then make a number of PHIs at the join point to model the CMOVs. The only
   27250   // trickiness here, is that in a case like:
   27251   //
   27252   // t2 = CMOV cond1 t1, f1
   27253   // t3 = CMOV cond1 t2, f2
   27254   //
   27255   // when rewriting this into PHIs, we have to perform some renaming on the
   27256   // temps since you cannot have a PHI operand refer to a PHI result earlier
   27257   // in the same block.  The "simple" but wrong lowering would be:
   27258   //
   27259   // t2 = PHI t1(BB1), f1(BB2)
   27260   // t3 = PHI t2(BB1), f2(BB2)
   27261   //
   27262   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
   27263   // renaming is to note that on the path through BB1, t2 is really just a
   27264   // copy of t1, and do that renaming, properly generating:
   27265   //
   27266   // t2 = PHI t1(BB1), f1(BB2)
   27267   // t3 = PHI t1(BB1), f2(BB2)
   27268   //
   27269   // Case 2:
   27270   // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
   27271   // function - EmitLoweredCascadedSelect.
   27272 
   27273   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
   27274   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
   27275   MachineInstr *LastCMOV = &MI;
   27276   MachineBasicBlock::iterator NextMIIt =
   27277       std::next(MachineBasicBlock::iterator(MI));
   27278 
   27279   // Check for case 1, where there are multiple CMOVs with the same condition
   27280   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
   27281   // number of jumps the most.
   27282 
   27283   if (isCMOVPseudo(MI)) {
   27284     // See if we have a string of CMOVS with the same condition.
   27285     while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
   27286            (NextMIIt->getOperand(3).getImm() == CC ||
   27287             NextMIIt->getOperand(3).getImm() == OppCC)) {
   27288       LastCMOV = &*NextMIIt;
   27289       ++NextMIIt;
   27290     }
   27291   }
   27292 
    27293   // Check for case 2, but only if we didn't already find case 1, as
    27294   // indicated by LastCMOV == MI.
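            // That is, the next CMOV uses the result of this CMOV as its operand 1
            // (and kills it) and shares the same operand 2, matching the
            // (CMOV (CMOV F, T, cc1), T, cc2) shape described above.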
   27295   if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
   27296       NextMIIt->getOpcode() == MI.getOpcode() &&
   27297       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
   27298       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
   27299       NextMIIt->getOperand(1).isKill()) {
   27300     return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
   27301   }
   27302 
   27303   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
   27304   MachineFunction *F = ThisMBB->getParent();
   27305   MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
   27306   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
   27307 
   27308   MachineFunction::iterator It = ++ThisMBB->getIterator();
   27309   F->insert(It, FalseMBB);
   27310   F->insert(It, SinkMBB);
   27311 
   27312   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   27313   // live into the sink and copy blocks.
   27314   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   27315   if (!LastCMOV->killsRegister(X86::EFLAGS) &&
   27316       !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
   27317     FalseMBB->addLiveIn(X86::EFLAGS);
   27318     SinkMBB->addLiveIn(X86::EFLAGS);
   27319   }
   27320 
   27321   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
   27322   SinkMBB->splice(SinkMBB->begin(), ThisMBB,
   27323                   std::next(MachineBasicBlock::iterator(LastCMOV)),
   27324                   ThisMBB->end());
   27325   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
   27326 
   27327   // Fallthrough block for ThisMBB.
   27328   ThisMBB->addSuccessor(FalseMBB);
    27329   // The true block target of the first (or only) branch is always SinkMBB.
   27330   ThisMBB->addSuccessor(SinkMBB);
   27331   // Fallthrough block for FalseMBB.
   27332   FalseMBB->addSuccessor(SinkMBB);
   27333 
   27334   // Create the conditional branch instruction.
   27335   unsigned Opc = X86::GetCondBranchFromCond(CC);
   27336   BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
   27337 
   27338   //  SinkMBB:
   27339   //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
   27340   //  ...
   27341   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
   27342   MachineBasicBlock::iterator MIItEnd =
   27343       std::next(MachineBasicBlock::iterator(LastCMOV));
   27344   createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
   27345 
   27346   // Now remove the CMOV(s).
   27347   ThisMBB->erase(MIItBegin, MIItEnd);
   27348 
   27349   return SinkMBB;
   27350 }
   27351 
   27352 MachineBasicBlock *
   27353 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
   27354                                        MachineBasicBlock *BB) const {
   27355   // Combine the following atomic floating-point modification pattern:
   27356   //   a.store(reg OP a.load(acquire), release)
    27357   // Transform it into:
    27358   //   OPss (%gpr), %xmm
    27359   //   movss %xmm, (%gpr)
    27360   // or the SD equivalent for 64-bit operations.
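            // This is legal on x86 because ordinary loads already have acquire
            // semantics and ordinary stores already have release semantics, so no
            // extra fences are required for the acquire load / release store pair.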
   27361   unsigned MOp, FOp;
   27362   switch (MI.getOpcode()) {
   27363   default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
   27364   case X86::RELEASE_FADD32mr:
   27365     FOp = X86::ADDSSrm;
   27366     MOp = X86::MOVSSmr;
   27367     break;
   27368   case X86::RELEASE_FADD64mr:
   27369     FOp = X86::ADDSDrm;
   27370     MOp = X86::MOVSDmr;
   27371     break;
   27372   }
   27373   const X86InstrInfo *TII = Subtarget.getInstrInfo();
   27374   DebugLoc DL = MI.getDebugLoc();
   27375   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   27376   unsigned ValOpIdx = X86::AddrNumOperands;
   27377   unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
   27378   MachineInstrBuilder MIB =
   27379       BuildMI(*BB, MI, DL, TII->get(FOp),
   27380               MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
   27381           .addReg(VSrc);
   27382   for (int i = 0; i < X86::AddrNumOperands; ++i) {
   27383     MachineOperand &Operand = MI.getOperand(i);
   27384     // Clear any kill flags on register operands as we'll create a second
   27385     // instruction using the same address operands.
   27386     if (Operand.isReg())
   27387       Operand.setIsKill(false);
   27388     MIB.add(Operand);
   27389   }
   27390   MachineInstr *FOpMI = MIB;
   27391   MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
   27392   for (int i = 0; i < X86::AddrNumOperands; ++i)
   27393     MIB.add(MI.getOperand(i));
   27394   MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
   27395   MI.eraseFromParent(); // The pseudo instruction is gone now.
   27396   return BB;
   27397 }
   27398 
   27399 MachineBasicBlock *
   27400 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
   27401                                         MachineBasicBlock *BB) const {
   27402   MachineFunction *MF = BB->getParent();
   27403   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   27404   DebugLoc DL = MI.getDebugLoc();
   27405   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   27406 
   27407   assert(MF->shouldSplitStack());
   27408 
   27409   const bool Is64Bit = Subtarget.is64Bit();
   27410   const bool IsLP64 = Subtarget.isTarget64BitLP64();
   27411 
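            // The stack limit for segmented stacks is kept at a fixed, ABI-specific
            // offset in the thread control block; the values below are assumed to
            // match the slot that the split-stack prologue and __morestack runtime use.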
   27412   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
   27413   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
   27414 
   27415   // BB:
   27416   //  ... [Till the alloca]
    27417   // If the stacklet is not large enough, jump to mallocMBB
   27418   //
   27419   // bumpMBB:
   27420   //  Allocate by subtracting from RSP
   27421   //  Jump to continueMBB
   27422   //
   27423   // mallocMBB:
   27424   //  Allocate by call to runtime
   27425   //
   27426   // continueMBB:
   27427   //  ...
   27428   //  [rest of original BB]
   27429   //
   27430 
   27431   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   27432   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   27433   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   27434 
   27435   MachineRegisterInfo &MRI = MF->getRegInfo();
   27436   const TargetRegisterClass *AddrRegClass =
   27437       getRegClassFor(getPointerTy(MF->getDataLayout()));
   27438 
   27439   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   27440            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
   27441            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
   27442            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
   27443            sizeVReg = MI.getOperand(1).getReg(),
   27444            physSPReg =
   27445                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
   27446 
   27447   MachineFunction::iterator MBBIter = ++BB->getIterator();
   27448 
   27449   MF->insert(MBBIter, bumpMBB);
   27450   MF->insert(MBBIter, mallocMBB);
   27451   MF->insert(MBBIter, continueMBB);
   27452 
   27453   continueMBB->splice(continueMBB->begin(), BB,
   27454                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
   27455   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
   27456 
    27457   // Add code to the main basic block to check if the stack limit has been hit;
    27458   // if so, jump to mallocMBB, otherwise fall through to bumpMBB.
   27459   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
   27460   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
   27461     .addReg(tmpSPVReg).addReg(sizeVReg);
   27462   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
   27463     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
   27464     .addReg(SPLimitVReg);
   27465   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
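            // In pseudo-code, the check just emitted is roughly:
            //   tmpSP   = SP
            //   SPLimit = tmpSP - AllocaSize
            //   if (stack_limit_from_TLS > SPLimit)  // JG, signed compare
            //     goto mallocMBB
            //   // otherwise fall through to bumpMBB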
   27466 
   27467   // bumpMBB simply decreases the stack pointer, since we know the current
   27468   // stacklet has enough space.
   27469   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
   27470     .addReg(SPLimitVReg);
   27471   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
   27472     .addReg(SPLimitVReg);
   27473   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
   27474 
    27475   // mallocMBB calls a routine in libgcc to allocate more space from the heap.
   27476   const uint32_t *RegMask =
   27477       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
   27478   if (IsLP64) {
   27479     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
   27480       .addReg(sizeVReg);
   27481     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   27482       .addExternalSymbol("__morestack_allocate_stack_space")
   27483       .addRegMask(RegMask)
   27484       .addReg(X86::RDI, RegState::Implicit)
   27485       .addReg(X86::RAX, RegState::ImplicitDefine);
   27486   } else if (Is64Bit) {
   27487     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
   27488       .addReg(sizeVReg);
   27489     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
   27490       .addExternalSymbol("__morestack_allocate_stack_space")
   27491       .addRegMask(RegMask)
   27492       .addReg(X86::EDI, RegState::Implicit)
   27493       .addReg(X86::EAX, RegState::ImplicitDefine);
   27494   } else {
   27495     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
   27496       .addImm(12);
   27497     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
   27498     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
   27499       .addExternalSymbol("__morestack_allocate_stack_space")
   27500       .addRegMask(RegMask)
   27501       .addReg(X86::EAX, RegState::ImplicitDefine);
   27502   }
   27503 
   27504   if (!Is64Bit)
   27505     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
   27506       .addImm(16);
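            // In the 32-bit path the earlier sub $12 plus the 4-byte push of the size
            // argument make 16 bytes, presumably to keep the call site 16-byte
            // aligned; the add $16 above releases both.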
   27507 
   27508   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
   27509     .addReg(IsLP64 ? X86::RAX : X86::EAX);
   27510   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
   27511 
   27512   // Set up the CFG correctly.
   27513   BB->addSuccessor(bumpMBB);
   27514   BB->addSuccessor(mallocMBB);
   27515   mallocMBB->addSuccessor(continueMBB);
   27516   bumpMBB->addSuccessor(continueMBB);
   27517 
   27518   // Take care of the PHI nodes.
   27519   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
   27520           MI.getOperand(0).getReg())
   27521       .addReg(mallocPtrVReg)
   27522       .addMBB(mallocMBB)
   27523       .addReg(bumpSPPtrVReg)
   27524       .addMBB(bumpMBB);
   27525 
   27526   // Delete the original pseudo instruction.
   27527   MI.eraseFromParent();
   27528 
   27529   // And we're done.
   27530   return continueMBB;
   27531 }
   27532 
   27533 MachineBasicBlock *
   27534 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
   27535                                        MachineBasicBlock *BB) const {
   27536   MachineFunction *MF = BB->getParent();
   27537   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   27538   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
   27539   DebugLoc DL = MI.getDebugLoc();
   27540 
   27541   assert(!isAsynchronousEHPersonality(
   27542              classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
   27543          "SEH does not use catchret!");
   27544 
   27545   // Only 32-bit EH needs to worry about manually restoring stack pointers.
   27546   if (!Subtarget.is32Bit())
   27547     return BB;
   27548 
   27549   // C++ EH creates a new target block to hold the restore code, and wires up
   27550   // the new block to the return destination with a normal JMP_4.
   27551   MachineBasicBlock *RestoreMBB =
   27552       MF->CreateMachineBasicBlock(BB->getBasicBlock());
   27553   assert(BB->succ_size() == 1);
   27554   MF->insert(std::next(BB->getIterator()), RestoreMBB);
   27555   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
   27556   BB->addSuccessor(RestoreMBB);
   27557   MI.getOperand(0).setMBB(RestoreMBB);
   27558 
   27559   auto RestoreMBBI = RestoreMBB->begin();
   27560   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
   27561   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
   27562   return BB;
   27563 }
   27564 
   27565 MachineBasicBlock *
   27566 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
   27567                                        MachineBasicBlock *BB) const {
   27568   MachineFunction *MF = BB->getParent();
   27569   const Constant *PerFn = MF->getFunction().getPersonalityFn();
   27570   bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
   27571   // Only 32-bit SEH requires special handling for catchpad.
   27572   if (IsSEH && Subtarget.is32Bit()) {
   27573     const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   27574     DebugLoc DL = MI.getDebugLoc();
   27575     BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
   27576   }
   27577   MI.eraseFromParent();
   27578   return BB;
   27579 }
   27580 
   27581 MachineBasicBlock *
   27582 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
   27583                                       MachineBasicBlock *BB) const {
    27584   // Here we replace TLSADDR with the sequence:
    27585   // adjust_stackdown -> TLSADDR -> adjust_stackup.
    27586   // We need this because TLSADDR is lowered into calls
    27587   // inside MC; without the two markers, shrink-wrapping
    27588   // may push the prologue/epilogue past them.
   27589   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   27590   DebugLoc DL = MI.getDebugLoc();
   27591   MachineFunction &MF = *BB->getParent();
   27592 
   27593   // Emit CALLSEQ_START right before the instruction.
   27594   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   27595   MachineInstrBuilder CallseqStart =
   27596     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
   27597   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
   27598 
   27599   // Emit CALLSEQ_END right after the instruction.
    27600   // We don't call eraseFromParent() because we want to keep the
   27601   // original instruction around.
   27602   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
   27603   MachineInstrBuilder CallseqEnd =
   27604     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
   27605   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
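            // The block now contains CALLSEQ_START, the original TLSADDR pseudo, and
            // CALLSEQ_END, in that order; the pseudo itself is left in place.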
   27606 
   27607   return BB;
   27608 }
   27609 
   27610 MachineBasicBlock *
   27611 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
   27612                                       MachineBasicBlock *BB) const {
    27613   // This is pretty easy.  We take the value we received from our load of
    27614   // the relocation, stick it in either RDI (x86-64) or EAX (x86-32), and
    27615   // make an indirect call.  The return value will then be in the normal
    27616   // return register.
   27617   MachineFunction *F = BB->getParent();
   27618   const X86InstrInfo *TII = Subtarget.getInstrInfo();
   27619   DebugLoc DL = MI.getDebugLoc();
   27620 
   27621   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
   27622   assert(MI.getOperand(3).isGlobal() && "This should be a global");
   27623 
   27624   // Get a register mask for the lowered call.
   27625   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   27626   // proper register mask.
   27627   const uint32_t *RegMask =
   27628       Subtarget.is64Bit() ?
   27629       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
   27630       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
   27631   if (Subtarget.is64Bit()) {
   27632     MachineInstrBuilder MIB =
   27633         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
   27634             .addReg(X86::RIP)
   27635             .addImm(0)
   27636             .addReg(0)
   27637             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
   27638                               MI.getOperand(3).getTargetFlags())
   27639             .addReg(0);
   27640     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
   27641     addDirectMem(MIB, X86::RDI);
   27642     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
   27643   } else if (!isPositionIndependent()) {
   27644     MachineInstrBuilder MIB =
   27645         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
   27646             .addReg(0)
   27647             .addImm(0)
   27648             .addReg(0)
   27649             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
   27650                               MI.getOperand(3).getTargetFlags())
   27651             .addReg(0);
   27652     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   27653     addDirectMem(MIB, X86::EAX);
   27654     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   27655   } else {
   27656     MachineInstrBuilder MIB =
   27657         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
   27658             .addReg(TII->getGlobalBaseReg(F))
   27659             .addImm(0)
   27660             .addReg(0)
   27661             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
   27662                               MI.getOperand(3).getTargetFlags())
   27663             .addReg(0);
   27664     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
   27665     addDirectMem(MIB, X86::EAX);
   27666     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
   27667   }
   27668 
   27669   MI.eraseFromParent(); // The pseudo instruction is gone now.
   27670   return BB;
   27671 }
   27672 
   27673 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
   27674   switch (RPOpc) {
   27675   case X86::RETPOLINE_CALL32:
   27676     return X86::CALLpcrel32;
   27677   case X86::RETPOLINE_CALL64:
   27678     return X86::CALL64pcrel32;
   27679   case X86::RETPOLINE_TCRETURN32:
   27680     return X86::TCRETURNdi;
   27681   case X86::RETPOLINE_TCRETURN64:
   27682     return X86::TCRETURNdi64;
   27683   }
   27684   llvm_unreachable("not retpoline opcode");
   27685 }
   27686 
   27687 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
   27688                                       unsigned Reg) {
   27689   if (Subtarget.useRetpolineExternalThunk()) {
   27690     // When using an external thunk for retpolines, we pick names that match the
   27691     // names GCC happens to use as well. This helps simplify the implementation
   27692     // of the thunks for kernels where they have no easy ability to create
   27693     // aliases and are doing non-trivial configuration of the thunk's body. For
   27694     // example, the Linux kernel will do boot-time hot patching of the thunk
   27695     // bodies and cannot easily export aliases of these to loaded modules.
   27696     //
   27697     // Note that at any point in the future, we may need to change the semantics
   27698     // of how we implement retpolines and at that time will likely change the
   27699     // name of the called thunk. Essentially, there is no hard guarantee that
    27700     // LLVM will generate calls to specific thunks; we merely make a best-effort
   27701     // attempt to help out kernels and other systems where duplicating the
   27702     // thunks is costly.
   27703     switch (Reg) {
   27704     case X86::EAX:
   27705       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
   27706       return "__x86_indirect_thunk_eax";
   27707     case X86::ECX:
   27708       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
   27709       return "__x86_indirect_thunk_ecx";
   27710     case X86::EDX:
   27711       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
   27712       return "__x86_indirect_thunk_edx";
   27713     case X86::EDI:
   27714       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
   27715       return "__x86_indirect_thunk_edi";
   27716     case X86::R11:
   27717       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
   27718       return "__x86_indirect_thunk_r11";
   27719     }
   27720     llvm_unreachable("unexpected reg for retpoline");
   27721   }
   27722 
   27723   // When targeting an internal COMDAT thunk use an LLVM-specific name.
   27724   switch (Reg) {
   27725   case X86::EAX:
   27726     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
   27727     return "__llvm_retpoline_eax";
   27728   case X86::ECX:
   27729     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
   27730     return "__llvm_retpoline_ecx";
   27731   case X86::EDX:
   27732     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
   27733     return "__llvm_retpoline_edx";
   27734   case X86::EDI:
   27735     assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
   27736     return "__llvm_retpoline_edi";
   27737   case X86::R11:
   27738     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
   27739     return "__llvm_retpoline_r11";
   27740   }
   27741   llvm_unreachable("unexpected reg for retpoline");
   27742 }
   27743 
   27744 MachineBasicBlock *
   27745 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
   27746                                         MachineBasicBlock *BB) const {
   27747   // Copy the virtual register into the R11 physical register and
   27748   // call the retpoline thunk.
   27749   DebugLoc DL = MI.getDebugLoc();
   27750   const X86InstrInfo *TII = Subtarget.getInstrInfo();
   27751   unsigned CalleeVReg = MI.getOperand(0).getReg();
   27752   unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
   27753 
    27754   // Find an available scratch register to hold the callee. On 64-bit, we can
    27755   // just use R11, but we scan for uses anyway to ensure we don't generate
    27756   // incorrect code. On 32-bit, we pick whichever of EAX, ECX, or EDX is not
    27757   // already a register operand of the call to hold the callee. If none is
    27758   // available, use EDI instead. EDI is chosen because EBX is the PIC base
    27759   // register and ESI is the base pointer for realigned stack frames with VLAs.
   27760   SmallVector<unsigned, 3> AvailableRegs;
   27761   if (Subtarget.is64Bit())
   27762     AvailableRegs.push_back(X86::R11);
   27763   else
   27764     AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
   27765 
   27766   // Zero out any registers that are already used.
   27767   for (const auto &MO : MI.operands()) {
   27768     if (MO.isReg() && MO.isUse())
   27769       for (unsigned &Reg : AvailableRegs)
   27770         if (Reg == MO.getReg())
   27771           Reg = 0;
   27772   }
   27773 
   27774   // Choose the first remaining non-zero available register.
   27775   unsigned AvailableReg = 0;
   27776   for (unsigned MaybeReg : AvailableRegs) {
   27777     if (MaybeReg) {
   27778       AvailableReg = MaybeReg;
   27779       break;
   27780     }
   27781   }
   27782   if (!AvailableReg)
   27783     report_fatal_error("calling convention incompatible with retpoline, no "
   27784                        "available registers");
   27785 
   27786   const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
   27787 
   27788   BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
   27789       .addReg(CalleeVReg);
   27790   MI.getOperand(0).ChangeToES(Symbol);
   27791   MI.setDesc(TII->get(Opc));
   27792   MachineInstrBuilder(*BB->getParent(), &MI)
   27793       .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
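            // The rewritten instruction now looks roughly like (64-bit case):
            //   CALL64pcrel32 $__llvm_retpoline_r11, implicit killed $r11
            // with the callee value copied into the scratch register just above.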
   27794   return BB;
   27795 }
   27796 
    27797 /// SetJmp implies a future control-flow change upon calling the corresponding
    27798 /// LongJmp.
   27799 /// Instead of using the 'return' instruction, the long jump fixes the stack and
   27800 /// performs an indirect branch. To do so it uses the registers that were stored
   27801 /// in the jump buffer (when calling SetJmp).
   27802 /// In case the shadow stack is enabled we need to fix it as well, because some
   27803 /// return addresses will be skipped.
    27804 /// This function saves the SSP so that it can be fixed up later by
    27805 /// emitLongJmpShadowStackFix.
   27806 /// \sa emitLongJmpShadowStackFix
   27807 /// \param [in] MI The temporary Machine Instruction for the builtin.
   27808 /// \param [in] MBB The Machine Basic Block that will be modified.
   27809 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
   27810                                                  MachineBasicBlock *MBB) const {
   27811   DebugLoc DL = MI.getDebugLoc();
   27812   MachineFunction *MF = MBB->getParent();
   27813   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   27814   MachineRegisterInfo &MRI = MF->getRegInfo();
   27815   MachineInstrBuilder MIB;
   27816 
   27817   // Memory Reference.
   27818   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
   27819   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
   27820 
   27821   // Initialize a register with zero.
   27822   MVT PVT = getPointerTy(MF->getDataLayout());
   27823   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
   27824   unsigned ZReg = MRI.createVirtualRegister(PtrRC);
   27825   unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
   27826   BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
   27827       .addDef(ZReg)
   27828       .addReg(ZReg, RegState::Undef)
   27829       .addReg(ZReg, RegState::Undef);
   27830 
    27831   // Read the current SSP register value into the zeroed register.
   27832   unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
   27833   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
   27834   BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
   27835 
    27836   // Write the SSP register value to slot 3 of the input memory buffer.
   27837   unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
   27838   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
   27839   const int64_t SSPOffset = 3 * PVT.getStoreSize();
   27840   const unsigned MemOpndSlot = 1;
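            // Jump buffer layout assumed throughout this file (pointer-sized slots):
            // slot 0 holds the frame pointer, slot 1 the resume address, slot 2 the
            // stack pointer, and slot 3 (written here) the shadow stack pointer.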
   27841   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   27842     if (i == X86::AddrDisp)
   27843       MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
   27844     else
   27845       MIB.add(MI.getOperand(MemOpndSlot + i));
   27846   }
   27847   MIB.addReg(SSPCopyReg);
   27848   MIB.setMemRefs(MMOBegin, MMOEnd);
   27849 }
   27850 
   27851 MachineBasicBlock *
   27852 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
   27853                                     MachineBasicBlock *MBB) const {
   27854   DebugLoc DL = MI.getDebugLoc();
   27855   MachineFunction *MF = MBB->getParent();
   27856   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   27857   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   27858   MachineRegisterInfo &MRI = MF->getRegInfo();
   27859 
   27860   const BasicBlock *BB = MBB->getBasicBlock();
   27861   MachineFunction::iterator I = ++MBB->getIterator();
   27862 
   27863   // Memory Reference
   27864   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
   27865   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
   27866 
   27867   unsigned DstReg;
   27868   unsigned MemOpndSlot = 0;
   27869 
   27870   unsigned CurOp = 0;
   27871 
   27872   DstReg = MI.getOperand(CurOp++).getReg();
   27873   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
   27874   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
   27875   (void)TRI;
   27876   unsigned mainDstReg = MRI.createVirtualRegister(RC);
   27877   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
   27878 
   27879   MemOpndSlot = CurOp;
   27880 
   27881   MVT PVT = getPointerTy(MF->getDataLayout());
   27882   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   27883          "Invalid Pointer Size!");
   27884 
   27885   // For v = setjmp(buf), we generate
   27886   //
   27887   // thisMBB:
   27888   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
   27889   //  SjLjSetup restoreMBB
   27890   //
   27891   // mainMBB:
   27892   //  v_main = 0
   27893   //
   27894   // sinkMBB:
   27895   //  v = phi(main, restore)
   27896   //
   27897   // restoreMBB:
   27898   //  if base pointer being used, load it from frame
   27899   //  v_restore = 1
   27900 
   27901   MachineBasicBlock *thisMBB = MBB;
   27902   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
   27903   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   27904   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
   27905   MF->insert(I, mainMBB);
   27906   MF->insert(I, sinkMBB);
   27907   MF->push_back(restoreMBB);
   27908   restoreMBB->setHasAddressTaken();
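            // restoreMBB is flagged as address-taken above because its address is
            // stored into the jump buffer and it is reached only through the longjmp's
            // indirect branch.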
   27909 
   27910   MachineInstrBuilder MIB;
   27911 
   27912   // Transfer the remainder of BB and its successor edges to sinkMBB.
   27913   sinkMBB->splice(sinkMBB->begin(), MBB,
   27914                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   27915   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   27916 
   27917   // thisMBB:
   27918   unsigned PtrStoreOpc = 0;
   27919   unsigned LabelReg = 0;
   27920   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   27921   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
   27922                      !isPositionIndependent();
   27923 
   27924   // Prepare IP either in reg or imm.
   27925   if (!UseImmLabel) {
   27926     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
   27927     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
   27928     LabelReg = MRI.createVirtualRegister(PtrRC);
   27929     if (Subtarget.is64Bit()) {
   27930       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
   27931               .addReg(X86::RIP)
   27932               .addImm(0)
   27933               .addReg(0)
   27934               .addMBB(restoreMBB)
   27935               .addReg(0);
   27936     } else {
   27937       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
   27938       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
   27939               .addReg(XII->getGlobalBaseReg(MF))
   27940               .addImm(0)
   27941               .addReg(0)
   27942               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
   27943               .addReg(0);
   27944     }
   27945   } else
   27946     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
   27947   // Store IP
   27948   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
   27949   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   27950     if (i == X86::AddrDisp)
   27951       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
   27952     else
   27953       MIB.add(MI.getOperand(MemOpndSlot + i));
   27954   }
   27955   if (!UseImmLabel)
   27956     MIB.addReg(LabelReg);
   27957   else
   27958     MIB.addMBB(restoreMBB);
   27959   MIB.setMemRefs(MMOBegin, MMOEnd);
   27960 
   27961   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
   27962     emitSetJmpShadowStackFix(MI, thisMBB);
   27963   }
   27964 
   27965   // Setup
   27966   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
   27967           .addMBB(restoreMBB);
   27968 
   27969   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   27970   MIB.addRegMask(RegInfo->getNoPreservedMask());
   27971   thisMBB->addSuccessor(mainMBB);
   27972   thisMBB->addSuccessor(restoreMBB);
   27973 
   27974   // mainMBB:
   27975   //  EAX = 0
   27976   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
   27977   mainMBB->addSuccessor(sinkMBB);
   27978 
   27979   // sinkMBB:
   27980   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
   27981           TII->get(X86::PHI), DstReg)
   27982     .addReg(mainDstReg).addMBB(mainMBB)
   27983     .addReg(restoreDstReg).addMBB(restoreMBB);
   27984 
   27985   // restoreMBB:
   27986   if (RegInfo->hasBasePointer(*MF)) {
   27987     const bool Uses64BitFramePtr =
   27988         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
   27989     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
   27990     X86FI->setRestoreBasePointer(MF);
   27991     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
   27992     unsigned BasePtr = RegInfo->getBaseRegister();
   27993     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
   27994     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
   27995                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
   27996       .setMIFlag(MachineInstr::FrameSetup);
   27997   }
   27998   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
   27999   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
   28000   restoreMBB->addSuccessor(sinkMBB);
   28001 
   28002   MI.eraseFromParent();
   28003   return sinkMBB;
   28004 }
   28005 
   28006 /// Fix the shadow stack using the previously saved SSP pointer.
   28007 /// \sa emitSetJmpShadowStackFix
   28008 /// \param [in] MI The temporary Machine Instruction for the builtin.
   28009 /// \param [in] MBB The Machine Basic Block that will be modified.
   28010 /// \return The sink MBB that will perform the future indirect branch.
   28011 MachineBasicBlock *
   28012 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
   28013                                              MachineBasicBlock *MBB) const {
   28014   DebugLoc DL = MI.getDebugLoc();
   28015   MachineFunction *MF = MBB->getParent();
   28016   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   28017   MachineRegisterInfo &MRI = MF->getRegInfo();
   28018 
   28019   // Memory Reference
   28020   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
   28021   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
   28022 
   28023   MVT PVT = getPointerTy(MF->getDataLayout());
   28024   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
   28025 
   28026   // checkSspMBB:
   28027   //         xor vreg1, vreg1
   28028   //         rdssp vreg1
   28029   //         test vreg1, vreg1
   28030   //         je sinkMBB   # Jump if Shadow Stack is not supported
   28031   // fallMBB:
   28032   //         mov buf+24/12(%rip), vreg2
   28033   //         sub vreg1, vreg2
   28034   //         jbe sinkMBB  # No need to fix the Shadow Stack
   28035   // fixShadowMBB:
   28036   //         shr 3/2, vreg2
   28037   //         incssp vreg2  # fix the SSP according to the lower 8 bits
   28038   //         shr 8, vreg2
   28039   //         je sinkMBB
   28040   // fixShadowLoopPrepareMBB:
   28041   //         shl vreg2
   28042   //         mov 128, vreg3
   28043   // fixShadowLoopMBB:
   28044   //         incssp vreg3
   28045   //         dec vreg2
   28046   //         jne fixShadowLoopMBB # Iterate until you finish fixing
   28047   //                              # the Shadow Stack
   28048   // sinkMBB:
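            // A note on the arithmetic above: incssp only honors the low 8 bits of its
            // operand, advancing the SSP by that many slots.  The slot delta is split
            // into (delta & 0xff), fixed with a single incssp, and (delta >> 8)
            // remaining 256-slot chunks; each chunk is handled as two incssp-by-128
            // steps, which is why the chunk count is shifted left once and the loop
            // bumps the SSP by 128 per iteration.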
   28049 
   28050   MachineFunction::iterator I = ++MBB->getIterator();
   28051   const BasicBlock *BB = MBB->getBasicBlock();
   28052 
   28053   MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
   28054   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
   28055   MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
   28056   MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
   28057   MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
   28058   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
   28059   MF->insert(I, checkSspMBB);
   28060   MF->insert(I, fallMBB);
   28061   MF->insert(I, fixShadowMBB);
   28062   MF->insert(I, fixShadowLoopPrepareMBB);
   28063   MF->insert(I, fixShadowLoopMBB);
   28064   MF->insert(I, sinkMBB);
   28065 
   28066   // Transfer the remainder of BB and its successor edges to sinkMBB.
   28067   sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
   28068                   MBB->end());
   28069   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   28070 
   28071   MBB->addSuccessor(checkSspMBB);
   28072 
   28073   // Initialize a register with zero.
   28074   unsigned ZReg = MRI.createVirtualRegister(PtrRC);
   28075   unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
   28076   BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
   28077       .addDef(ZReg)
   28078       .addReg(ZReg, RegState::Undef)
   28079       .addReg(ZReg, RegState::Undef);
   28080 
    28081   // Read the current SSP register value into the zeroed register.
   28082   unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
   28083   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
   28084   BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
   28085 
    28086   // Check whether the value read from the SSP register is zero; if it is, the
    28087   // shadow stack is not supported, so jump directly to the sink.
   28088   unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
   28089   BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
   28090       .addReg(SSPCopyReg)
   28091       .addReg(SSPCopyReg);
   28092   BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
   28093   checkSspMBB->addSuccessor(sinkMBB);
   28094   checkSspMBB->addSuccessor(fallMBB);
   28095 
   28096   // Reload the previously saved SSP register value.
   28097   unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
   28098   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
    28099   const int64_t SSPOffset = 3 * PVT.getStoreSize();
   28100   MachineInstrBuilder MIB =
   28101       BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
   28102   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   28103     if (i == X86::AddrDisp)
    28104       MIB.addDisp(MI.getOperand(i), SSPOffset);
   28105     else
   28106       MIB.add(MI.getOperand(i));
   28107   }
   28108   MIB.setMemRefs(MMOBegin, MMOEnd);
   28109 
   28110   // Subtract the current SSP from the previous SSP.
   28111   unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
   28112   unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
   28113   BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
   28114       .addReg(PrevSSPReg)
   28115       .addReg(SSPCopyReg);
   28116 
   28117   // Jump to sink in case PrevSSPReg <= SSPCopyReg.
   28118   BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
   28119   fallMBB->addSuccessor(sinkMBB);
   28120   fallMBB->addSuccessor(fixShadowMBB);
   28121 
   28122   // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
   28123   unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
   28124   unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
   28125   unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
   28126   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
   28127       .addReg(SspSubReg)
   28128       .addImm(Offset);
   28129 
    28130   // Increase the SSP using only the lower 8 bits of the delta.
   28131   unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
   28132   BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
   28133 
   28134   // Reset the lower 8 bits.
   28135   unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
   28136   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
   28137       .addReg(SspFirstShrReg)
   28138       .addImm(8);
   28139 
   28140   // Jump if the result of the shift is zero.
   28141   BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
   28142   fixShadowMBB->addSuccessor(sinkMBB);
   28143   fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
   28144 
   28145   // Do a single shift left.
   28146   unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
   28147   unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
   28148   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
   28149       .addReg(SspSecondShrReg);
   28150 
   28151   // Save the value 128 to a register (will be used next with incssp).
   28152   unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
   28153   unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
   28154   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
   28155       .addImm(128);
   28156   fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
   28157 
   28158   // Since incssp only looks at the lower 8 bits, we might need to do several
   28159   // iterations of incssp until we finish fixing the shadow stack.
   28160   unsigned DecReg = MRI.createVirtualRegister(PtrRC);
   28161   unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
   28162   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
   28163       .addReg(SspAfterShlReg)
   28164       .addMBB(fixShadowLoopPrepareMBB)
   28165       .addReg(DecReg)
   28166       .addMBB(fixShadowLoopMBB);
   28167 
   28168   // Every iteration we increase the SSP by 128.
   28169   BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
   28170 
   28171   // Every iteration we decrement the counter by 1.
   28172   unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
   28173   BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
   28174 
   28175   // Jump if the counter is not zero yet.
   28176   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
   28177   fixShadowLoopMBB->addSuccessor(sinkMBB);
   28178   fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
   28179 
   28180   return sinkMBB;
   28181 }
   28182 
   28183 MachineBasicBlock *
   28184 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
   28185                                      MachineBasicBlock *MBB) const {
   28186   DebugLoc DL = MI.getDebugLoc();
   28187   MachineFunction *MF = MBB->getParent();
   28188   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   28189   MachineRegisterInfo &MRI = MF->getRegInfo();
   28190 
   28191   // Memory Reference
   28192   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
   28193   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
   28194 
   28195   MVT PVT = getPointerTy(MF->getDataLayout());
   28196   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
   28197          "Invalid Pointer Size!");
   28198 
   28199   const TargetRegisterClass *RC =
   28200     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   28201   unsigned Tmp = MRI.createVirtualRegister(RC);
   28202   // Since FP is only updated here but NOT referenced, it's treated as GPR.
   28203   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   28204   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
   28205   unsigned SP = RegInfo->getStackRegister();
   28206 
   28207   MachineInstrBuilder MIB;
   28208 
   28209   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   28210   const int64_t SPOffset = 2 * PVT.getStoreSize();
   28211 
   28212   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
   28213   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
   28214 
   28215   MachineBasicBlock *thisMBB = MBB;
   28216 
    28217   // When CET and shadow stacks are enabled, we need to fix the Shadow Stack.
   28218   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
   28219     thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
   28220   }
   28221 
   28222   // Reload FP
   28223   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
   28224   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
   28225     MIB.add(MI.getOperand(i));
   28226   MIB.setMemRefs(MMOBegin, MMOEnd);
   28227 
   28228   // Reload IP
   28229   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
   28230   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   28231     if (i == X86::AddrDisp)
   28232       MIB.addDisp(MI.getOperand(i), LabelOffset);
   28233     else
   28234       MIB.add(MI.getOperand(i));
   28235   }
   28236   MIB.setMemRefs(MMOBegin, MMOEnd);
   28237 
   28238   // Reload SP
   28239   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
   28240   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
   28241     if (i == X86::AddrDisp)
   28242       MIB.addDisp(MI.getOperand(i), SPOffset);
   28243     else
   28244       MIB.add(MI.getOperand(i));
   28245   }
   28246   MIB.setMemRefs(MMOBegin, MMOEnd);
   28247 
   28248   // Jump
   28249   BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
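            // FP, the resume address (in Tmp) and SP have now been restored from the
            // buffer; the indirect jump transfers control to the restore block that
            // the matching setjmp recorded.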
   28250 
   28251   MI.eraseFromParent();
   28252   return thisMBB;
   28253 }
   28254 
   28255 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
   28256                                                MachineBasicBlock *MBB,
   28257                                                MachineBasicBlock *DispatchBB,
   28258                                                int FI) const {
   28259   DebugLoc DL = MI.getDebugLoc();
   28260   MachineFunction *MF = MBB->getParent();
   28261   MachineRegisterInfo *MRI = &MF->getRegInfo();
   28262   const X86InstrInfo *TII = Subtarget.getInstrInfo();
   28263 
   28264   MVT PVT = getPointerTy(MF->getDataLayout());
   28265   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
   28266 
   28267   unsigned Op = 0;
   28268   unsigned VR = 0;
   28269 
   28270   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
   28271                      !isPositionIndependent();
   28272 
   28273   if (UseImmLabel) {
   28274     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
   28275   } else {
   28276     const TargetRegisterClass *TRC =
   28277         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   28278     VR = MRI->createVirtualRegister(TRC);
   28279     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
   28280 
   28281     if (Subtarget.is64Bit())
   28282       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
   28283           .addReg(X86::RIP)
   28284           .addImm(1)
   28285           .addReg(0)
   28286           .addMBB(DispatchBB)
   28287           .addReg(0);
   28288     else
   28289       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
   28290           .addReg(0) /* TII->getGlobalBaseReg(MF) */
   28291           .addImm(1)
   28292           .addReg(0)
   28293           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
   28294           .addReg(0);
   28295   }
   28296 
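            // Offsets 56 (64-bit) and 36 (32-bit) are assumed to be the location of
            // the second __jbuf slot (the dispatch address) within the function
            // context that SjLjEHPrepare lays out.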
   28297   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
   28298   addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
   28299   if (UseImmLabel)
   28300     MIB.addMBB(DispatchBB);
   28301   else
   28302     MIB.addReg(VR);
   28303 }
   28304 
   28305 MachineBasicBlock *
   28306 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
   28307                                          MachineBasicBlock *BB) const {
   28308   DebugLoc DL = MI.getDebugLoc();
   28309   MachineFunction *MF = BB->getParent();
   28310   MachineFrameInfo &MFI = MF->getFrameInfo();
   28311   MachineRegisterInfo *MRI = &MF->getRegInfo();
   28312   const X86InstrInfo *TII = Subtarget.getInstrInfo();
   28313   int FI = MFI.getFunctionContextIndex();
   28314 
   28315   // Get a mapping of the call site numbers to all of the landing pads they're
   28316   // associated with.
   28317   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
   28318   unsigned MaxCSNum = 0;
   28319   for (auto &MBB : *MF) {
   28320     if (!MBB.isEHPad())
   28321       continue;
   28322 
   28323     MCSymbol *Sym = nullptr;
   28324     for (const auto &MI : MBB) {
   28325       if (MI.isDebugInstr())
   28326         continue;
   28327 
   28328       assert(MI.isEHLabel() && "expected EH_LABEL");
   28329       Sym = MI.getOperand(0).getMCSymbol();
   28330       break;
   28331     }
   28332 
   28333     if (!MF->hasCallSiteLandingPad(Sym))
   28334       continue;
   28335 
   28336     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
   28337       CallSiteNumToLPad[CSI].push_back(&MBB);
   28338       MaxCSNum = std::max(MaxCSNum, CSI);
   28339     }
   28340   }
   28341 
   28342   // Get an ordered list of the machine basic blocks for the jump table.
   28343   std::vector<MachineBasicBlock *> LPadList;
   28344   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
   28345   LPadList.reserve(CallSiteNumToLPad.size());
   28346 
   28347   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
   28348     for (auto &LP : CallSiteNumToLPad[CSI]) {
   28349       LPadList.push_back(LP);
   28350       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
   28351     }
   28352   }
   28353 
   28354   assert(!LPadList.empty() &&
   28355          "No landing pad destinations for the dispatch jump table!");
   28356 
   28357   // Create the MBBs for the dispatch code.
   28358 
   28359   // Shove the dispatch's address into the return slot in the function context.
   28360   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
   28361   DispatchBB->setIsEHPad(true);
   28362 
   28363   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
   28364   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
   28365   DispatchBB->addSuccessor(TrapBB);
   28366 
   28367   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
   28368   DispatchBB->addSuccessor(DispContBB);
   28369 
   28370   // Insert MBBs.
   28371   MF->push_back(DispatchBB);
   28372   MF->push_back(DispContBB);
   28373   MF->push_back(TrapBB);
   28374 
   28375   // Insert code into the entry block that creates and registers the function
   28376   // context.
   28377   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
   28378 
   28379   // Create the jump table and associated information
   28380   unsigned JTE = getJumpTableEncoding();
   28381   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
   28382   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
   28383 
   28384   const X86RegisterInfo &RI = TII->getRegisterInfo();
   28385   // Add a register mask with no preserved registers.  This results in all
   28386   // registers being marked as clobbered.
   28387   if (RI.hasBasePointer(*MF)) {
   28388     const bool FPIs64Bit =
   28389         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
   28390     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
   28391     MFI->setRestoreBasePointer(MF);
   28392 
   28393     unsigned FP = RI.getFrameRegister(*MF);
   28394     unsigned BP = RI.getBaseRegister();
   28395     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
   28396     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
   28397                  MFI->getRestoreBasePointerOffset())
   28398         .addRegMask(RI.getNoPreservedMask());
   28399   } else {
   28400     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
   28401         .addRegMask(RI.getNoPreservedMask());
   28402   }
   28403 
   28404   // IReg is used as an index in a memory operand and therefore can't be SP
   28405   unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
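            // Load the call-site index from the function context; it is assumed to sit
            // right after the chain pointer, hence offset 8 on 64-bit and 4 on 32-bit.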
   28406   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
   28407                     Subtarget.is64Bit() ? 8 : 4);
   28408   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
   28409       .addReg(IReg)
   28410       .addImm(LPadList.size());
   28411   BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
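            // Any call-site index at or beyond LPadList.size() has no landing pad and
            // is sent to the trap block.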
   28412 
   28413   if (Subtarget.is64Bit()) {
   28414     unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
   28415     unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
   28416 
   28417     // leaq .LJTI0_0(%rip), BReg
   28418     BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
   28419         .addReg(X86::RIP)
   28420         .addImm(1)
   28421         .addReg(0)
   28422         .addJumpTableIndex(MJTI)
   28423         .addReg(0);
   28424     // movzx IReg64, IReg
   28425     BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
   28426         .addImm(0)
   28427         .addReg(IReg)
   28428         .addImm(X86::sub_32bit);
   28429 
   28430     switch (JTE) {
   28431     case MachineJumpTableInfo::EK_BlockAddress:
   28432       // jmpq *(BReg,IReg64,8)
   28433       BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
   28434           .addReg(BReg)
   28435           .addImm(8)
   28436           .addReg(IReg64)
   28437           .addImm(0)
   28438           .addReg(0);
   28439       break;
   28440     case MachineJumpTableInfo::EK_LabelDifference32: {
   28441       unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
   28442       unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
   28443       unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
   28444 
   28445       // movl (BReg,IReg64,4), OReg
   28446       BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
   28447           .addReg(BReg)
   28448           .addImm(4)
   28449           .addReg(IReg64)
   28450           .addImm(0)
   28451           .addReg(0);
   28452       // movsx OReg64, OReg
   28453       BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
   28454       // addq BReg, OReg64, TReg
   28455       BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
   28456           .addReg(OReg64)
   28457           .addReg(BReg);
   28458       // jmpq *TReg
   28459       BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
   28460       break;
   28461     }
   28462     default:
   28463       llvm_unreachable("Unexpected jump table encoding");
   28464     }
   28465   } else {
   28466     // jmpl *.LJTI0_0(,IReg,4)
   28467     BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
   28468         .addReg(0)
   28469         .addImm(4)
   28470         .addReg(IReg)
   28471         .addJumpTableIndex(MJTI)
   28472         .addReg(0);
   28473   }
   28474 
   28475   // Add the jump table entries as successors to the MBB.
   28476   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
   28477   for (auto &LP : LPadList)
   28478     if (SeenMBBs.insert(LP).second)
   28479       DispContBB->addSuccessor(LP);
   28480 
   28481   // N.B. the order the invoke BBs are processed in doesn't matter here.
   28482   SmallVector<MachineBasicBlock *, 64> MBBLPads;
   28483   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
   28484   for (MachineBasicBlock *MBB : InvokeBBs) {
   28485     // Remove the landing pad successor from the invoke block and replace it
   28486     // with the new dispatch block.
   28487     // Keep a copy of Successors since it's modified inside the loop.
   28488     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
   28489                                                    MBB->succ_rend());
   28490     // FIXME: Avoid quadratic complexity.
   28491     for (auto MBBS : Successors) {
   28492       if (MBBS->isEHPad()) {
   28493         MBB->removeSuccessor(MBBS);
   28494         MBBLPads.push_back(MBBS);
   28495       }
   28496     }
   28497 
   28498     MBB->addSuccessor(DispatchBB);
   28499 
    28500     // Find the invoke call and mark all of the callee-saved registers as
    28501     // 'implicit defined' so that they're spilled.  This prevents later passes
    28502     // from moving instructions to before the EH block, where they would never
    28503     // be executed.
   28504     for (auto &II : reverse(*MBB)) {
   28505       if (!II.isCall())
   28506         continue;
   28507 
   28508       DenseMap<unsigned, bool> DefRegs;
   28509       for (auto &MOp : II.operands())
   28510         if (MOp.isReg())
   28511           DefRegs[MOp.getReg()] = true;
   28512 
   28513       MachineInstrBuilder MIB(*MF, &II);
   28514       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
   28515         unsigned Reg = SavedRegs[RI];
   28516         if (!DefRegs[Reg])
   28517           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
   28518       }
   28519 
   28520       break;
   28521     }
   28522   }
   28523 
   28524   // Mark all former landing pads as non-landing pads.  The dispatch is the only
   28525   // landing pad now.
   28526   for (auto &LP : MBBLPads)
   28527     LP->setIsEHPad(false);
   28528 
   28529   // The instruction is gone now.
   28530   MI.eraseFromParent();
   28531   return BB;
   28532 }
   28533 
   28534 MachineBasicBlock *
   28535 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   28536                                                MachineBasicBlock *BB) const {
   28537   MachineFunction *MF = BB->getParent();
   28538   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   28539   DebugLoc DL = MI.getDebugLoc();
   28540 
   28541   switch (MI.getOpcode()) {
   28542   default: llvm_unreachable("Unexpected instr type to insert");
   28543   case X86::TLS_addr32:
   28544   case X86::TLS_addr64:
   28545   case X86::TLS_base_addr32:
   28546   case X86::TLS_base_addr64:
   28547     return EmitLoweredTLSAddr(MI, BB);
   28548   case X86::RETPOLINE_CALL32:
   28549   case X86::RETPOLINE_CALL64:
   28550   case X86::RETPOLINE_TCRETURN32:
   28551   case X86::RETPOLINE_TCRETURN64:
   28552     return EmitLoweredRetpoline(MI, BB);
   28553   case X86::CATCHRET:
   28554     return EmitLoweredCatchRet(MI, BB);
   28555   case X86::CATCHPAD:
   28556     return EmitLoweredCatchPad(MI, BB);
   28557   case X86::SEG_ALLOCA_32:
   28558   case X86::SEG_ALLOCA_64:
   28559     return EmitLoweredSegAlloca(MI, BB);
   28560   case X86::TLSCall_32:
   28561   case X86::TLSCall_64:
   28562     return EmitLoweredTLSCall(MI, BB);
   28563   case X86::CMOV_FR32:
   28564   case X86::CMOV_FR64:
   28565   case X86::CMOV_F128:
   28566   case X86::CMOV_GR8:
   28567   case X86::CMOV_GR16:
   28568   case X86::CMOV_GR32:
   28569   case X86::CMOV_RFP32:
   28570   case X86::CMOV_RFP64:
   28571   case X86::CMOV_RFP80:
   28572   case X86::CMOV_V2F64:
   28573   case X86::CMOV_V2I64:
   28574   case X86::CMOV_V4F32:
   28575   case X86::CMOV_V4F64:
   28576   case X86::CMOV_V4I64:
   28577   case X86::CMOV_V16F32:
   28578   case X86::CMOV_V8F32:
   28579   case X86::CMOV_V8F64:
   28580   case X86::CMOV_V8I64:
   28581   case X86::CMOV_V8I1:
   28582   case X86::CMOV_V16I1:
   28583   case X86::CMOV_V32I1:
   28584   case X86::CMOV_V64I1:
   28585     return EmitLoweredSelect(MI, BB);
   28586 
   28587   case X86::RDFLAGS32:
   28588   case X86::RDFLAGS64: {
   28589     unsigned PushF =
   28590         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
   28591     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
   28592     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
   28593     // Permit reads of the EFLAGS and DF registers without them being defined.
   28594     // This intrinsic exists to read external processor state in flags, such as
   28595     // the trap flag, interrupt flag, and direction flag, none of which are
   28596     // modeled by the backend.
   28597     assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
   28598            "Unexpected register in operand!");
   28599     Push->getOperand(2).setIsUndef();
   28600     assert(Push->getOperand(3).getReg() == X86::DF &&
   28601            "Unexpected register in operand!");
   28602     Push->getOperand(3).setIsUndef();
   28603     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
   28604 
   28605     MI.eraseFromParent(); // The pseudo is gone now.
   28606     return BB;
   28607   }
   28608 
   28609   case X86::WRFLAGS32:
   28610   case X86::WRFLAGS64: {
   28611     unsigned Push =
   28612         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
   28613     unsigned PopF =
   28614         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
   28615     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
   28616     BuildMI(*BB, MI, DL, TII->get(PopF));
   28617 
   28618     MI.eraseFromParent(); // The pseudo is gone now.
   28619     return BB;
   28620   }
   28621 
   28622   case X86::RELEASE_FADD32mr:
   28623   case X86::RELEASE_FADD64mr:
   28624     return EmitLoweredAtomicFP(MI, BB);
   28625 
   28626   case X86::FP32_TO_INT16_IN_MEM:
   28627   case X86::FP32_TO_INT32_IN_MEM:
   28628   case X86::FP32_TO_INT64_IN_MEM:
   28629   case X86::FP64_TO_INT16_IN_MEM:
   28630   case X86::FP64_TO_INT32_IN_MEM:
   28631   case X86::FP64_TO_INT64_IN_MEM:
   28632   case X86::FP80_TO_INT16_IN_MEM:
   28633   case X86::FP80_TO_INT32_IN_MEM:
   28634   case X86::FP80_TO_INT64_IN_MEM: {
   28635     // Change the floating point control register to use "round towards zero"
   28636     // mode when truncating to an integer value.
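    // The emitted sequence is roughly (illustrative; slot/operand syntax is
    // schematic):
    //   fnstcw  <slot>           ; save the current control word
    //   movw    <slot>, %oldcw   ; remember it
    //   movw    $0xC7F, <slot>   ; RC = round toward zero, exceptions masked
    //   fldcw   <slot>           ; activate the truncating control word
    //   movw    %oldcw, <slot>   ; restore the in-memory image
    //   fistp   <dst>            ; IST_Fp*: integer store with truncation
    //   fldcw   <slot>           ; reload the original control word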
   28637     int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
   28638     addFrameReference(BuildMI(*BB, MI, DL,
   28639                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
   28640 
    // Load the old value of the control word so it can be restored later...
   28642     unsigned OldCW =
   28643       MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
   28644     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
   28645                       CWFrameIdx);
   28646 
    // Set the control word to round toward zero (0xC7F also masks all FP
    // exceptions)...
   28648     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
   28649       .addImm(0xC7F);
   28650 
   28651     // Reload the modified control word now...
   28652     addFrameReference(BuildMI(*BB, MI, DL,
   28653                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   28654 
    // Restore the in-memory image of the control word to its original value.
   28656     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
   28657       .addReg(OldCW);
   28658 
   28659     // Get the X86 opcode to use.
   28660     unsigned Opc;
   28661     switch (MI.getOpcode()) {
   28662     default: llvm_unreachable("illegal opcode!");
   28663     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
   28664     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
   28665     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
   28666     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
   28667     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
   28668     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
   28669     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
   28670     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
   28671     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
   28672     }
   28673 
   28674     X86AddressMode AM = getAddressFromInstr(&MI, 0);
   28675     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
   28676         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
   28677 
   28678     // Reload the original control word now.
   28679     addFrameReference(BuildMI(*BB, MI, DL,
   28680                               TII->get(X86::FLDCW16m)), CWFrameIdx);
   28681 
   28682     MI.eraseFromParent(); // The pseudo instruction is gone now.
   28683     return BB;
   28684   }
   28685   // Thread synchronization.
   28686   case X86::MONITOR:
   28687     return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
   28688   case X86::MONITORX:
   28689     return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
   28690 
   28691   // Cache line zero
   28692   case X86::CLZERO:
   28693     return emitClzero(&MI, BB, Subtarget);
   28694 
   28695   // PKU feature
   28696   case X86::WRPKRU:
   28697     return emitWRPKRU(MI, BB, Subtarget);
   28698   case X86::RDPKRU:
   28699     return emitRDPKRU(MI, BB, Subtarget);
   28700   // xbegin
   28701   case X86::XBEGIN:
   28702     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
   28703 
   28704   case X86::VASTART_SAVE_XMM_REGS:
   28705     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
   28706 
   28707   case X86::VAARG_64:
   28708     return EmitVAARG64WithCustomInserter(MI, BB);
   28709 
   28710   case X86::EH_SjLj_SetJmp32:
   28711   case X86::EH_SjLj_SetJmp64:
   28712     return emitEHSjLjSetJmp(MI, BB);
   28713 
   28714   case X86::EH_SjLj_LongJmp32:
   28715   case X86::EH_SjLj_LongJmp64:
   28716     return emitEHSjLjLongJmp(MI, BB);
   28717 
   28718   case X86::Int_eh_sjlj_setup_dispatch:
   28719     return EmitSjLjDispatchBlock(MI, BB);
   28720 
   28721   case TargetOpcode::STATEPOINT:
   28722     // As an implementation detail, STATEPOINT shares the STACKMAP format at
   28723     // this point in the process.  We diverge later.
   28724     return emitPatchPoint(MI, BB);
   28725 
   28726   case TargetOpcode::STACKMAP:
   28727   case TargetOpcode::PATCHPOINT:
   28728     return emitPatchPoint(MI, BB);
   28729 
   28730   case TargetOpcode::PATCHABLE_EVENT_CALL:
   28731     return emitXRayCustomEvent(MI, BB);
   28732 
   28733   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
   28734     return emitXRayTypedEvent(MI, BB);
   28735 
   28736   case X86::LCMPXCHG8B: {
   28737     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    // In addition to the four E[ABCD] registers implied by the encoding,
    // CMPXCHG8B requires a memory operand. If the target is i686 and the
    // current function needs a base pointer (which is ESI on i686), the
    // register allocator will not be able to allocate registers for an
    // address of the form X(%reg, %reg, Y): there would never be enough
    // unreserved registers during regalloc (without the base pointer the
    // only option would be X(%edi, %esi, Y)). We give the register
    // allocator a hand by precomputing the address in a new vreg using LEA.
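    // Illustrative example (schematic operands): an address such as
    //   8(%esi,%edi,4)            ; ESI is reserved as the base pointer
    // is rewritten as
    //   leal 8(%esi,%edi,4), %vreg
    //   cmpxchg8b (%vreg)
    // so the CMPXCHG8B itself only needs one plain register for its address.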
   28747 
    // If this is not i686 or there is no base pointer, there is nothing to do.
   28749     if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
   28750       return BB;
   28751 
    // Even though this code does not necessarily need the base pointer to be
    // ESI, we check for that anyway. The reason: if this assert fails,
    // something has changed in the compiler's base pointer handling, and that
    // change most probably has to be addressed here as well.
   28756     assert(TRI->getBaseRegister() == X86::ESI &&
   28757            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
   28758            "base pointer in mind");
   28759 
   28760     MachineRegisterInfo &MRI = MF->getRegInfo();
   28761     MVT SPTy = getPointerTy(MF->getDataLayout());
   28762     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
   28763     unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
   28764 
   28765     X86AddressMode AM = getAddressFromInstr(&MI, 0);
    // Regalloc does not need any help when the memory operand of CMPXCHG8B
    // does not use an index register.
   28768     if (AM.IndexReg == X86::NoRegister)
   28769       return BB;
   28770 
    // After X86TargetLowering::ReplaceNodeResults, CMPXCHG8B is glued to its
    // four operand definitions, which are the E[ABCD] registers. We skip over
    // them and then insert the LEA.
   28774     MachineBasicBlock::iterator MBBI(MI);
   28775     while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
   28776            MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
   28777       --MBBI;
   28778     addFullAddress(
   28779         BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
   28780 
   28781     setDirectAddressInInstr(&MI, 0, computedAddrVReg);
   28782 
   28783     return BB;
   28784   }
   28785   case X86::LCMPXCHG16B:
   28786     return BB;
   28787   case X86::LCMPXCHG8B_SAVE_EBX:
   28788   case X86::LCMPXCHG16B_SAVE_RBX: {
   28789     unsigned BasePtr =
   28790         MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
   28791     if (!BB->isLiveIn(BasePtr))
   28792       BB->addLiveIn(BasePtr);
   28793     return BB;
   28794   }
   28795   }
   28796 }
   28797 
   28798 //===----------------------------------------------------------------------===//
   28799 //                           X86 Optimization Hooks
   28800 //===----------------------------------------------------------------------===//
   28801 
   28802 bool
   28803 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
   28804                                                 const APInt &Demanded,
   28805                                                 TargetLoweringOpt &TLO) const {
   28806   // Only optimize Ands to prevent shrinking a constant that could be
   28807   // matched by movzx.
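  // Worked example (illustrative, i32 values): with Mask = 0x1FF and
  // Demanded = 0x0FF we get ShrunkMask = 0x0FF, Width = 8 and
  // ZeroExtendMask = 0x0FF. ZeroExtendMask differs from Mask and is a subset
  // of Mask | ~Demanded, so the AND constant is rewritten to 0x0FF, which can
  // later be matched as a movzbl-style zero extension.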
   28808   if (Op.getOpcode() != ISD::AND)
   28809     return false;
   28810 
   28811   EVT VT = Op.getValueType();
   28812 
   28813   // Ignore vectors.
   28814   if (VT.isVector())
   28815     return false;
   28816 
   28817   unsigned Size = VT.getSizeInBits();
   28818 
   28819   // Make sure the RHS really is a constant.
   28820   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   28821   if (!C)
   28822     return false;
   28823 
   28824   const APInt &Mask = C->getAPIntValue();
   28825 
   28826   // Clear all non-demanded bits initially.
   28827   APInt ShrunkMask = Mask & Demanded;
   28828 
   28829   // Find the width of the shrunk mask.
   28830   unsigned Width = ShrunkMask.getActiveBits();
   28831 
   28832   // If the mask is all 0s there's nothing to do here.
   28833   if (Width == 0)
   28834     return false;
   28835 
   28836   // Find the next power of 2 width, rounding up to a byte.
   28837   Width = PowerOf2Ceil(std::max(Width, 8U));
  // Clamp the width to the bit width of the value to handle illegal types.
   28839   Width = std::min(Width, Size);
   28840 
   28841   // Calculate a possible zero extend mask for this constant.
   28842   APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
   28843 
   28844   // If we aren't changing the mask, just return true to keep it and prevent
   28845   // the caller from optimizing.
   28846   if (ZeroExtendMask == Mask)
   28847     return true;
   28848 
   28849   // Make sure the new mask can be represented by a combination of mask bits
   28850   // and non-demanded bits.
   28851   if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
   28852     return false;
   28853 
   28854   // Replace the constant with the zero extend mask.
   28855   SDLoc DL(Op);
   28856   SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
   28857   SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
   28858   return TLO.CombineTo(Op, NewOp);
   28859 }
   28860 
   28861 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   28862                                                       KnownBits &Known,
   28863                                                       const APInt &DemandedElts,
   28864                                                       const SelectionDAG &DAG,
   28865                                                       unsigned Depth) const {
   28866   unsigned BitWidth = Known.getBitWidth();
   28867   unsigned Opc = Op.getOpcode();
   28868   EVT VT = Op.getValueType();
   28869   assert((Opc >= ISD::BUILTIN_OP_END ||
   28870           Opc == ISD::INTRINSIC_WO_CHAIN ||
   28871           Opc == ISD::INTRINSIC_W_CHAIN ||
   28872           Opc == ISD::INTRINSIC_VOID) &&
   28873          "Should use MaskedValueIsZero if you don't know whether Op"
   28874          " is a target node!");
   28875 
   28876   Known.resetAll();
   28877   switch (Opc) {
   28878   default: break;
   28879   case X86ISD::SETCC:
   28880     Known.Zero.setBitsFrom(1);
   28881     break;
   28882   case X86ISD::MOVMSK: {
   28883     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
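    // For example, MOVMSK of a v4f32 input produces a 4-bit mask in an i32
    // result, so bits [31:4] of the result are known to be zero.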
   28884     Known.Zero.setBitsFrom(NumLoBits);
   28885     break;
   28886   }
   28887   case X86ISD::PEXTRB:
   28888   case X86ISD::PEXTRW: {
   28889     SDValue Src = Op.getOperand(0);
   28890     EVT SrcVT = Src.getValueType();
   28891     APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
   28892                                             Op.getConstantOperandVal(1));
   28893     DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
   28894     Known = Known.zextOrTrunc(BitWidth);
   28895     Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
   28896     break;
   28897   }
   28898   case X86ISD::VSHLI:
   28899   case X86ISD::VSRLI: {
   28900     if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   28901       if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
   28902         Known.setAllZero();
   28903         break;
   28904       }
   28905 
   28906       DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
   28907       unsigned ShAmt = ShiftImm->getZExtValue();
   28908       if (Opc == X86ISD::VSHLI) {
   28909         Known.Zero <<= ShAmt;
   28910         Known.One <<= ShAmt;
   28911         // Low bits are known zero.
   28912         Known.Zero.setLowBits(ShAmt);
   28913       } else {
   28914         Known.Zero.lshrInPlace(ShAmt);
   28915         Known.One.lshrInPlace(ShAmt);
   28916         // High bits are known zero.
   28917         Known.Zero.setHighBits(ShAmt);
   28918       }
   28919     }
   28920     break;
   28921   }
   28922   case X86ISD::PACKUS: {
   28923     // PACKUS is just a truncation if the upper half is zero.
   28924     // TODO: Add DemandedElts support.
   28925     KnownBits Known2;
   28926     DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
   28927     DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
   28928     Known.One &= Known2.One;
   28929     Known.Zero &= Known2.Zero;
   28930     if (Known.countMinLeadingZeros() < BitWidth)
   28931       Known.resetAll();
   28932     Known = Known.trunc(BitWidth);
   28933     break;
   28934   }
   28935   case X86ISD::VZEXT: {
   28936     // TODO: Add DemandedElts support.
   28937     SDValue N0 = Op.getOperand(0);
   28938     unsigned NumElts = VT.getVectorNumElements();
   28939 
   28940     EVT SrcVT = N0.getValueType();
   28941     unsigned InNumElts = SrcVT.getVectorNumElements();
   28942     unsigned InBitWidth = SrcVT.getScalarSizeInBits();
   28943     assert(InNumElts >= NumElts && "Illegal VZEXT input");
   28944 
   28945     Known = KnownBits(InBitWidth);
   28946     APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
   28947     DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
   28948     Known = Known.zext(BitWidth);
   28949     Known.Zero.setBitsFrom(InBitWidth);
   28950     break;
   28951   }
   28952   case X86ISD::CMOV: {
   28953     DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
   28954     // If we don't know any bits, early out.
   28955     if (Known.isUnknown())
   28956       break;
   28957     KnownBits Known2;
   28958     DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
   28959 
   28960     // Only known if known in both the LHS and RHS.
   28961     Known.One &= Known2.One;
   28962     Known.Zero &= Known2.Zero;
   28963     break;
   28964   }
   28965   case X86ISD::UDIVREM8_ZEXT_HREG:
   28966     // TODO: Support more than just the zero extended bits?
   28967     if (Op.getResNo() != 1)
   28968       break;
   28969     // The remainder is zero extended.
   28970     Known.Zero.setBitsFrom(8);
   28971     break;
   28972   }
   28973 
   28974   // Handle target shuffles.
   28975   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
   28976   if (isTargetShuffle(Opc)) {
   28977     bool IsUnary;
   28978     SmallVector<int, 64> Mask;
   28979     SmallVector<SDValue, 2> Ops;
   28980     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
   28981                              IsUnary)) {
   28982       unsigned NumOps = Ops.size();
   28983       unsigned NumElts = VT.getVectorNumElements();
   28984       if (Mask.size() == NumElts) {
   28985         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
   28986         Known.Zero.setAllBits(); Known.One.setAllBits();
   28987         for (unsigned i = 0; i != NumElts; ++i) {
   28988           if (!DemandedElts[i])
   28989             continue;
   28990           int M = Mask[i];
   28991           if (M == SM_SentinelUndef) {
   28992             // For UNDEF elements, we don't know anything about the common state
   28993             // of the shuffle result.
   28994             Known.resetAll();
   28995             break;
   28996           } else if (M == SM_SentinelZero) {
   28997             Known.One.clearAllBits();
   28998             continue;
   28999           }
   29000           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
   29001                  "Shuffle index out of range");
   29002 
   29003           unsigned OpIdx = (unsigned)M / NumElts;
   29004           unsigned EltIdx = (unsigned)M % NumElts;
   29005           if (Ops[OpIdx].getValueType() != VT) {
   29006             // TODO - handle target shuffle ops with different value types.
   29007             Known.resetAll();
   29008             break;
   29009           }
   29010           DemandedOps[OpIdx].setBit(EltIdx);
   29011         }
   29012         // Known bits are the values that are shared by every demanded element.
   29013         for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
   29014           if (!DemandedOps[i])
   29015             continue;
   29016           KnownBits Known2;
   29017           DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
   29018           Known.One &= Known2.One;
   29019           Known.Zero &= Known2.Zero;
   29020         }
   29021       }
   29022     }
   29023   }
   29024 }
   29025 
   29026 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
   29027     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
   29028     unsigned Depth) const {
   29029   unsigned VTBits = Op.getScalarValueSizeInBits();
   29030   unsigned Opcode = Op.getOpcode();
   29031   switch (Opcode) {
   29032   case X86ISD::SETCC_CARRY:
   29033     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
   29034     return VTBits;
   29035 
   29036   case X86ISD::VSEXT: {
   29037     // TODO: Add DemandedElts support.
   29038     SDValue Src = Op.getOperand(0);
   29039     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
   29040     Tmp += VTBits - Src.getScalarValueSizeInBits();
   29041     return Tmp;
   29042   }
   29043 
   29044   case X86ISD::VTRUNC: {
   29045     // TODO: Add DemandedElts support.
   29046     SDValue Src = Op.getOperand(0);
   29047     unsigned NumSrcBits = Src.getScalarValueSizeInBits();
   29048     assert(VTBits < NumSrcBits && "Illegal truncation input type");
   29049     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
   29050     if (Tmp > (NumSrcBits - VTBits))
   29051       return Tmp - (NumSrcBits - VTBits);
   29052     return 1;
   29053   }
   29054 
   29055   case X86ISD::PACKSS: {
   29056     // PACKSS is just a truncation if the sign bits extend to the packed size.
   29057     // TODO: Add DemandedElts support.
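    // For example (illustrative): when packing i16 elements down to i8, if
    // both inputs have at least 12 sign bits then the result has 12 - 8 = 4.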
   29058     unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
   29059     unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
   29060     unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
   29061     unsigned Tmp = std::min(Tmp0, Tmp1);
   29062     if (Tmp > (SrcBits - VTBits))
   29063       return Tmp - (SrcBits - VTBits);
   29064     return 1;
   29065   }
   29066 
   29067   case X86ISD::VSHLI: {
   29068     SDValue Src = Op.getOperand(0);
   29069     APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
   29070     if (ShiftVal.uge(VTBits))
   29071       return VTBits; // Shifted all bits out --> zero.
   29072     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
   29073     if (ShiftVal.uge(Tmp))
   29074       return 1; // Shifted all sign bits out --> unknown.
   29075     return Tmp - ShiftVal.getZExtValue();
   29076   }
   29077 
   29078   case X86ISD::VSRAI: {
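    // For example (illustrative): a VSRAI by 8 on a 16-bit element with 3
    // known sign bits yields min(3 + 8, 16) = 11 sign bits.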
   29079     SDValue Src = Op.getOperand(0);
   29080     APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
   29081     if (ShiftVal.uge(VTBits - 1))
   29082       return VTBits; // Sign splat.
   29083     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
   29084     ShiftVal += Tmp;
   29085     return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
   29086   }
   29087 
   29088   case X86ISD::PCMPGT:
   29089   case X86ISD::PCMPEQ:
   29090   case X86ISD::CMPP:
   29091   case X86ISD::VPCOM:
   29092   case X86ISD::VPCOMU:
   29093     // Vector compares return zero/all-bits result values.
   29094     return VTBits;
   29095 
   29096   case X86ISD::CMOV: {
   29097     unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
   29098     if (Tmp0 == 1) return 1;  // Early out.
   29099     unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
   29100     return std::min(Tmp0, Tmp1);
   29101   }
   29102   case X86ISD::SDIVREM8_SEXT_HREG:
   29103     // TODO: Support more than just the sign extended bits?
   29104     if (Op.getResNo() != 1)
   29105       break;
   29106     // The remainder is sign extended.
   29107     return VTBits - 7;
   29108   }
   29109 
   29110   // Fallback case.
   29111   return 1;
   29112 }
   29113 
   29114 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
   29115   if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
   29116     return N->getOperand(0);
   29117   return N;
   29118 }
   29119 
   29120 /// Returns true (and the GlobalValue and the offset) if the node is a
   29121 /// GlobalAddress + offset.
   29122 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   29123                                        const GlobalValue* &GA,
   29124                                        int64_t &Offset) const {
   29125   if (N->getOpcode() == X86ISD::Wrapper) {
   29126     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
   29127       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
   29128       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
   29129       return true;
   29130     }
   29131   }
   29132   return TargetLowering::isGAPlusOffset(N, GA, Offset);
   29133 }
   29134 
   29135 // Attempt to match a combined shuffle mask against supported unary shuffle
   29136 // instructions.
   29137 // TODO: Investigate sharing more of this with shuffle lowering.
   29138 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   29139                                     bool AllowFloatDomain, bool AllowIntDomain,
   29140                                     SDValue &V1, const SDLoc &DL,
   29141                                     SelectionDAG &DAG,
   29142                                     const X86Subtarget &Subtarget,
   29143                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
   29144   unsigned NumMaskElts = Mask.size();
   29145   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
   29146 
   29147   // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
   29148   if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
   29149       isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
   29150     Shuffle = X86ISD::VZEXT_MOVL;
   29151     SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
   29152     return true;
   29153   }
   29154 
   29155   // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
   29156   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
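  // For example (illustrative, assuming SSE4.1 and the integer domain): a
  // v8i16 mask {0, Z, Z, Z, 1, Z, Z, Z} (Z = undef/zero) matches at
  // Scale == 4 and becomes a v8i16 -> v2i64 ZERO_EXTEND_VECTOR_INREG.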
   29157   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
   29158                          (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
   29159     unsigned MaxScale = 64 / MaskEltSize;
   29160     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
   29161       bool Match = true;
   29162       unsigned NumDstElts = NumMaskElts / Scale;
   29163       for (unsigned i = 0; i != NumDstElts && Match; ++i) {
   29164         Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
   29165         Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
   29166       }
   29167       if (Match) {
   29168         unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
   29169         MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
   29170                                             MVT::getIntegerVT(MaskEltSize);
   29171         SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
   29172 
   29173         if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
   29174           V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
   29175           Shuffle = unsigned(X86ISD::VZEXT);
   29176         } else
   29177           Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
   29178 
   29179         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
   29180         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
   29181         return true;
   29182       }
   29183     }
   29184   }
   29185 
  // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
   29187   if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
   29188       isUndefOrEqual(Mask[0], 0) &&
   29189       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
   29190     Shuffle = X86ISD::VZEXT_MOVL;
   29191     SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
   29192     return true;
   29193   }
   29194 
  // Check if we have SSE3, which will let us use MOVDDUP etc. These
  // instructions are no slower than UNPCKLPD, but they can fold their input
  // operand even from an unaligned memory load.
   29198   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
   29199     if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
   29200       Shuffle = X86ISD::MOVDDUP;
   29201       SrcVT = DstVT = MVT::v2f64;
   29202       return true;
   29203     }
   29204     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
   29205       Shuffle = X86ISD::MOVSLDUP;
   29206       SrcVT = DstVT = MVT::v4f32;
   29207       return true;
   29208     }
   29209     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
   29210       Shuffle = X86ISD::MOVSHDUP;
   29211       SrcVT = DstVT = MVT::v4f32;
   29212       return true;
   29213     }
   29214   }
   29215 
   29216   if (MaskVT.is256BitVector() && AllowFloatDomain) {
   29217     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
   29218     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
   29219       Shuffle = X86ISD::MOVDDUP;
   29220       SrcVT = DstVT = MVT::v4f64;
   29221       return true;
   29222     }
   29223     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
   29224       Shuffle = X86ISD::MOVSLDUP;
   29225       SrcVT = DstVT = MVT::v8f32;
   29226       return true;
   29227     }
   29228     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
   29229       Shuffle = X86ISD::MOVSHDUP;
   29230       SrcVT = DstVT = MVT::v8f32;
   29231       return true;
   29232     }
   29233   }
   29234 
   29235   if (MaskVT.is512BitVector() && AllowFloatDomain) {
   29236     assert(Subtarget.hasAVX512() &&
   29237            "AVX512 required for 512-bit vector shuffles");
   29238     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
   29239       Shuffle = X86ISD::MOVDDUP;
   29240       SrcVT = DstVT = MVT::v8f64;
   29241       return true;
   29242     }
   29243     if (isTargetShuffleEquivalent(
   29244             Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
   29245       Shuffle = X86ISD::MOVSLDUP;
   29246       SrcVT = DstVT = MVT::v16f32;
   29247       return true;
   29248     }
   29249     if (isTargetShuffleEquivalent(
   29250             Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
   29251       Shuffle = X86ISD::MOVSHDUP;
   29252       SrcVT = DstVT = MVT::v16f32;
   29253       return true;
   29254     }
   29255   }
   29256 
   29257   // Attempt to match against broadcast-from-vector.
   29258   if (Subtarget.hasAVX2()) {
   29259     SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
   29260     if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
   29261       SrcVT = DstVT = MaskVT;
   29262       Shuffle = X86ISD::VBROADCAST;
   29263       return true;
   29264     }
   29265   }
   29266 
   29267   return false;
   29268 }
   29269 
   29270 // Attempt to match a combined shuffle mask against supported unary immediate
   29271 // permute instructions.
   29272 // TODO: Investigate sharing more of this with shuffle lowering.
   29273 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   29274                                            const APInt &Zeroable,
   29275                                            bool AllowFloatDomain,
   29276                                            bool AllowIntDomain,
   29277                                            const X86Subtarget &Subtarget,
   29278                                            unsigned &Shuffle, MVT &ShuffleVT,
   29279                                            unsigned &PermuteImm) {
   29280   unsigned NumMaskElts = Mask.size();
   29281   unsigned InputSizeInBits = MaskVT.getSizeInBits();
   29282   unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
   29283   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
   29284 
   29285   bool ContainsZeros =
   29286       llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
   29287 
  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
   29289   if (!ContainsZeros && MaskScalarSizeInBits == 64) {
   29290     // Check for lane crossing permutes.
   29291     if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
   29292       // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
   29293       if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
   29294         Shuffle = X86ISD::VPERMI;
   29295         ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
   29296         PermuteImm = getV4X86ShuffleImm(Mask);
   29297         return true;
   29298       }
   29299       if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
   29300         SmallVector<int, 4> RepeatedMask;
   29301         if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
   29302           Shuffle = X86ISD::VPERMI;
   29303           ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
   29304           PermuteImm = getV4X86ShuffleImm(RepeatedMask);
   29305           return true;
   29306         }
   29307       }
   29308     } else if (AllowFloatDomain && Subtarget.hasAVX()) {
   29309       // VPERMILPD can permute with a non-repeating shuffle.
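      // For example (illustrative): a v4f64 mask {1, 0, 3, 2} yields
      // PermuteImm = 0b0101, i.e. vpermilpd $0x5.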
   29310       Shuffle = X86ISD::VPERMILPI;
   29311       ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
   29312       PermuteImm = 0;
   29313       for (int i = 0, e = Mask.size(); i != e; ++i) {
   29314         int M = Mask[i];
   29315         if (M == SM_SentinelUndef)
   29316           continue;
   29317         assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
   29318         PermuteImm |= (M & 1) << i;
   29319       }
   29320       return true;
   29321     }
   29322   }
   29323 
   29324   // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
  // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
   29327   if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
   29328       !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
   29329     SmallVector<int, 4> RepeatedMask;
   29330     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
   29331       // Narrow the repeated mask to create 32-bit element permutes.
   29332       SmallVector<int, 4> WordMask = RepeatedMask;
   29333       if (MaskScalarSizeInBits == 64)
   29334         scaleShuffleMask<int>(2, RepeatedMask, WordMask);
   29335 
   29336       Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
   29337       ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
   29338       ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
   29339       PermuteImm = getV4X86ShuffleImm(WordMask);
   29340       return true;
   29341     }
   29342   }
   29343 
   29344   // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
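  // For example (illustrative): a v8i16 mask {1, 0, 3, 2, 4, 5, 6, 7} matches
  // PSHUFLW with PermuteImm = 0xB1, leaving the upper four lanes untouched.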
   29345   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
   29346     SmallVector<int, 4> RepeatedMask;
   29347     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
   29348       ArrayRef<int> LoMask(Mask.data() + 0, 4);
   29349       ArrayRef<int> HiMask(Mask.data() + 4, 4);
   29350 
   29351       // PSHUFLW: permute lower 4 elements only.
   29352       if (isUndefOrInRange(LoMask, 0, 4) &&
   29353           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
   29354         Shuffle = X86ISD::PSHUFLW;
   29355         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
   29356         PermuteImm = getV4X86ShuffleImm(LoMask);
   29357         return true;
   29358       }
   29359 
   29360       // PSHUFHW: permute upper 4 elements only.
   29361       if (isUndefOrInRange(HiMask, 4, 8) &&
   29362           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
   29363         // Offset the HiMask so that we can create the shuffle immediate.
   29364         int OffsetHiMask[4];
   29365         for (int i = 0; i != 4; ++i)
   29366           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
   29367 
   29368         Shuffle = X86ISD::PSHUFHW;
   29369         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
   29370         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
   29371         return true;
   29372       }
   29373     }
   29374   }
   29375 
   29376   // Attempt to match against byte/bit shifts.
   29377   // FIXME: Add 512-bit support.
   29378   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
   29379                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
   29380     int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
   29381                                              MaskScalarSizeInBits, Mask,
   29382                                              0, Zeroable, Subtarget);
   29383     if (0 < ShiftAmt) {
   29384       PermuteImm = (unsigned)ShiftAmt;
   29385       return true;
   29386     }
   29387   }
   29388 
   29389   return false;
   29390 }
   29391 
   29392 // Attempt to match a combined unary shuffle mask against supported binary
   29393 // shuffle instructions.
   29394 // TODO: Investigate sharing more of this with shuffle lowering.
   29395 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   29396                                      bool AllowFloatDomain, bool AllowIntDomain,
   29397                                      SDValue &V1, SDValue &V2, const SDLoc &DL,
   29398                                      SelectionDAG &DAG,
   29399                                      const X86Subtarget &Subtarget,
   29400                                      unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
   29401                                      bool IsUnary) {
   29402   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
   29403 
   29404   if (MaskVT.is128BitVector()) {
   29405     if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
   29406       V2 = V1;
   29407       V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
   29408       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
   29409       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
   29410       return true;
   29411     }
   29412     if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
   29413       V2 = V1;
   29414       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
   29415       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
   29416       return true;
   29417     }
   29418     if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
   29419         (AllowFloatDomain || !Subtarget.hasSSE41())) {
   29420       std::swap(V1, V2);
   29421       Shuffle = X86ISD::MOVSD;
   29422       SrcVT = DstVT = MVT::v2f64;
   29423       return true;
   29424     }
   29425     if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
   29426         (AllowFloatDomain || !Subtarget.hasSSE41())) {
   29427       Shuffle = X86ISD::MOVSS;
   29428       SrcVT = DstVT = MVT::v4f32;
   29429       return true;
   29430     }
   29431   }
   29432 
   29433   // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
  // TODO: Add support for 256/512-bit types.
   29435   if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
   29436     if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
   29437                                    Subtarget)) {
   29438       DstVT = MaskVT;
   29439       return true;
   29440     }
   29441   }
   29442 
   29443   // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
   29444   if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
   29445       (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
   29446       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
   29447       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
   29448       (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
   29449     if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
   29450                                     DAG, Subtarget)) {
   29451       SrcVT = DstVT = MaskVT;
   29452       if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
   29453         SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
   29454       return true;
   29455     }
   29456   }
   29457 
   29458   return false;
   29459 }
   29460 
   29461 static bool matchBinaryPermuteVectorShuffle(
   29462     MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
   29463     bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
   29464     const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
   29465     unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
   29466   unsigned NumMaskElts = Mask.size();
   29467   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
   29468 
   29469   // Attempt to match against PALIGNR byte rotate.
   29470   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
   29471                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
   29472     int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
   29473     if (0 < ByteRotation) {
   29474       Shuffle = X86ISD::PALIGNR;
   29475       ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
   29476       PermuteImm = ByteRotation;
   29477       return true;
   29478     }
   29479   }
   29480 
   29481   // Attempt to combine to X86ISD::BLENDI.
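  // For example (illustrative, assuming SSE4.1): a v4f32 mask {0, 5, 2, 7}
  // takes lanes 1 and 3 from V2 and becomes a BLENDI with
  // PermuteImm = 0b1010 (0xA).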
   29482   if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
   29483                             (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
   29484       (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
   29485     uint64_t BlendMask = 0;
   29486     bool ForceV1Zero = false, ForceV2Zero = false;
   29487     SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
   29488     if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
   29489                                   BlendMask)) {
   29490       if (MaskVT == MVT::v16i16) {
   29491         // We can only use v16i16 PBLENDW if the lanes are repeated.
   29492         SmallVector<int, 8> RepeatedMask;
   29493         if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
   29494                                         RepeatedMask)) {
   29495           assert(RepeatedMask.size() == 8 &&
   29496                  "Repeated mask size doesn't match!");
   29497           PermuteImm = 0;
   29498           for (int i = 0; i < 8; ++i)
   29499             if (RepeatedMask[i] >= 8)
   29500               PermuteImm |= 1 << i;
   29501           V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
   29502           V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
   29503           Shuffle = X86ISD::BLENDI;
   29504           ShuffleVT = MaskVT;
   29505           return true;
   29506         }
   29507       } else {
   29508         // Determine a type compatible with X86ISD::BLENDI.
   29509         ShuffleVT = MaskVT;
   29510         if (Subtarget.hasAVX2()) {
   29511           if (ShuffleVT == MVT::v4i64)
   29512             ShuffleVT = MVT::v8i32;
   29513           else if (ShuffleVT == MVT::v2i64)
   29514             ShuffleVT = MVT::v4i32;
   29515         } else {
   29516           if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
   29517             ShuffleVT = MVT::v8i16;
   29518           else if (ShuffleVT == MVT::v4i64)
   29519             ShuffleVT = MVT::v4f64;
   29520           else if (ShuffleVT == MVT::v8i32)
   29521             ShuffleVT = MVT::v8f32;
   29522         }
   29523 
   29524         if (!ShuffleVT.isFloatingPoint()) {
   29525           int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
   29526           BlendMask =
   29527               scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
   29528           ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
   29529           ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
   29530         }
   29531 
   29532         V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
   29533         V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
   29534         PermuteImm = (unsigned)BlendMask;
   29535         Shuffle = X86ISD::BLENDI;
   29536         return true;
   29537       }
   29538     }
   29539   }
   29540 
   29541   // Attempt to combine to INSERTPS.
   29542   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
   29543       MaskVT.is128BitVector()) {
   29544     if (Zeroable.getBoolValue() &&
   29545         matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
   29546       Shuffle = X86ISD::INSERTPS;
   29547       ShuffleVT = MVT::v4f32;
   29548       return true;
   29549     }
   29550   }
   29551 
   29552   // Attempt to combine to SHUFPD.
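  // For example (illustrative): a v2f64 mask {1, 2} takes lane 0 from V1[1]
  // and lane 1 from V2[0], giving SHUFPD with PermuteImm = 0x1.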
   29553   if (AllowFloatDomain && EltSizeInBits == 64 &&
   29554       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
   29555        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
   29556        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
   29557     if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
   29558       Shuffle = X86ISD::SHUFP;
   29559       ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
   29560       return true;
   29561     }
   29562   }
   29563 
   29564   // Attempt to combine to SHUFPS.
   29565   if (AllowFloatDomain && EltSizeInBits == 32 &&
   29566       ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
   29567        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
   29568        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
   29569     SmallVector<int, 4> RepeatedMask;
   29570     if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask to determine whether it is just
      // referencing one of the vectors, is zeroable, or is entirely undef.
   29573       auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
   29574         int M0 = RepeatedMask[Offset];
   29575         int M1 = RepeatedMask[Offset + 1];
   29576 
   29577         if (isUndefInRange(RepeatedMask, Offset, 2)) {
   29578           return DAG.getUNDEF(MaskVT);
   29579         } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
   29580           S0 = (SM_SentinelUndef == M0 ? -1 : 0);
   29581           S1 = (SM_SentinelUndef == M1 ? -1 : 1);
   29582           return getZeroVector(MaskVT, Subtarget, DAG, DL);
   29583         } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
   29584           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
   29585           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
   29586           return V1;
   29587         } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
   29588           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
   29589           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
   29590           return V2;
   29591         }
   29592 
   29593         return SDValue();
   29594       };
   29595 
   29596       int ShufMask[4] = {-1, -1, -1, -1};
   29597       SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
   29598       SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
   29599 
   29600       if (Lo && Hi) {
   29601         V1 = Lo;
   29602         V2 = Hi;
   29603         Shuffle = X86ISD::SHUFP;
   29604         ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
   29605         PermuteImm = getV4X86ShuffleImm(ShufMask);
   29606         return true;
   29607       }
   29608     }
   29609   }
   29610 
   29611   return false;
   29612 }
   29613 
   29614 /// Combine an arbitrary chain of shuffles into a single instruction if
   29615 /// possible.
   29616 ///
   29617 /// This is the leaf of the recursive combine below. When we have found some
   29618 /// chain of single-use x86 shuffle instructions and accumulated the combined
   29619 /// shuffle mask represented by them, this will try to pattern match that mask
   29620 /// into either a single instruction if there is a special purpose instruction
   29621 /// for this operation, or into a PSHUFB instruction which is a fully general
   29622 /// instruction but should only be used to replace chains over a certain depth.
   29623 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   29624                                       ArrayRef<int> BaseMask, int Depth,
   29625                                       bool HasVariableMask, SelectionDAG &DAG,
   29626                                       const X86Subtarget &Subtarget) {
   29627   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
   29628   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
   29629          "Unexpected number of shuffle inputs!");
   29630 
  // Find the inputs that enter the chain. Note that multiple uses are OK
  // here; we're not going to remove the operands we find.
   29633   bool UnaryShuffle = (Inputs.size() == 1);
   29634   SDValue V1 = peekThroughBitcasts(Inputs[0]);
   29635   SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
   29636                              : peekThroughBitcasts(Inputs[1]));
   29637 
   29638   MVT VT1 = V1.getSimpleValueType();
   29639   MVT VT2 = V2.getSimpleValueType();
   29640   MVT RootVT = Root.getSimpleValueType();
   29641   assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
   29642          VT2.getSizeInBits() == RootVT.getSizeInBits() &&
   29643          "Vector size mismatch");
   29644 
   29645   SDLoc DL(Root);
   29646   SDValue Res;
   29647 
   29648   unsigned NumBaseMaskElts = BaseMask.size();
   29649   if (NumBaseMaskElts == 1) {
   29650     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
   29651     return DAG.getBitcast(RootVT, V1);
   29652   }
   29653 
   29654   unsigned RootSizeInBits = RootVT.getSizeInBits();
   29655   unsigned NumRootElts = RootVT.getVectorNumElements();
   29656   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
   29657   bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
   29658                      (RootVT.isFloatingPoint() && Depth >= 2) ||
   29659                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
   29660 
  // Don't combine if we are an AVX512/EVEX target and the mask element size
   29662   // is different from the root element size - this would prevent writemasks
   29663   // from being reused.
   29664   // TODO - this currently prevents all lane shuffles from occurring.
   29665   // TODO - check for writemasks usage instead of always preventing combining.
   29666   // TODO - attempt to narrow Mask back to writemask size.
   29667   bool IsEVEXShuffle =
   29668       RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
   29669 
   29670   // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
   29671 
   29672   // Handle 128-bit lane shuffles of 256-bit vectors.
   29673   // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
   29674   // we need to use the zeroing feature.
   29675   // TODO - this should support binary shuffles.
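  // For example (illustrative, without AVX2): a widened base mask {1, 0}
  // swaps the two 128-bit halves and is emitted as VPERM2X128 with an
  // immediate of 0x01; an undef/zero half sets the 0x8 bit of its nibble
  // instead, zeroing that half.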
   29676   if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
   29677       !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
   29678       !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
   29679     if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
   29680       return SDValue(); // Nothing to do!
   29681     MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
   29682     unsigned PermMask = 0;
   29683     PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
   29684     PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
   29685 
   29686     Res = DAG.getBitcast(ShuffleVT, V1);
   29687     Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
   29688                       DAG.getUNDEF(ShuffleVT),
   29689                       DAG.getConstant(PermMask, DL, MVT::i8));
   29690     return DAG.getBitcast(RootVT, Res);
   29691   }
   29692 
   29693   // For masks that have been widened to 128-bit elements or more,
   29694   // narrow back down to 64-bit elements.
   29695   SmallVector<int, 64> Mask;
   29696   if (BaseMaskEltSizeInBits > 64) {
   29697     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
   29698     int MaskScale = BaseMaskEltSizeInBits / 64;
   29699     scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
   29700   } else {
   29701     Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
   29702   }
   29703 
   29704   unsigned NumMaskElts = Mask.size();
   29705   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
   29706 
   29707   // Determine the effective mask value type.
   29708   FloatDomain &= (32 <= MaskEltSizeInBits);
   29709   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
   29710                            : MVT::getIntegerVT(MaskEltSizeInBits);
   29711   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
   29712 
   29713   // Only allow legal mask types.
   29714   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
   29715     return SDValue();
   29716 
   29717   // Attempt to match the mask against known shuffle patterns.
   29718   MVT ShuffleSrcVT, ShuffleVT;
   29719   unsigned Shuffle, PermuteImm;
   29720 
   29721   // Which shuffle domains are permitted?
   29722   // Permit domain crossing at higher combine depths.
   29723   bool AllowFloatDomain = FloatDomain || (Depth > 3);
   29724   bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
   29725                         (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
   29726 
   29727   // Determine zeroable mask elements.
   29728   APInt Zeroable(NumMaskElts, 0);
   29729   for (unsigned i = 0; i != NumMaskElts; ++i)
   29730     if (isUndefOrZero(Mask[i]))
   29731       Zeroable.setBit(i);
   29732 
   29733   if (UnaryShuffle) {
    // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
   29735     // directly if we don't shuffle the lower element and we shuffle the upper
   29736     // (zero) elements within themselves.
   29737     if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
   29738         (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
   29739       unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
   29740       ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
   29741       if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
   29742           isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
   29743         return DAG.getBitcast(RootVT, V1);
   29744       }
   29745     }
   29746 
   29747     SDValue NewV1 = V1; // Save operand in case early exit happens.
   29748     if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
   29749                                 NewV1, DL, DAG, Subtarget, Shuffle,
   29750                                 ShuffleSrcVT, ShuffleVT) &&
   29751         (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
   29752       if (Depth == 1 && Root.getOpcode() == Shuffle)
   29753         return SDValue(); // Nothing to do!
   29754       Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
   29755       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
   29756       return DAG.getBitcast(RootVT, Res);
   29757     }
   29758 
   29759     if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
   29760                                        AllowIntDomain, Subtarget, Shuffle,
   29761                                        ShuffleVT, PermuteImm) &&
   29762         (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
   29763       if (Depth == 1 && Root.getOpcode() == Shuffle)
   29764         return SDValue(); // Nothing to do!
   29765       Res = DAG.getBitcast(ShuffleVT, V1);
   29766       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
   29767                         DAG.getConstant(PermuteImm, DL, MVT::i8));
   29768       return DAG.getBitcast(RootVT, Res);
   29769     }
   29770   }
   29771 
   29772   SDValue NewV1 = V1; // Save operands in case early exit happens.
   29773   SDValue NewV2 = V2;
   29774   if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
   29775                                NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
   29776                                ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
   29777       (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
   29778     if (Depth == 1 && Root.getOpcode() == Shuffle)
   29779       return SDValue(); // Nothing to do!
   29780     NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
   29781     NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
   29782     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
   29783     return DAG.getBitcast(RootVT, Res);
   29784   }
   29785 
   29786   NewV1 = V1; // Save operands in case early exit happens.
   29787   NewV2 = V2;
   29788   if (matchBinaryPermuteVectorShuffle(
   29789           MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
   29790           NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
   29791       (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
   29792     if (Depth == 1 && Root.getOpcode() == Shuffle)
   29793       return SDValue(); // Nothing to do!
   29794     NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
   29795     NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
   29796     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
   29797                       DAG.getConstant(PermuteImm, DL, MVT::i8));
   29798     return DAG.getBitcast(RootVT, Res);
   29799   }
   29800 
   29801   // Typically from here on, we need an integer version of MaskVT.
   29802   MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
   29803   IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
   29804 
   29805   // Annoyingly, SSE4A instructions don't map into the above match helpers.
   29806   if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
   29807     uint64_t BitLen, BitIdx;
   29808     if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
   29809                                   Zeroable)) {
   29810       if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
   29811         return SDValue(); // Nothing to do!
   29812       V1 = DAG.getBitcast(IntMaskVT, V1);
   29813       Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
   29814                         DAG.getConstant(BitLen, DL, MVT::i8),
   29815                         DAG.getConstant(BitIdx, DL, MVT::i8));
   29816       return DAG.getBitcast(RootVT, Res);
   29817     }
   29818 
   29819     if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
   29820       if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
   29821         return SDValue(); // Nothing to do!
   29822       V1 = DAG.getBitcast(IntMaskVT, V1);
   29823       V2 = DAG.getBitcast(IntMaskVT, V2);
   29824       Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
   29825                         DAG.getConstant(BitLen, DL, MVT::i8),
   29826                         DAG.getConstant(BitIdx, DL, MVT::i8));
   29827       return DAG.getBitcast(RootVT, Res);
   29828     }
   29829   }
   29830 
   29831   // Don't try to re-form single instruction chains under any circumstances now
   29832   // that we've done encoding canonicalization for them.
   29833   if (Depth < 2)
   29834     return SDValue();
   29835 
   29836   // Depth threshold above which we can efficiently use variable mask shuffles.
   29837   int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
   29838   bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
   29839 
   29840   bool MaskContainsZeros =
   29841       any_of(Mask, [](int M) { return M == SM_SentinelZero; });
   29842 
   29843   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
   29844     // If we have a single input lane-crossing shuffle then lower to VPERMV.
   29845     if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
   29846         ((Subtarget.hasAVX2() &&
   29847           (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
   29848          (Subtarget.hasAVX512() &&
   29849           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
   29850            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
   29851          (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
   29852          (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
   29853          (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
   29854          (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
   29855       SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
   29856       Res = DAG.getBitcast(MaskVT, V1);
   29857       Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
   29858       return DAG.getBitcast(RootVT, Res);
   29859     }
   29860 
   29861     // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
   29862     // vector as the second source.
   29863     if (UnaryShuffle && AllowVariableMask &&
   29864         ((Subtarget.hasAVX512() &&
   29865           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
   29866            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
   29867          (Subtarget.hasVLX() &&
   29868           (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
   29869            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
   29870          (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
   29871          (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
   29872          (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
   29873          (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
   29874       // Adjust shuffle mask - replace SM_SentinelZero with second source index.
   29875       for (unsigned i = 0; i != NumMaskElts; ++i)
   29876         if (Mask[i] == SM_SentinelZero)
   29877           Mask[i] = NumMaskElts + i;
   29878 
   29879       SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
   29880       Res = DAG.getBitcast(MaskVT, V1);
   29881       SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
   29882       Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
   29883       return DAG.getBitcast(RootVT, Res);
   29884     }
   29885 
   29886     // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
   29887     if (AllowVariableMask && !MaskContainsZeros &&
   29888         ((Subtarget.hasAVX512() &&
   29889           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
   29890            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
   29891          (Subtarget.hasVLX() &&
   29892           (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
   29893            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
   29894          (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
   29895          (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
   29896          (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
   29897          (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
   29898       SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
   29899       V1 = DAG.getBitcast(MaskVT, V1);
   29900       V2 = DAG.getBitcast(MaskVT, V2);
   29901       Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
   29902       return DAG.getBitcast(RootVT, Res);
   29903     }
   29904     return SDValue();
   29905   }
   29906 
   29907   // See if we can combine a single input shuffle with zeros to a bit-mask,
   29908   // which is much simpler than any shuffle.
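            // Illustrative example (assumption, not from the original comment):
            // a v4f32 mask <0, SM_SentinelZero, 2, SM_SentinelZero> becomes an
            // FAND with the constant bit-mask <AllOnes, 0, AllOnes, 0>.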
   29909   if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
   29910       isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
   29911       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
   29912     APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
   29913     APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
   29914     APInt UndefElts(NumMaskElts, 0);
   29915     SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
   29916     for (unsigned i = 0; i != NumMaskElts; ++i) {
   29917       int M = Mask[i];
   29918       if (M == SM_SentinelUndef) {
   29919         UndefElts.setBit(i);
   29920         continue;
   29921       }
   29922       if (M == SM_SentinelZero)
   29923         continue;
   29924       EltBits[i] = AllOnes;
   29925     }
   29926     SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
   29927     Res = DAG.getBitcast(MaskVT, V1);
   29928     unsigned AndOpcode =
   29929         FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
   29930     Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
   29931     return DAG.getBitcast(RootVT, Res);
   29932   }
   29933 
    29934   // If we have a single input shuffle with different shuffle patterns in the
    29935   // 128-bit lanes, use the variable mask to lower to VPERMILPS.
    29936   // TODO: Combine other mask types at higher depths.
   29937   if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
   29938       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
   29939        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
   29940     SmallVector<SDValue, 16> VPermIdx;
   29941     for (int M : Mask) {
   29942       SDValue Idx =
   29943           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
   29944       VPermIdx.push_back(Idx);
   29945     }
   29946     SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
   29947     Res = DAG.getBitcast(MaskVT, V1);
   29948     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
   29949     return DAG.getBitcast(RootVT, Res);
   29950   }
   29951 
   29952   // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
   29953   // to VPERMIL2PD/VPERMIL2PS.
   29954   if (AllowVariableMask && Subtarget.hasXOP() &&
   29955       (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
   29956        MaskVT == MVT::v8f32)) {
   29957     // VPERMIL2 Operation.
   29958     // Bits[3] - Match Bit.
   29959     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
   29960     // Bits[2:0] - (Per Lane) PS Shuffle Mask.
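              // Illustrative example (not from the original comment): for a
              // v4f32 mask <0, 5, 2, SM_SentinelZero> the loop below builds
              // VPerm2Idx = <0, 5, 2, 8> and sets M2ZImm = 2 to encode the
              // zeroed lane.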
   29961     unsigned NumLanes = MaskVT.getSizeInBits() / 128;
   29962     unsigned NumEltsPerLane = NumMaskElts / NumLanes;
   29963     SmallVector<int, 8> VPerm2Idx;
   29964     unsigned M2ZImm = 0;
   29965     for (int M : Mask) {
   29966       if (M == SM_SentinelUndef) {
   29967         VPerm2Idx.push_back(-1);
   29968         continue;
   29969       }
   29970       if (M == SM_SentinelZero) {
   29971         M2ZImm = 2;
   29972         VPerm2Idx.push_back(8);
   29973         continue;
   29974       }
   29975       int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
   29976       Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
   29977       VPerm2Idx.push_back(Index);
   29978     }
   29979     V1 = DAG.getBitcast(MaskVT, V1);
   29980     V2 = DAG.getBitcast(MaskVT, V2);
   29981     SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
   29982     Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
   29983                       DAG.getConstant(M2ZImm, DL, MVT::i8));
   29984     return DAG.getBitcast(RootVT, Res);
   29985   }
   29986 
    29987   // If we have 3 or more shuffle instructions or a chain involving a variable
    29988   // mask, we can replace them with a single PSHUFB instruction profitably.
    29989   // Intel's manuals suggest only using PSHUFB if doing so replaces 5 or more
    29990   // instructions, but in practice PSHUFB tends to be *very* fast so we're
    29991   // more aggressive.
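            // Illustrative example (not part of the original comment): a v4i32
            // mask <1, SM_SentinelZero, 3, SM_SentinelUndef> on a 128-bit vector
            // expands (Ratio == 4) to the byte mask
            // <4,5,6,7, 0xFF,0xFF,0xFF,0xFF, 12,13,14,15, u,u,u,u>, where bytes
            // with bit 7 set are zeroed by PSHUFB.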
   29992   if (UnaryShuffle && AllowVariableMask &&
   29993       ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
   29994        (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
   29995        (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
   29996     SmallVector<SDValue, 16> PSHUFBMask;
   29997     int NumBytes = RootVT.getSizeInBits() / 8;
   29998     int Ratio = NumBytes / NumMaskElts;
   29999     for (int i = 0; i < NumBytes; ++i) {
   30000       int M = Mask[i / Ratio];
   30001       if (M == SM_SentinelUndef) {
   30002         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
   30003         continue;
   30004       }
   30005       if (M == SM_SentinelZero) {
   30006         PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
   30007         continue;
   30008       }
   30009       M = Ratio * M + i % Ratio;
   30010       assert((M / 16) == (i / 16) && "Lane crossing detected");
   30011       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
   30012     }
   30013     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
   30014     Res = DAG.getBitcast(ByteVT, V1);
   30015     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
   30016     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
   30017     return DAG.getBitcast(RootVT, Res);
   30018   }
   30019 
   30020   // With XOP, if we have a 128-bit binary input shuffle we can always combine
   30021   // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
   30022   // slower than PSHUFB on targets that support both.
   30023   if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
   30024     // VPPERM Mask Operation
   30025     // Bits[4:0] - Byte Index (0 - 31)
   30026     // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
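              // Illustrative example (not from the original comment): for a
              // v8i16 mask starting <0, 9, SM_SentinelZero, 3, ...> the per-byte
              // values become <0,1, 18,19, 128,128, 6,7, ...>: byte indices
              // 16-31 address the second source, and 128 sets Bits[7:5] = 4,
              // i.e. the ZERO operation.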
   30027     SmallVector<SDValue, 16> VPPERMMask;
   30028     int NumBytes = 16;
   30029     int Ratio = NumBytes / NumMaskElts;
   30030     for (int i = 0; i < NumBytes; ++i) {
   30031       int M = Mask[i / Ratio];
   30032       if (M == SM_SentinelUndef) {
   30033         VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
   30034         continue;
   30035       }
   30036       if (M == SM_SentinelZero) {
   30037         VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
   30038         continue;
   30039       }
   30040       M = Ratio * M + i % Ratio;
   30041       VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
   30042     }
   30043     MVT ByteVT = MVT::v16i8;
   30044     V1 = DAG.getBitcast(ByteVT, V1);
   30045     V2 = DAG.getBitcast(ByteVT, V2);
   30046     SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
   30047     Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
   30048     return DAG.getBitcast(RootVT, Res);
   30049   }
   30050 
   30051   // Failed to find any combines.
   30052   return SDValue();
   30053 }
   30054 
    30055 // Attempt to constant fold all of the constant source ops.
    30056 // Returns the folded constant if the entire shuffle folds to a constant.
    30057 // TODO: Extend this to merge multiple constant Ops and update the mask.
   30058 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
   30059                                            ArrayRef<int> Mask, SDValue Root,
   30060                                            bool HasVariableMask,
   30061                                            SelectionDAG &DAG,
   30062                                            const X86Subtarget &Subtarget) {
   30063   MVT VT = Root.getSimpleValueType();
   30064 
   30065   unsigned SizeInBits = VT.getSizeInBits();
   30066   unsigned NumMaskElts = Mask.size();
   30067   unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
   30068   unsigned NumOps = Ops.size();
   30069 
   30070   // Extract constant bits from each source op.
   30071   bool OneUseConstantOp = false;
   30072   SmallVector<APInt, 16> UndefEltsOps(NumOps);
   30073   SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
   30074   for (unsigned i = 0; i != NumOps; ++i) {
   30075     SDValue SrcOp = Ops[i];
   30076     OneUseConstantOp |= SrcOp.hasOneUse();
   30077     if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
   30078                                        RawBitsOps[i]))
   30079       return SDValue();
   30080   }
   30081 
    30082   // Only fold if at least one of the constants is only used once or if
    30083   // the combined shuffle has included a variable mask shuffle; this
    30084   // avoids constant pool bloat.
   30085   if (!OneUseConstantOp && !HasVariableMask)
   30086     return SDValue();
   30087 
   30088   // Shuffle the constant bits according to the mask.
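            // Illustrative example (assumption, not from the original comment):
            // with constant sources <1,2,3,4> and <0,0,7,8> and mask <0,5,2,7>,
            // the folded result is <1,0,3,8>, with element 1 also recorded in
            // ZeroElts because the selected source constant is zero.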
   30089   APInt UndefElts(NumMaskElts, 0);
   30090   APInt ZeroElts(NumMaskElts, 0);
   30091   APInt ConstantElts(NumMaskElts, 0);
   30092   SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
   30093                                         APInt::getNullValue(MaskSizeInBits));
   30094   for (unsigned i = 0; i != NumMaskElts; ++i) {
   30095     int M = Mask[i];
   30096     if (M == SM_SentinelUndef) {
   30097       UndefElts.setBit(i);
   30098       continue;
   30099     } else if (M == SM_SentinelZero) {
   30100       ZeroElts.setBit(i);
   30101       continue;
   30102     }
   30103     assert(0 <= M && M < (int)(NumMaskElts * NumOps));
   30104 
   30105     unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
   30106     unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
   30107 
   30108     auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
   30109     if (SrcUndefElts[SrcMaskIdx]) {
   30110       UndefElts.setBit(i);
   30111       continue;
   30112     }
   30113 
   30114     auto &SrcEltBits = RawBitsOps[SrcOpIdx];
   30115     APInt &Bits = SrcEltBits[SrcMaskIdx];
   30116     if (!Bits) {
   30117       ZeroElts.setBit(i);
   30118       continue;
   30119     }
   30120 
   30121     ConstantElts.setBit(i);
   30122     ConstantBitData[i] = Bits;
   30123   }
   30124   assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
   30125 
   30126   // Create the constant data.
   30127   MVT MaskSVT;
   30128   if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
   30129     MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
   30130   else
   30131     MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
   30132 
   30133   MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
   30134 
   30135   SDLoc DL(Root);
   30136   SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
   30137   return DAG.getBitcast(VT, CstOp);
   30138 }
   30139 
   30140 /// Fully generic combining of x86 shuffle instructions.
   30141 ///
   30142 /// This should be the last combine run over the x86 shuffle instructions. Once
   30143 /// they have been fully optimized, this will recursively consider all chains
   30144 /// of single-use shuffle instructions, build a generic model of the cumulative
   30145 /// shuffle operation, and check for simpler instructions which implement this
   30146 /// operation. We use this primarily for two purposes:
   30147 ///
   30148 /// 1) Collapse generic shuffles to specialized single instructions when
   30149 ///    equivalent. In most cases, this is just an encoding size win, but
   30150 ///    sometimes we will collapse multiple generic shuffles into a single
   30151 ///    special-purpose shuffle.
   30152 /// 2) Look for sequences of shuffle instructions with 3 or more total
   30153 ///    instructions, and replace them with the slightly more expensive SSSE3
   30154 ///    PSHUFB instruction if available. We do this as the last combining step
   30155 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
   30156 ///    a suitable short sequence of other instructions. The PSHUFB will either
   30157 ///    use a register or have to read from memory and so is slightly (but only
   30158 ///    slightly) more expensive than the other shuffle instructions.
   30159 ///
   30160 /// Because this is inherently a quadratic operation (for each shuffle in
   30161 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
   30162 /// This should never be an issue in practice as the shuffle lowering doesn't
   30163 /// produce sequences of more than 8 instructions.
   30164 ///
   30165 /// FIXME: We will currently miss some cases where the redundant shuffling
   30166 /// would simplify under the threshold for PSHUFB formation because of
   30167 /// combine-ordering. To fix this, we should do the redundant instruction
   30168 /// combining in this recursive walk.
   30169 static SDValue combineX86ShufflesRecursively(
   30170     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
   30171     ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
   30172     bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
   30173   // Bound the depth of our recursive combine because this is ultimately
   30174   // quadratic in nature.
   30175   const unsigned MaxRecursionDepth = 8;
   30176   if (Depth > MaxRecursionDepth)
   30177     return SDValue();
   30178 
   30179   // Directly rip through bitcasts to find the underlying operand.
   30180   SDValue Op = SrcOps[SrcOpIndex];
   30181   Op = peekThroughOneUseBitcasts(Op);
   30182 
   30183   MVT VT = Op.getSimpleValueType();
   30184   if (!VT.isVector())
   30185     return SDValue(); // Bail if we hit a non-vector.
   30186 
   30187   assert(Root.getSimpleValueType().isVector() &&
   30188          "Shuffles operate on vector types!");
   30189   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
   30190          "Can only combine shuffles of the same vector register size.");
   30191 
   30192   // Extract target shuffle mask and resolve sentinels and inputs.
   30193   SmallVector<int, 64> OpMask;
   30194   SmallVector<SDValue, 2> OpInputs;
   30195   if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
   30196     return SDValue();
   30197 
   30198   assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
   30199   SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
   30200   SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
   30201 
   30202   // Add the inputs to the Ops list, avoiding duplicates.
   30203   SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
   30204 
   30205   int InputIdx0 = -1, InputIdx1 = -1;
   30206   for (int i = 0, e = Ops.size(); i < e; ++i) {
   30207     SDValue BC = peekThroughBitcasts(Ops[i]);
   30208     if (Input0 && BC == peekThroughBitcasts(Input0))
   30209       InputIdx0 = i;
   30210     if (Input1 && BC == peekThroughBitcasts(Input1))
   30211       InputIdx1 = i;
   30212   }
   30213 
   30214   if (Input0 && InputIdx0 < 0) {
   30215     InputIdx0 = SrcOpIndex;
   30216     Ops[SrcOpIndex] = Input0;
   30217   }
   30218   if (Input1 && InputIdx1 < 0) {
   30219     InputIdx1 = Ops.size();
   30220     Ops.push_back(Input1);
   30221   }
   30222 
   30223   assert(((RootMask.size() > OpMask.size() &&
   30224            RootMask.size() % OpMask.size() == 0) ||
   30225           (OpMask.size() > RootMask.size() &&
   30226            OpMask.size() % RootMask.size() == 0) ||
   30227           OpMask.size() == RootMask.size()) &&
   30228          "The smaller number of elements must divide the larger.");
   30229 
   30230   // This function can be performance-critical, so we rely on the power-of-2
   30231   // knowledge that we have about the mask sizes to replace div/rem ops with
   30232   // bit-masks and shifts.
   30233   assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
   30234   assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
   30235   unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
   30236   unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
   30237 
   30238   unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
   30239   unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
   30240   unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
   30241   assert((RootRatio == 1 || OpRatio == 1) &&
   30242          "Must not have a ratio for both incoming and op masks!");
   30243 
   30244   assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
   30245   assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
   30246   assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
   30247   unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
   30248   unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
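            // Illustrative example (not from the original comment): if RootMask
            // has 4 elements (dword granularity) and OpMask has 16 (byte
            // granularity), then MaskWidth = 16, RootRatio = 4 and OpRatio = 1,
            // so each root index expands to 4 byte lanes below.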
   30249 
   30250   SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
   30251 
   30252   // Merge this shuffle operation's mask into our accumulated mask. Note that
   30253   // this shuffle's mask will be the first applied to the input, followed by the
   30254   // root mask to get us all the way to the root value arrangement. The reason
   30255   // for this order is that we are recursing up the operation chain.
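            // Illustrative example (assumption, not from the original comment):
            // with equal widths and a single input at offset 0, this reduces to
            // Mask[i] = OpMask[RootMask[i]]; e.g. RootMask <3,1,2,0> over
            // OpMask <2,3,0,1> yields <1,3,0,2>.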
   30256   for (unsigned i = 0; i < MaskWidth; ++i) {
   30257     unsigned RootIdx = i >> RootRatioLog2;
   30258     if (RootMask[RootIdx] < 0) {
   30259       // This is a zero or undef lane, we're done.
   30260       Mask[i] = RootMask[RootIdx];
   30261       continue;
   30262     }
   30263 
   30264     unsigned RootMaskedIdx =
   30265         RootRatio == 1
   30266             ? RootMask[RootIdx]
   30267             : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
   30268 
   30269     // Just insert the scaled root mask value if it references an input other
   30270     // than the SrcOp we're currently inserting.
   30271     if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
   30272         (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
   30273       Mask[i] = RootMaskedIdx;
   30274       continue;
   30275     }
   30276 
   30277     RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
   30278     unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
   30279     if (OpMask[OpIdx] < 0) {
   30280       // The incoming lanes are zero or undef, it doesn't matter which ones we
   30281       // are using.
   30282       Mask[i] = OpMask[OpIdx];
   30283       continue;
   30284     }
   30285 
   30286     // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
   30287     unsigned OpMaskedIdx =
   30288         OpRatio == 1
   30289             ? OpMask[OpIdx]
   30290             : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
   30291 
   30292     OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
   30293     if (OpMask[OpIdx] < (int)OpMask.size()) {
   30294       assert(0 <= InputIdx0 && "Unknown target shuffle input");
   30295       OpMaskedIdx += InputIdx0 * MaskWidth;
   30296     } else {
   30297       assert(0 <= InputIdx1 && "Unknown target shuffle input");
   30298       OpMaskedIdx += InputIdx1 * MaskWidth;
   30299     }
   30300 
   30301     Mask[i] = OpMaskedIdx;
   30302   }
   30303 
   30304   // Handle the all undef/zero cases early.
   30305   if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
   30306     return DAG.getUNDEF(Root.getValueType());
   30307 
    30308   // TODO - should we handle the mixed zero/undef case as well? Just returning
    30309   // a zero mask will lose information on undef elements, possibly reducing
    30310   // future combine possibilities.
   30311   if (all_of(Mask, [](int Idx) { return Idx < 0; }))
   30312     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
   30313                          SDLoc(Root));
   30314 
   30315   // Remove unused shuffle source ops.
   30316   resolveTargetShuffleInputsAndMask(Ops, Mask);
   30317   assert(!Ops.empty() && "Shuffle with no inputs detected");
   30318 
   30319   HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
   30320 
   30321   // Update the list of shuffle nodes that have been combined so far.
   30322   SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
   30323                                                 SrcNodes.end());
   30324   CombinedNodes.push_back(Op.getNode());
   30325 
   30326   // See if we can recurse into each shuffle source op (if it's a target
   30327   // shuffle). The source op should only be combined if it either has a
   30328   // single use (i.e. current Op) or all its users have already been combined.
   30329   // Don't recurse if we already have more source ops than we can combine in
   30330   // the remaining recursion depth.
   30331   if (Ops.size() < (MaxRecursionDepth - Depth)) {
   30332     for (int i = 0, e = Ops.size(); i < e; ++i)
   30333       if (Ops[i].getNode()->hasOneUse() ||
   30334           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
   30335         if (SDValue Res = combineX86ShufflesRecursively(
   30336                 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
   30337                 DAG, Subtarget))
   30338           return Res;
   30339   }
   30340 
   30341   // Attempt to constant fold all of the constant source ops.
   30342   if (SDValue Cst = combineX86ShufflesConstants(
   30343           Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
   30344     return Cst;
   30345 
   30346   // We can only combine unary and binary shuffle mask cases.
   30347   if (Ops.size() > 2)
   30348     return SDValue();
   30349 
   30350   // Minor canonicalization of the accumulated shuffle mask to make it easier
   30351   // to match below. All this does is detect masks with sequential pairs of
   30352   // elements, and shrink them to the half-width mask. It does this in a loop
   30353   // so it will reduce the size of the mask to the minimal width mask which
   30354   // performs an equivalent shuffle.
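            // Illustrative example (not from the original comment): a dword-level
            // mask <2, 3, 0, 1> widens to the qword-level mask <1, 0>, and the
            // loop stops once no further pairing is possible.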
   30355   SmallVector<int, 64> WidenedMask;
   30356   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
   30357     Mask = std::move(WidenedMask);
   30358   }
   30359 
   30360   // Canonicalization of binary shuffle masks to improve pattern matching by
   30361   // commuting the inputs.
   30362   if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
   30363     ShuffleVectorSDNode::commuteMask(Mask);
   30364     std::swap(Ops[0], Ops[1]);
   30365   }
   30366 
   30367   // Finally, try to combine into a single shuffle instruction.
   30368   return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
   30369                                 Subtarget);
   30370 }
   30371 
    30372 /// Get the PSHUF-style mask from a PSHUF node.
    30373 ///
    30374 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
    30375 /// PSHUF-style masks that can be reused with such instructions.
   30376 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
   30377   MVT VT = N.getSimpleValueType();
   30378   SmallVector<int, 4> Mask;
   30379   SmallVector<SDValue, 2> Ops;
   30380   bool IsUnary;
   30381   bool HaveMask =
   30382       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
   30383   (void)HaveMask;
   30384   assert(HaveMask);
   30385 
    30386   // If we have more than 128 bits, only the low 128 bits of the shuffle mask
    30387   // matter. Check that the upper masks are repeats and remove them.
   30388   if (VT.getSizeInBits() > 128) {
   30389     int LaneElts = 128 / VT.getScalarSizeInBits();
   30390 #ifndef NDEBUG
   30391     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
   30392       for (int j = 0; j < LaneElts; ++j)
   30393         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
   30394                "Mask doesn't repeat in high 128-bit lanes!");
   30395 #endif
   30396     Mask.resize(LaneElts);
   30397   }
   30398 
   30399   switch (N.getOpcode()) {
   30400   case X86ISD::PSHUFD:
   30401     return Mask;
   30402   case X86ISD::PSHUFLW:
   30403     Mask.resize(4);
   30404     return Mask;
   30405   case X86ISD::PSHUFHW:
   30406     Mask.erase(Mask.begin(), Mask.begin() + 4);
   30407     for (int &M : Mask)
   30408       M -= 4;
   30409     return Mask;
   30410   default:
   30411     llvm_unreachable("No valid shuffle instruction found!");
   30412   }
   30413 }
   30414 
   30415 /// Search for a combinable shuffle across a chain ending in pshufd.
   30416 ///
   30417 /// We walk up the chain and look for a combinable shuffle, skipping over
   30418 /// shuffles that we could hoist this shuffle's transformation past without
   30419 /// altering anything.
   30420 static SDValue
   30421 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
   30422                              SelectionDAG &DAG) {
   30423   assert(N.getOpcode() == X86ISD::PSHUFD &&
   30424          "Called with something other than an x86 128-bit half shuffle!");
   30425   SDLoc DL(N);
   30426 
   30427   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
   30428   // of the shuffles in the chain so that we can form a fresh chain to replace
   30429   // this one.
   30430   SmallVector<SDValue, 8> Chain;
   30431   SDValue V = N.getOperand(0);
   30432   for (; V.hasOneUse(); V = V.getOperand(0)) {
   30433     switch (V.getOpcode()) {
   30434     default:
   30435       return SDValue(); // Nothing combined!
   30436 
   30437     case ISD::BITCAST:
   30438       // Skip bitcasts as we always know the type for the target specific
   30439       // instructions.
   30440       continue;
   30441 
   30442     case X86ISD::PSHUFD:
   30443       // Found another dword shuffle.
   30444       break;
   30445 
   30446     case X86ISD::PSHUFLW:
   30447       // Check that the low words (being shuffled) are the identity in the
   30448       // dword shuffle, and the high words are self-contained.
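                // Illustrative example (not from the original comment): a PSHUFD
                // mask of <0, 1, 3, 2> qualifies: dwords 0/1 are untouched and
                // dwords 2/3 only permute among themselves, so this dword
                // shuffle can be hoisted past the PSHUFLW.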
   30449       if (Mask[0] != 0 || Mask[1] != 1 ||
   30450           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
   30451         return SDValue();
   30452 
   30453       Chain.push_back(V);
   30454       continue;
   30455 
   30456     case X86ISD::PSHUFHW:
   30457       // Check that the high words (being shuffled) are the identity in the
   30458       // dword shuffle, and the low words are self-contained.
   30459       if (Mask[2] != 2 || Mask[3] != 3 ||
   30460           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
   30461         return SDValue();
   30462 
   30463       Chain.push_back(V);
   30464       continue;
   30465 
   30466     case X86ISD::UNPCKL:
   30467     case X86ISD::UNPCKH:
   30468       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
   30469       // shuffle into a preceding word shuffle.
   30470       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
   30471           V.getSimpleValueType().getVectorElementType() != MVT::i16)
   30472         return SDValue();
   30473 
   30474       // Search for a half-shuffle which we can combine with.
   30475       unsigned CombineOp =
   30476           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
   30477       if (V.getOperand(0) != V.getOperand(1) ||
   30478           !V->isOnlyUserOf(V.getOperand(0).getNode()))
   30479         return SDValue();
   30480       Chain.push_back(V);
   30481       V = V.getOperand(0);
   30482       do {
   30483         switch (V.getOpcode()) {
   30484         default:
   30485           return SDValue(); // Nothing to combine.
   30486 
   30487         case X86ISD::PSHUFLW:
   30488         case X86ISD::PSHUFHW:
   30489           if (V.getOpcode() == CombineOp)
   30490             break;
   30491 
   30492           Chain.push_back(V);
   30493 
   30494           LLVM_FALLTHROUGH;
   30495         case ISD::BITCAST:
   30496           V = V.getOperand(0);
   30497           continue;
   30498         }
   30499         break;
   30500       } while (V.hasOneUse());
   30501       break;
   30502     }
   30503     // Break out of the loop if we break out of the switch.
   30504     break;
   30505   }
   30506 
   30507   if (!V.hasOneUse())
   30508     // We fell out of the loop without finding a viable combining instruction.
   30509     return SDValue();
   30510 
   30511   // Merge this node's mask and our incoming mask.
   30512   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   30513   for (int &M : Mask)
   30514     M = VMask[M];
   30515   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
   30516                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   30517 
   30518   // Rebuild the chain around this new shuffle.
   30519   while (!Chain.empty()) {
   30520     SDValue W = Chain.pop_back_val();
   30521 
   30522     if (V.getValueType() != W.getOperand(0).getValueType())
   30523       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
   30524 
   30525     switch (W.getOpcode()) {
   30526     default:
   30527       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
   30528 
   30529     case X86ISD::UNPCKL:
   30530     case X86ISD::UNPCKH:
   30531       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
   30532       break;
   30533 
   30534     case X86ISD::PSHUFD:
   30535     case X86ISD::PSHUFLW:
   30536     case X86ISD::PSHUFHW:
   30537       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
   30538       break;
   30539     }
   30540   }
   30541   if (V.getValueType() != N.getValueType())
   30542     V = DAG.getBitcast(N.getValueType(), V);
   30543 
   30544   // Return the new chain to replace N.
   30545   return V;
   30546 }
   30547 
   30548 /// Search for a combinable shuffle across a chain ending in pshuflw or
   30549 /// pshufhw.
   30550 ///
   30551 /// We walk up the chain, skipping shuffles of the other half and looking
   30552 /// through shuffles which switch halves trying to find a shuffle of the same
   30553 /// pair of dwords.
   30554 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
   30555                                         SelectionDAG &DAG,
   30556                                         TargetLowering::DAGCombinerInfo &DCI) {
   30557   assert(
   30558       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
   30559       "Called with something other than an x86 128-bit half shuffle!");
   30560   SDLoc DL(N);
   30561   unsigned CombineOpcode = N.getOpcode();
   30562 
   30563   // Walk up a single-use chain looking for a combinable shuffle.
   30564   SDValue V = N.getOperand(0);
   30565   for (; V.hasOneUse(); V = V.getOperand(0)) {
   30566     switch (V.getOpcode()) {
   30567     default:
   30568       return false; // Nothing combined!
   30569 
   30570     case ISD::BITCAST:
   30571       // Skip bitcasts as we always know the type for the target specific
   30572       // instructions.
   30573       continue;
   30574 
   30575     case X86ISD::PSHUFLW:
   30576     case X86ISD::PSHUFHW:
   30577       if (V.getOpcode() == CombineOpcode)
   30578         break;
   30579 
   30580       // Other-half shuffles are no-ops.
   30581       continue;
   30582     }
   30583     // Break out of the loop if we break out of the switch.
   30584     break;
   30585   }
   30586 
   30587   if (!V.hasOneUse())
   30588     // We fell out of the loop without finding a viable combining instruction.
   30589     return false;
   30590 
   30591   // Combine away the bottom node as its shuffle will be accumulated into
   30592   // a preceding shuffle.
   30593   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
   30594 
   30595   // Record the old value.
   30596   SDValue Old = V;
   30597 
   30598   // Merge this node's mask and our incoming mask (adjusted to account for all
   30599   // the pshufd instructions encountered).
   30600   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   30601   for (int &M : Mask)
   30602     M = VMask[M];
   30603   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
   30604                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   30605 
   30606   // Check that the shuffles didn't cancel each other out. If not, we need to
   30607   // combine to the new one.
   30608   if (Old != V)
   30609     // Replace the combinable shuffle with the combined one, updating all users
   30610     // so that we re-evaluate the chain here.
   30611     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
   30612 
   30613   return true;
   30614 }
   30615 
   30616 /// Try to combine x86 target specific shuffles.
   30617 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
   30618                                     TargetLowering::DAGCombinerInfo &DCI,
   30619                                     const X86Subtarget &Subtarget) {
   30620   SDLoc DL(N);
   30621   MVT VT = N.getSimpleValueType();
   30622   SmallVector<int, 4> Mask;
   30623   unsigned Opcode = N.getOpcode();
   30624 
   30625   // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
   30626   // single instruction.
   30627   if (VT.getScalarSizeInBits() == 64 &&
   30628       (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
   30629        Opcode == X86ISD::UNPCKL)) {
   30630     auto BC0 = peekThroughBitcasts(N.getOperand(0));
   30631     auto BC1 = peekThroughBitcasts(N.getOperand(1));
   30632     EVT VT0 = BC0.getValueType();
   30633     EVT VT1 = BC1.getValueType();
   30634     unsigned Opcode0 = BC0.getOpcode();
   30635     unsigned Opcode1 = BC1.getOpcode();
   30636     if (Opcode0 == Opcode1 && VT0 == VT1 &&
   30637         (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
   30638          Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
   30639          Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
   30640       SDValue Lo, Hi;
   30641       if (Opcode == X86ISD::MOVSD) {
   30642         Lo = BC1.getOperand(0);
   30643         Hi = BC0.getOperand(1);
   30644       } else {
   30645         Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
   30646         Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
   30647       }
   30648       SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
   30649       return DAG.getBitcast(VT, Horiz);
   30650     }
   30651   }
   30652 
   30653   switch (Opcode) {
   30654   case X86ISD::VBROADCAST: {
   30655     // If broadcasting from another shuffle, attempt to simplify it.
   30656     // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
   30657     SDValue Src = N.getOperand(0);
   30658     SDValue BC = peekThroughBitcasts(Src);
   30659     EVT SrcVT = Src.getValueType();
   30660     EVT BCVT = BC.getValueType();
   30661     if (isTargetShuffle(BC.getOpcode()) &&
   30662         VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
   30663       unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
   30664       SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
   30665                                         SM_SentinelUndef);
   30666       for (unsigned i = 0; i != Scale; ++i)
   30667         DemandedMask[i] = i;
   30668       if (SDValue Res = combineX86ShufflesRecursively(
   30669               {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
   30670               /*HasVarMask*/ false, DAG, Subtarget))
   30671         return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
   30672                            DAG.getBitcast(SrcVT, Res));
   30673     }
   30674     return SDValue();
   30675   }
   30676   case X86ISD::PSHUFD:
   30677   case X86ISD::PSHUFLW:
   30678   case X86ISD::PSHUFHW:
   30679     Mask = getPSHUFShuffleMask(N);
   30680     assert(Mask.size() == 4);
   30681     break;
   30682   case X86ISD::UNPCKL: {
    30683     // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
    30684     // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
    30685     // moves the upper half elements into the lower half. For example:
   30686     //
   30687     // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
   30688     //     undef:v16i8
   30689     // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
   30690     //
   30691     // will be combined to:
   30692     //
   30693     // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
   30694 
    30695     // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
    30696     // arise because more advanced instructions are selected instead.
   30697     if (!VT.is128BitVector())
   30698       return SDValue();
   30699 
   30700     auto Op0 = N.getOperand(0);
   30701     auto Op1 = N.getOperand(1);
   30702     if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
   30703       ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
   30704 
   30705       unsigned NumElts = VT.getVectorNumElements();
   30706       SmallVector<int, 8> ExpectedMask(NumElts, -1);
   30707       std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
   30708                 NumElts / 2);
   30709 
   30710       auto ShufOp = Op1.getOperand(0);
   30711       if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
   30712         return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
   30713     }
   30714     return SDValue();
   30715   }
   30716   case X86ISD::MOVSD:
   30717   case X86ISD::MOVSS: {
   30718     SDValue N0 = N.getOperand(0);
   30719     SDValue N1 = N.getOperand(1);
   30720 
   30721     // Canonicalize scalar FPOps:
   30722     // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
   30723     // If commutable, allow OP(N1[0], N0[0]).
   30724     unsigned Opcode1 = N1.getOpcode();
   30725     if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
   30726         Opcode1 == ISD::FDIV) {
   30727       SDValue N10 = N1.getOperand(0);
   30728       SDValue N11 = N1.getOperand(1);
   30729       if (N10 == N0 ||
   30730           (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
   30731         if (N10 != N0)
   30732           std::swap(N10, N11);
   30733         MVT SVT = VT.getVectorElementType();
   30734         SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
   30735         N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
   30736         N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
   30737         SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
   30738         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
   30739         return DAG.getNode(Opcode, DL, VT, N0, SclVec);
   30740       }
   30741     }
   30742 
   30743     return SDValue();
   30744   }
   30745   case X86ISD::INSERTPS: {
   30746     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
   30747     SDValue Op0 = N.getOperand(0);
   30748     SDValue Op1 = N.getOperand(1);
   30749     SDValue Op2 = N.getOperand(2);
   30750     unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
   30751     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
   30752     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
   30753     unsigned ZeroMask = InsertPSMask & 0xF;
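              // Illustrative decode (not from the original comment): an imm8 of
              // 0x91 (0b10'01'0001) gives SrcIdx = 2, DstIdx = 1 and
              // ZeroMask = 0x1, i.e. insert element 2 of Op1 into element 1 of
              // the result and zero element 0.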
   30754 
   30755     // If we zero out all elements from Op0 then we don't need to reference it.
   30756     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
   30757       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
   30758                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
   30759 
   30760     // If we zero out the element from Op1 then we don't need to reference it.
   30761     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
   30762       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
   30763                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
   30764 
   30765     // Attempt to merge insertps Op1 with an inner target shuffle node.
   30766     SmallVector<int, 8> TargetMask1;
   30767     SmallVector<SDValue, 2> Ops1;
   30768     if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
   30769       int M = TargetMask1[SrcIdx];
   30770       if (isUndefOrZero(M)) {
   30771         // Zero/UNDEF insertion - zero out element and remove dependency.
   30772         InsertPSMask |= (1u << DstIdx);
   30773         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
   30774                            DAG.getConstant(InsertPSMask, DL, MVT::i8));
   30775       }
   30776       // Update insertps mask srcidx and reference the source input directly.
   30777       assert(0 <= M && M < 8 && "Shuffle index out of range");
   30778       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
   30779       Op1 = Ops1[M < 4 ? 0 : 1];
   30780       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
   30781                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
   30782     }
   30783 
   30784     // Attempt to merge insertps Op0 with an inner target shuffle node.
   30785     SmallVector<int, 8> TargetMask0;
   30786     SmallVector<SDValue, 2> Ops0;
   30787     if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
   30788       return SDValue();
   30789 
   30790     bool Updated = false;
   30791     bool UseInput00 = false;
   30792     bool UseInput01 = false;
   30793     for (int i = 0; i != 4; ++i) {
   30794       int M = TargetMask0[i];
   30795       if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
   30796         // No change if element is already zero or the inserted element.
   30797         continue;
   30798       } else if (isUndefOrZero(M)) {
   30799         // If the target mask is undef/zero then we must zero the element.
   30800         InsertPSMask |= (1u << i);
   30801         Updated = true;
   30802         continue;
   30803       }
   30804 
    30805       // The input vector element must stay in place (index i of either input).
   30806       if (M != i && M != (i + 4))
   30807         return SDValue();
   30808 
   30809       // Determine which inputs of the target shuffle we're using.
   30810       UseInput00 |= (0 <= M && M < 4);
   30811       UseInput01 |= (4 <= M);
   30812     }
   30813 
   30814     // If we're not using both inputs of the target shuffle then use the
   30815     // referenced input directly.
   30816     if (UseInput00 && !UseInput01) {
   30817       Updated = true;
   30818       Op0 = Ops0[0];
   30819     } else if (!UseInput00 && UseInput01) {
   30820       Updated = true;
   30821       Op0 = Ops0[1];
   30822     }
   30823 
   30824     if (Updated)
   30825       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
   30826                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
   30827 
   30828     return SDValue();
   30829   }
   30830   default:
   30831     return SDValue();
   30832   }
   30833 
   30834   // Nuke no-op shuffles that show up after combining.
   30835   if (isNoopShuffleMask(Mask))
   30836     return N.getOperand(0);
   30837 
   30838   // Look for simplifications involving one or two shuffle instructions.
   30839   SDValue V = N.getOperand(0);
   30840   switch (N.getOpcode()) {
   30841   default:
   30842     break;
   30843   case X86ISD::PSHUFLW:
   30844   case X86ISD::PSHUFHW:
   30845     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
   30846 
   30847     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
   30848       return SDValue(); // We combined away this shuffle, so we're done.
   30849 
   30850     // See if this reduces to a PSHUFD which is no more expensive and can
   30851     // combine with more operations. Note that it has to at least flip the
   30852     // dwords as otherwise it would have been removed as a no-op.
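              // Illustrative example (not from the original comment): a PSHUFLW
              // with word mask <2, 3, 0, 1> swaps the word pairs {0,1} and
              // {2,3}, which is the same as swapping dwords 0 and 1
              // (<1, 0, 2, 3>), so it is rebuilt as a PSHUFD below.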
   30853     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
   30854       int DMask[] = {0, 1, 2, 3};
   30855       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
   30856       DMask[DOffset + 0] = DOffset + 1;
   30857       DMask[DOffset + 1] = DOffset + 0;
   30858       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
   30859       V = DAG.getBitcast(DVT, V);
   30860       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
   30861                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
   30862       return DAG.getBitcast(VT, V);
   30863     }
   30864 
   30865     // Look for shuffle patterns which can be implemented as a single unpack.
   30866     // FIXME: This doesn't handle the location of the PSHUFD generically, and
   30867     // only works when we have a PSHUFD followed by two half-shuffles.
   30868     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
   30869         (V.getOpcode() == X86ISD::PSHUFLW ||
   30870          V.getOpcode() == X86ISD::PSHUFHW) &&
   30871         V.getOpcode() != N.getOpcode() &&
   30872         V.hasOneUse()) {
   30873       SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
   30874       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
   30875         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   30876         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
   30877         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   30878         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
   30879         int WordMask[8];
   30880         for (int i = 0; i < 4; ++i) {
   30881           WordMask[i + NOffset] = Mask[i] + NOffset;
   30882           WordMask[i + VOffset] = VMask[i] + VOffset;
   30883         }
   30884         // Map the word mask through the DWord mask.
   30885         int MappedMask[8];
   30886         for (int i = 0; i < 8; ++i)
   30887           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
   30888         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
   30889             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
   30890           // We can replace all three shuffles with an unpack.
   30891           V = DAG.getBitcast(VT, D.getOperand(0));
   30892           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
   30893                                                 : X86ISD::UNPCKH,
   30894                              DL, VT, V, V);
   30895         }
   30896       }
   30897     }
   30898 
   30899     break;
   30900 
   30901   case X86ISD::PSHUFD:
   30902     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
   30903       return NewN;
   30904 
   30905     break;
   30906   }
   30907 
   30908   return SDValue();
   30909 }
   30910 
    30911 /// Checks if the shuffle mask takes successive elements
    30912 /// alternately from two vectors.
   30913 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
   30914 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
   30915 
   30916   int ParitySrc[2] = {-1, -1};
   30917   unsigned Size = Mask.size();
   30918   for (unsigned i = 0; i != Size; ++i) {
   30919     int M = Mask[i];
   30920     if (M < 0)
   30921       continue;
   30922 
   30923     // Make sure we are using the matching element from the input.
   30924     if ((M % Size) != i)
   30925       return false;
   30926 
   30927     // Make sure we use the same input for all elements of the same parity.
   30928     int Src = M / Size;
   30929     if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
   30930       return false;
   30931     ParitySrc[i % 2] = Src;
   30932   }
   30933 
   30934   // Make sure each input is used.
   30935   if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
   30936     return false;
   30937 
   30938   Op0Even = ParitySrc[0] == 0;
   30939   return true;
   30940 }
   30941 
    30942 /// Returns true iff the shuffle node \p N can be replaced with an
    30943 /// ADDSUB(SUBADD) operation, in which case the operands of that operation
    30944 /// are written to the parameters \p Opnd0 and \p Opnd1.
    30945 ///
    30946 /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector
    30947 /// shuffle nodes so it is easier to generically match. We also insert dummy
    30948 /// vector shuffle nodes for the operands which explicitly discard the unused
    30949 /// lanes, so that the fact that they're unused can flow through the rest of
    30950 /// the combiner.
   30951 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
   30952                              SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
   30953                              bool &IsSubAdd) {
   30954 
   30955   EVT VT = N->getValueType(0);
   30956   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   30957   if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
   30958       !VT.getSimpleVT().isFloatingPoint())
   30959     return false;
   30960 
   30961   // We only handle target-independent shuffles.
   30962   // FIXME: It would be easy and harmless to use the target shuffle mask
   30963   // extraction tool to support more.
   30964   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
   30965     return false;
   30966 
   30967   SDValue V1 = N->getOperand(0);
   30968   SDValue V2 = N->getOperand(1);
   30969 
   30970   // Make sure we have an FADD and an FSUB.
   30971   if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
   30972       (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
   30973       V1.getOpcode() == V2.getOpcode())
   30974     return false;
   30975 
   30976   // If there are other uses of these operations we can't fold them.
   30977   if (!V1->hasOneUse() || !V2->hasOneUse())
   30978     return false;
   30979 
   30980   // Ensure that both operations have the same operands. Note that we can
   30981   // commute the FADD operands.
   30982   SDValue LHS, RHS;
   30983   if (V1.getOpcode() == ISD::FSUB) {
   30984     LHS = V1->getOperand(0); RHS = V1->getOperand(1);
   30985     if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
   30986         (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
   30987       return false;
   30988   } else {
   30989     assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
   30990     LHS = V2->getOperand(0); RHS = V2->getOperand(1);
   30991     if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
   30992         (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
   30993       return false;
   30994   }
   30995 
   30996   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
   30997   bool Op0Even;
   30998   if (!isAddSubOrSubAddMask(Mask, Op0Even))
   30999     return false;
   31000 
   31001   // It's a subadd if the vector in the even parity is an FADD.
   31002   IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
   31003                      : V2->getOpcode() == ISD::FADD;
   31004 
   31005   Opnd0 = LHS;
   31006   Opnd1 = RHS;
   31007   return true;
   31008 }
   31009 
   31010 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
   31011 static SDValue combineShuffleToFMAddSub(SDNode *N,
   31012                                         const X86Subtarget &Subtarget,
   31013                                         SelectionDAG &DAG) {
   31014   // We only handle target-independent shuffles.
   31015   // FIXME: It would be easy and harmless to use the target shuffle mask
   31016   // extraction tool to support more.
   31017   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
   31018     return SDValue();
   31019 
   31020   MVT VT = N->getSimpleValueType(0);
   31021   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   31022   if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
   31023     return SDValue();
   31024 
   31025   // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
   31026   SDValue Op0 = N->getOperand(0);
   31027   SDValue Op1 = N->getOperand(1);
   31028   SDValue FMAdd = Op0, FMSub = Op1;
   31029   if (FMSub.getOpcode() != X86ISD::FMSUB)
   31030     std::swap(FMAdd, FMSub);
   31031 
   31032   if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
   31033       FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
   31034       FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
   31035       FMAdd.getOperand(2) != FMSub.getOperand(2))
   31036     return SDValue();
   31037 
   31038   // Check for correct shuffle mask.
   31039   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
   31040   bool Op0Even;
   31041   if (!isAddSubOrSubAddMask(Mask, Op0Even))
   31042     return SDValue();
   31043 
   31044   // FMAddSub takes the zeroth operand from the FMSub node.
   31045   SDLoc DL(N);
   31046   bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
   31047   unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
   31048   return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
   31049                      FMAdd.getOperand(2));
   31050 }
   31051 
   31052 /// Try to combine a shuffle into a target-specific add-sub or
   31053 /// mul-add-sub node.
   31054 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
   31055                                                 const X86Subtarget &Subtarget,
   31056                                                 SelectionDAG &DAG) {
   31057   if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
   31058     return V;
   31059 
   31060   SDValue Opnd0, Opnd1;
   31061   bool IsSubAdd;
   31062   if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
   31063     return SDValue();
   31064 
   31065   MVT VT = N->getSimpleValueType(0);
   31066   SDLoc DL(N);
   31067 
   31068   // Try to generate X86ISD::FMADDSUB node here.
   31069   SDValue Opnd2;
   31070   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
   31071     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
   31072     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
   31073   }
   31074 
   31075   if (IsSubAdd)
   31076     return SDValue();
   31077 
   31078   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
   31079   // the ADDSUB idiom has been successfully recognized. There are no known
   31080   // X86 targets with 512-bit ADDSUB instructions!
   31081   if (VT.is512BitVector())
   31082     return SDValue();
   31083 
   31084   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
   31085 }
   31086 
   31087 // We are looking for a shuffle where both sources are concatenated with undef
   31088 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
   31089 // if we can express this as a single-source shuffle, that's preferable.
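         //
         // For example (a sketch, with t1 and t2 of type v4i32 and a v8i32 result):
         //   shuffle (concat t1, undef), (concat t2, undef),
         //           <0, 8, 1, 9, 2, 10, 3, 11>
         //     --> shuffle (concat t1, t2), undef, <0, 4, 1, 5, 2, 6, 3, 7>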
   31090 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
   31091                                            const X86Subtarget &Subtarget) {
   31092   if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
   31093     return SDValue();
   31094 
   31095   EVT VT = N->getValueType(0);
   31096 
   31097   // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
   31098   if (!VT.is128BitVector() && !VT.is256BitVector())
   31099     return SDValue();
   31100 
   31101   if (VT.getVectorElementType() != MVT::i32 &&
   31102       VT.getVectorElementType() != MVT::i64 &&
   31103       VT.getVectorElementType() != MVT::f32 &&
   31104       VT.getVectorElementType() != MVT::f64)
   31105     return SDValue();
   31106 
   31107   SDValue N0 = N->getOperand(0);
   31108   SDValue N1 = N->getOperand(1);
   31109 
   31110   // Check that both sources are concats with undef.
   31111   if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
   31112       N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
   31113       N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
   31114       !N1.getOperand(1).isUndef())
   31115     return SDValue();
   31116 
   31117   // Construct the new shuffle mask. Elements from the first source retain their
   31118   // index, but elements from the second source no longer need to skip an undef.
   31119   SmallVector<int, 8> Mask;
   31120   int NumElts = VT.getVectorNumElements();
   31121 
   31122   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   31123   for (int Elt : SVOp->getMask())
   31124     Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
   31125 
   31126   SDLoc DL(N);
   31127   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
   31128                                N1.getOperand(0));
   31129   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
   31130 }
   31131 
   31132 /// Eliminate a redundant shuffle of a horizontal math op.
   31133 static SDValue foldShuffleOfHorizOp(SDNode *N) {
   31134   if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
   31135     return SDValue();
   31136 
   31137   SDValue HOp = N->getOperand(0);
   31138   if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
   31139       HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
   31140     return SDValue();
   31141 
   31142   // 128-bit horizontal math instructions are defined to operate on adjacent
   31143   // lanes of each operand as:
   31144   // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
   31145   // ...similarly for v2f64 and v8i16.
   31146   // TODO: Handle UNDEF operands.
   31147   if (HOp.getOperand(0) != HOp.getOperand(1))
   31148     return SDValue();
   31149 
   31150   // When the operands of a horizontal math op are identical, the low half of
   31151   // the result is the same as the high half. If the shuffle is also replicating
   31152   // low and high halves, we don't need the shuffle.
   31153   // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
   31154   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
   31155   // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
   31156   // but this should be tied to whatever horizontal op matching and shuffle
   31157   // canonicalization are producing.
   31158   if (HOp.getValueSizeInBits() == 128 &&
   31159       (isTargetShuffleEquivalent(Mask, {0, 0}) ||
   31160        isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
   31161        isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
   31162     return HOp;
   31163 
   31164   if (HOp.getValueSizeInBits() == 256 &&
   31165       (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
   31166        isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
   31167        isTargetShuffleEquivalent(
   31168            Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
   31169     return HOp;
   31170 
   31171   return SDValue();
   31172 }
   31173 
   31174 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
   31175                               TargetLowering::DAGCombinerInfo &DCI,
   31176                               const X86Subtarget &Subtarget) {
   31177   SDLoc dl(N);
   31178   EVT VT = N->getValueType(0);
   31179   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   31180   // If we have legalized the vector types, look for blends of FADD and FSUB
   31181   // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
   31182   if (TLI.isTypeLegal(VT)) {
   31183     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
   31184       return AddSub;
   31185 
   31186     if (SDValue HAddSub = foldShuffleOfHorizOp(N))
   31187       return HAddSub;
   31188   }
   31189 
   31190   // During Type Legalization, when promoting illegal vector types,
   31191   // the backend might introduce new shuffle dag nodes and bitcasts.
   31192   //
   31193   // This code performs the following transformation:
   31194   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
   31195   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
   31196   //
   31197   // We do this only if both the bitcast and the BINOP dag nodes have
   31198   // one use. Also, perform this transformation only if the new binary
   31199   // operation is legal. This is to avoid introducing dag nodes that
   31200   // potentially need to be further expanded (or custom lowered) into a
   31201   // less optimal sequence of dag nodes.
   31202   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
   31203       N->getOpcode() == ISD::VECTOR_SHUFFLE &&
   31204       N->getOperand(0).getOpcode() == ISD::BITCAST &&
   31205       N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
   31206     SDValue N0 = N->getOperand(0);
   31207     SDValue N1 = N->getOperand(1);
   31208 
   31209     SDValue BC0 = N0.getOperand(0);
   31210     EVT SVT = BC0.getValueType();
   31211     unsigned Opcode = BC0.getOpcode();
   31212     unsigned NumElts = VT.getVectorNumElements();
   31213 
   31214     if (BC0.hasOneUse() && SVT.isVector() &&
   31215         SVT.getVectorNumElements() * 2 == NumElts &&
   31216         TLI.isOperationLegal(Opcode, VT)) {
   31217       bool CanFold = false;
   31218       switch (Opcode) {
   31219       default : break;
   31220       case ISD::ADD:
   31221       case ISD::SUB:
   31222       case ISD::MUL:
   31223         // isOperationLegal lies for integer ops on floating point types.
   31224         CanFold = VT.isInteger();
   31225         break;
   31226       case ISD::FADD:
   31227       case ISD::FSUB:
   31228       case ISD::FMUL:
   31229         // isOperationLegal lies for floating point ops on integer types.
   31230         CanFold = VT.isFloatingPoint();
   31231         break;
   31232       }
   31233 
   31234       unsigned SVTNumElts = SVT.getVectorNumElements();
   31235       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   31236       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
   31237         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
   31238       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
   31239         CanFold = SVOp->getMaskElt(i) < 0;
   31240 
   31241       if (CanFold) {
   31242         SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
   31243         SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
   31244         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
   31245         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
   31246       }
   31247     }
   31248   }
   31249 
   31250   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
   31251   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
   31252   // consecutive, non-overlapping, and in the right order.
   31253   SmallVector<SDValue, 16> Elts;
   31254   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
   31255     if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
   31256       Elts.push_back(Elt);
   31257       continue;
   31258     }
   31259     Elts.clear();
   31260     break;
   31261   }
   31262 
   31263   if (Elts.size() == VT.getVectorNumElements())
   31264     if (SDValue LD =
   31265             EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
   31266       return LD;
   31267 
   31268   // For AVX2, we sometimes want to combine
   31269   // (vector_shuffle <mask> (concat_vectors t1, undef)
   31270   //                        (concat_vectors t2, undef))
   31271   // Into:
   31272   // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
   31273   // Since the latter can be efficiently lowered with VPERMD/VPERMQ
   31274   if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
   31275     return ShufConcat;
   31276 
   31277   if (isTargetShuffle(N->getOpcode())) {
   31278     SDValue Op(N, 0);
   31279     if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
   31280       return Shuffle;
   31281 
   31282     // Try recursively combining arbitrary sequences of x86 shuffle
   31283     // instructions into higher-order shuffles. We do this after combining
   31284     // specific PSHUF instruction sequences into their minimal form so that we
   31285     // can evaluate how many specialized shuffle instructions are involved in
   31286     // a particular chain.
   31287     if (SDValue Res = combineX86ShufflesRecursively(
   31288             {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
   31289             /*HasVarMask*/ false, DAG, Subtarget))
   31290       return Res;
   31291   }
   31292 
   31293   return SDValue();
   31294 }
   31295 
   31296 /// Check if a vector extract from a target-specific shuffle of a load can be
   31297 /// folded into a single element load.
   31298 /// Similar handling for VECTOR_SHUFFLE is performed by the DAGCombiner, but
   31299 /// target shuffles have already been custom lowered, so we handle them here.
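         ///
         /// For example (a sketch): for
         ///   (i32 extract_elt (X86ISD::PSHUFD (v4i32 load p), <1, 0, 3, 2>), 0)
         /// we rebuild the PSHUFD as a generic vector_shuffle of the load so that
         /// the generic combiner can fold the extract into a scalar load of p+4.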
   31300 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   31301                                          TargetLowering::DAGCombinerInfo &DCI) {
   31302   if (DCI.isBeforeLegalizeOps())
   31303     return SDValue();
   31304 
   31305   SDValue InVec = N->getOperand(0);
   31306   SDValue EltNo = N->getOperand(1);
   31307   EVT EltVT = N->getValueType(0);
   31308 
   31309   if (!isa<ConstantSDNode>(EltNo))
   31310     return SDValue();
   31311 
   31312   EVT OriginalVT = InVec.getValueType();
   31313 
   31314   // Peek through bitcasts; don't duplicate a load with other uses.
   31315   InVec = peekThroughOneUseBitcasts(InVec);
   31316 
   31317   EVT CurrentVT = InVec.getValueType();
   31318   if (!CurrentVT.isVector() ||
   31319       CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
   31320     return SDValue();
   31321 
   31322   if (!isTargetShuffle(InVec.getOpcode()))
   31323     return SDValue();
   31324 
   31325   // Don't duplicate a load with other uses.
   31326   if (!InVec.hasOneUse())
   31327     return SDValue();
   31328 
   31329   SmallVector<int, 16> ShuffleMask;
   31330   SmallVector<SDValue, 2> ShuffleOps;
   31331   bool UnaryShuffle;
   31332   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
   31333                             ShuffleOps, ShuffleMask, UnaryShuffle))
   31334     return SDValue();
   31335 
   31336   // Select the input vector, guarding against an out-of-range extract index.
   31337   unsigned NumElems = CurrentVT.getVectorNumElements();
   31338   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   31339   int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
   31340 
   31341   if (Idx == SM_SentinelZero)
   31342     return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
   31343                              : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
   31344   if (Idx == SM_SentinelUndef)
   31345     return DAG.getUNDEF(EltVT);
   31346 
   31347   assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
   31348   SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
   31349                                          : ShuffleOps[1];
   31350 
   31351   // If the shuffle inputs are the same for both operands, then allow 2 uses.
   31352   unsigned AllowedUses =
   31353       (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
   31354 
   31355   if (LdNode.getOpcode() == ISD::BITCAST) {
   31356     // Don't duplicate a load with other uses.
   31357     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
   31358       return SDValue();
   31359 
   31360     AllowedUses = 1; // only allow 1 load use if we have a bitcast
   31361     LdNode = LdNode.getOperand(0);
   31362   }
   31363 
   31364   if (!ISD::isNormalLoad(LdNode.getNode()))
   31365     return SDValue();
   31366 
   31367   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
   31368 
   31369   if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
   31370     return SDValue();
   31371 
   31372   // If there's a bitcast before the shuffle, check if the load type and
   31373   // alignment are valid.
   31374   unsigned Align = LN0->getAlignment();
   31375   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   31376   unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
   31377       EltVT.getTypeForEVT(*DAG.getContext()));
   31378 
   31379   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
   31380     return SDValue();
   31381 
   31382   // All checks match, so transform back to vector_shuffle so that the DAG
   31383   // combiner can finish the job.
   31384   SDLoc dl(N);
   31385 
   31386   // Create the shuffle node, taking into account that it's a unary shuffle.
   31387   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
   31388   Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
   31389                                  ShuffleMask);
   31390   Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
   31391   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
   31392                      EltNo);
   31393 }
   31394 
   31395 // Try to match patterns such as
   31396 // (i16 bitcast (v16i1 x))
   31397 // ->
   31398 // (i16 movmsk (v16i8 sext (v16i1 x)))
   31399 // before the illegal vector is scalarized on subtargets that don't have legal
   31400 // vxi1 types.
   31401 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
   31402                                   const X86Subtarget &Subtarget) {
   31403   EVT VT = BitCast.getValueType();
   31404   SDValue N0 = BitCast.getOperand(0);
   31405   EVT VecVT = N0->getValueType(0);
   31406 
   31407   if (!VT.isScalarInteger() || !VecVT.isSimple())
   31408     return SDValue();
   31409 
   31410   // With AVX512 vxi1 types are legal and we prefer using k-regs.
   31411   // MOVMSK is supported in SSE2 or later.
   31412   if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
   31413     return SDValue();
   31414 
   31415   // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
   31416   // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
   31417   // v8i16 and v16i16.
   31418   // For these two cases, we can shuffle the upper element bytes to a
   31419   // consecutive sequence at the start of the vector and treat the results as
   31420   // v16i8 or v32i8 respectively. For v8i16 (treated as v16i8) this is the
   31421   // preferable solution; however, for v16i16 it is not, because the shuffle
   31422   // is expensive, so we avoid sign-extending to this type entirely.
   31423   // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
   31424   // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
   31425   MVT SExtVT;
   31426   MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
   31427   switch (VecVT.getSimpleVT().SimpleTy) {
   31428   default:
   31429     return SDValue();
   31430   case MVT::v2i1:
   31431     SExtVT = MVT::v2i64;
   31432     FPCastVT = MVT::v2f64;
   31433     break;
   31434   case MVT::v4i1:
   31435     SExtVT = MVT::v4i32;
   31436     FPCastVT = MVT::v4f32;
   31437     // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
   31438     // sign-extend to a 256-bit operation to avoid truncation.
   31439     if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
   31440         N0->getOperand(0).getValueType().is256BitVector()) {
   31441       SExtVT = MVT::v4i64;
   31442       FPCastVT = MVT::v4f64;
   31443     }
   31444     break;
   31445   case MVT::v8i1:
   31446     SExtVT = MVT::v8i16;
   31447     // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
   31448     // sign-extend to a 256-bit operation to match the compare.
   31449     // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
   31450     // 256-bit because the shuffle is cheaper than sign extending the result of
   31451     // the compare.
   31452     if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
   31453         (N0->getOperand(0).getValueType().is256BitVector() ||
   31454          N0->getOperand(0).getValueType().is512BitVector())) {
   31455       SExtVT = MVT::v8i32;
   31456       FPCastVT = MVT::v8f32;
   31457     }
   31458     break;
   31459   case MVT::v16i1:
   31460     SExtVT = MVT::v16i8;
   31461     // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
   31462     // it is not profitable to sign-extend to 256-bit because this will
   31463     // require an extra cross-lane shuffle which is more expensive than
   31464     // truncating the result of the compare to 128-bits.
   31465     break;
   31466   case MVT::v32i1:
   31467     SExtVT = MVT::v32i8;
   31468     break;
   31469   }
   31470 
   31471   SDLoc DL(BitCast);
   31472   SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
   31473 
   31474   if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
   31475     V = getPMOVMSKB(DL, V, DAG, Subtarget);
   31476     return DAG.getZExtOrTrunc(V, DL, VT);
   31477   }
   31478 
   31479   if (SExtVT == MVT::v8i16) {
   31480     assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
   31481     V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
   31482                     DAG.getUNDEF(MVT::v8i16));
   31483   } else
   31484     assert(SExtVT.getScalarType() != MVT::i16 &&
   31485            "Vectors of i16 must be packed");
   31486   if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
   31487     V = DAG.getBitcast(FPCastVT, V);
   31488   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
   31489   return DAG.getZExtOrTrunc(V, DL, VT);
   31490 }
   31491 
   31492 // Convert a vXi1 constant build vector to the same width scalar integer.
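         // E.g. (a sketch): (v4i1 build_vector 1, 0, 1, 1) becomes the i4 constant
         // 0b1101, with build_vector element N mapped to bit N.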
   31493 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
   31494   EVT SrcVT = Op.getValueType();
   31495   assert(SrcVT.getVectorElementType() == MVT::i1 &&
   31496          "Expected a vXi1 vector");
   31497   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
   31498          "Expected a constant build vector");
   31499 
   31500   APInt Imm(SrcVT.getVectorNumElements(), 0);
   31501   for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
   31502     SDValue In = Op.getOperand(Idx);
   31503     if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
   31504       Imm.setBit(Idx);
   31505   }
   31506   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
   31507   return DAG.getConstant(Imm, SDLoc(Op), IntVT);
   31508 }
   31509 
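         // Try to move bitcasts between a vXi1 mask and a scalar integer onto the
         // operands of a logic op, so the mask arithmetic stays in one domain.
         // E.g. (a sketch, assuming AVX512 and a v16i1 mask):
         //   (i16 bitcast (and (v16i1 bitcast x:i16), y:v16i1))
         //     --> (and x, (i16 bitcast y))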
   31510 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
   31511                                            TargetLowering::DAGCombinerInfo &DCI,
   31512                                            const X86Subtarget &Subtarget) {
   31513   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
   31514 
   31515   if (!DCI.isBeforeLegalizeOps())
   31516     return SDValue();
   31517 
   31518   // Only do this if we have k-registers.
   31519   if (!Subtarget.hasAVX512())
   31520     return SDValue();
   31521 
   31522   EVT DstVT = N->getValueType(0);
   31523   SDValue Op = N->getOperand(0);
   31524   EVT SrcVT = Op.getValueType();
   31525 
   31526   if (!Op.hasOneUse())
   31527     return SDValue();
   31528 
   31529   // Look for logic ops.
   31530   if (Op.getOpcode() != ISD::AND &&
   31531       Op.getOpcode() != ISD::OR &&
   31532       Op.getOpcode() != ISD::XOR)
   31533     return SDValue();
   31534 
   31535   // Make sure we have a bitcast between mask registers and a scalar type.
   31536   if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
   31537         DstVT.isScalarInteger()) &&
   31538       !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
   31539         SrcVT.isScalarInteger()))
   31540     return SDValue();
   31541 
   31542   SDValue LHS = Op.getOperand(0);
   31543   SDValue RHS = Op.getOperand(1);
   31544 
   31545   if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
   31546       LHS.getOperand(0).getValueType() == DstVT)
   31547     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
   31548                        DAG.getBitcast(DstVT, RHS));
   31549 
   31550   if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
   31551       RHS.getOperand(0).getValueType() == DstVT)
   31552     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
   31553                        DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
   31554 
   31555   // If the RHS is a vXi1 build vector, this is a good reason to flip too.
   31556   // Most of these have to move a constant from the scalar domain anyway.
   31557   if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
   31558     RHS = combinevXi1ConstantToInteger(RHS, DAG);
   31559     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
   31560                        DAG.getBitcast(DstVT, LHS), RHS);
   31561   }
   31562 
   31563   return SDValue();
   31564 }
   31565 
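         // Lower a 64-bit build_vector into MMX ops. A rough sketch for a non-splat
         // v8i8 with integer elements e0..e7: each ei is moved into an MMX register
         // with MMX_MOVW2D, then a tree of punpcklbw, punpcklwd and punpckldq
         // intrinsics merges the eight values pairwise down to one x86mmx result.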
   31566 static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
   31567                                     const X86Subtarget &Subtarget) {
   31568   SDLoc DL(N);
   31569   unsigned NumElts = N.getNumOperands();
   31570 
   31571   auto *BV = cast<BuildVectorSDNode>(N);
   31572   SDValue Splat = BV->getSplatValue();
   31573 
   31574   // Build MMX element from integer GPR or SSE float values.
   31575   auto CreateMMXElement = [&](SDValue V) {
   31576     if (V.isUndef())
   31577       return DAG.getUNDEF(MVT::x86mmx);
   31578     if (V.getValueType().isFloatingPoint()) {
   31579       if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
   31580         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
   31581         V = DAG.getBitcast(MVT::v2i64, V);
   31582         return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
   31583       }
   31584       V = DAG.getBitcast(MVT::i32, V);
   31585     } else {
   31586       V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
   31587     }
   31588     return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
   31589   };
   31590 
   31591   // Convert build vector ops to MMX data in the bottom elements.
   31592   SmallVector<SDValue, 8> Ops;
   31593 
   31594   // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
   31595   if (Splat) {
   31596     if (Splat.isUndef())
   31597       return DAG.getUNDEF(MVT::x86mmx);
   31598 
   31599     Splat = CreateMMXElement(Splat);
   31600 
   31601     if (Subtarget.hasSSE1()) {
   31602       // Unpack v8i8 to splat i8 elements to lowest 16-bits.
   31603       if (NumElts == 8)
   31604         Splat = DAG.getNode(
   31605             ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
   31606             DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
   31607             Splat);
   31608 
   31609       // Use PSHUFW to repeat 16-bit elements.
   31610       unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
   31611       return DAG.getNode(
   31612           ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
   31613           DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
   31614           DAG.getConstant(ShufMask, DL, MVT::i8));
   31615     }
   31616     Ops.append(NumElts, Splat);
   31617   } else {
   31618     for (unsigned i = 0; i != NumElts; ++i)
   31619       Ops.push_back(CreateMMXElement(N.getOperand(i)));
   31620   }
   31621 
   31622   // Use tree of PUNPCKLs to build up general MMX vector.
   31623   while (Ops.size() > 1) {
   31624     unsigned NumOps = Ops.size();
   31625     unsigned IntrinOp =
   31626         (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
   31627                      : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
   31628                                     : Intrinsic::x86_mmx_punpcklbw));
   31629     SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
   31630     for (unsigned i = 0; i != NumOps; i += 2)
   31631       Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
   31632                                Ops[i], Ops[i + 1]);
   31633     Ops.resize(NumOps / 2);
   31634   }
   31635 
   31636   return Ops[0];
   31637 }
   31638 
   31639 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
   31640                               TargetLowering::DAGCombinerInfo &DCI,
   31641                               const X86Subtarget &Subtarget) {
   31642   SDValue N0 = N->getOperand(0);
   31643   EVT VT = N->getValueType(0);
   31644   EVT SrcVT = N0.getValueType();
   31645 
   31646   // Try to match patterns such as
   31647   // (i16 bitcast (v16i1 x))
   31648   // ->
   31649   // (i16 movmsk (v16i8 sext (v16i1 x)))
   31650   // before the setcc result is scalarized on subtargets that don't have legal
   31651   // vxi1 types.
   31652   if (DCI.isBeforeLegalize()) {
   31653     if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
   31654       return V;
   31655 
   31656     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
   31657     // type, widen both sides to avoid a trip through memory.
   31658     if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
   31659         Subtarget.hasAVX512()) {
   31660       SDLoc dl(N);
   31661       N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
   31662       N0 = DAG.getBitcast(MVT::v8i1, N0);
   31663       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
   31664                          DAG.getIntPtrConstant(0, dl));
   31665     }
   31666 
   31667     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
   31668     // type, widen both sides to avoid a trip through memory.
   31669     if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
   31670         Subtarget.hasAVX512()) {
   31671       SDLoc dl(N);
   31672       unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
   31673       SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
   31674       Ops[0] = N0;
   31675       N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
   31676       N0 = DAG.getBitcast(MVT::i8, N0);
   31677       return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
   31678     }
   31679   }
   31680 
   31681   // Since MMX types are special and don't usually play with other vector types,
   31682   // it's better to handle them early to be sure we emit efficient code by
   31683   // avoiding store-load conversions.
   31684   if (VT == MVT::x86mmx) {
   31685     // Detect MMX constant vectors.
   31686     APInt UndefElts;
   31687     SmallVector<APInt, 1> EltBits;
   31688     if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
   31689       SDLoc DL(N0);
   31690       // Handle zero-extension of i32 with MOVD.
   31691       if (EltBits[0].countLeadingZeros() >= 32)
   31692         return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
   31693                            DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
   31694       // Else, bitcast to a double.
   31695       // TODO - investigate supporting sext 32-bit immediates on x86_64.
   31696       APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
   31697       return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
   31698     }
   31699 
   31700     // Detect bitcasts to x86mmx low word.
   31701     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
   31702         (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
   31703         N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
   31704       bool LowUndef = true, AllUndefOrZero = true;
   31705       for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
   31706         SDValue Op = N0.getOperand(i);
   31707         LowUndef &= Op.isUndef() || (i >= e/2);
   31708         AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
   31709       }
   31710       if (AllUndefOrZero) {
   31711         SDValue N00 = N0.getOperand(0);
   31712         SDLoc dl(N00);
   31713         N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
   31714                        : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
   31715         return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
   31716       }
   31717     }
   31718 
   31719     // Detect bitcasts of 64-bit build vectors and convert to a
   31720     // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
   31721     // lowest element.
   31722     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
   31723         (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
   31724          SrcVT == MVT::v8i8))
   31725       return createMMXBuildVector(N0, DAG, Subtarget);
   31726 
   31727     // Detect bitcasts from element or subvector extractions to x86mmx.
   31728     if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
   31729          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
   31730         isNullConstant(N0.getOperand(1))) {
   31731       SDValue N00 = N0.getOperand(0);
   31732       if (N00.getValueType().is128BitVector())
   31733         return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
   31734                            DAG.getBitcast(MVT::v2i64, N00));
   31735     }
   31736 
   31737     // Detect bitcasts from FP_TO_SINT to x86mmx.
   31738     if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
   31739       SDLoc DL(N0);
   31740       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
   31741                                 DAG.getUNDEF(MVT::v2i32));
   31742       return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
   31743                          DAG.getBitcast(MVT::v2i64, Res));
   31744     }
   31745   }
   31746 
   31747   // Try to remove a bitcast of a constant vXi1 vector. We have to legalize
   31748   // most of these to scalar anyway.
   31749   if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
   31750       SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
   31751       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
   31752     return combinevXi1ConstantToInteger(N0, DAG);
   31753   }
   31754 
   31755   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
   31756       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
   31757       isa<ConstantSDNode>(N0)) {
   31758     auto *C = cast<ConstantSDNode>(N0);
   31759     if (C->isAllOnesValue())
   31760       return DAG.getConstant(1, SDLoc(N0), VT);
   31761     if (C->isNullValue())
   31762       return DAG.getConstant(0, SDLoc(N0), VT);
   31763   }
   31764 
   31765   // Try to remove bitcasts from input and output of mask arithmetic to
   31766   // remove GPR<->K-register crossings.
   31767   if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
   31768     return V;
   31769 
   31770   // Convert a bitcasted integer logic operation that has one bitcasted
   31771   // floating-point operand into a floating-point logic operation. This may
   31772   // create a load of a constant, but that is cheaper than materializing the
   31773   // constant in an integer register and transferring it to an SSE register or
   31774   // transferring the SSE operand to integer register and back.
   31775   unsigned FPOpcode;
   31776   switch (N0.getOpcode()) {
   31777     case ISD::AND: FPOpcode = X86ISD::FAND; break;
   31778     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
   31779     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
   31780     default: return SDValue();
   31781   }
   31782 
   31783   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
   31784         (Subtarget.hasSSE2() && VT == MVT::f64)))
   31785     return SDValue();
   31786 
   31787   SDValue LogicOp0 = N0.getOperand(0);
   31788   SDValue LogicOp1 = N0.getOperand(1);
   31789   SDLoc DL0(N0);
   31790 
   31791   // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
   31792   if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
   31793       LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
   31794       !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
   31795     SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
   31796     return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
   31797   }
   31798   // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
   31799   if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
   31800       LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
   31801       !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
   31802     SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
   31803     return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
   31804   }
   31805 
   31806   return SDValue();
   31807 }
   31808 
   31809 // Match a binop + shuffle pyramid that represents a horizontal reduction over
   31810 // the elements of a vector.
   31811 // Returns the vector that is being reduced, or SDValue() if a reduction
   31812 // was not matched.
   31813 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
   31814                                    ArrayRef<ISD::NodeType> CandidateBinOps) {
   31815   // The pattern must end in an extract from index 0.
   31816   if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
   31817       !isNullConstant(Extract->getOperand(1)))
   31818     return SDValue();
   31819 
   31820   SDValue Op = Extract->getOperand(0);
   31821   unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
   31822 
   31823   // Match against one of the candidate binary ops.
   31824   if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
   31825         return Op.getOpcode() == unsigned(BinOp);
   31826       }))
   31827     return SDValue();
   31828 
   31829   // At each stage, we're looking for something that looks like:
   31830   // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
   31831   //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
   31832   //                               i32 undef, i32 undef, i32 undef, i32 undef>
   31833   // %a = binop <8 x i32> %op, %s
   31834   // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
   31835   // we expect something like:
   31836   // <4,5,6,7,u,u,u,u>
   31837   // <2,3,u,u,u,u,u,u>
   31838   // <1,u,u,u,u,u,u,u>
   31839   unsigned CandidateBinOp = Op.getOpcode();
   31840   for (unsigned i = 0; i < Stages; ++i) {
   31841     if (Op.getOpcode() != CandidateBinOp)
   31842       return SDValue();
   31843 
   31844     ShuffleVectorSDNode *Shuffle =
   31845         dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
   31846     if (Shuffle) {
   31847       Op = Op.getOperand(1);
   31848     } else {
   31849       Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
   31850       Op = Op.getOperand(0);
   31851     }
   31852 
   31853     // The first operand of the shuffle should be the same as the other operand
   31854     // of the binop.
   31855     if (!Shuffle || Shuffle->getOperand(0) != Op)
   31856       return SDValue();
   31857 
   31858     // Verify the shuffle has the expected (at this stage of the pyramid) mask.
   31859     for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
   31860       if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
   31861         return SDValue();
   31862   }
   31863 
   31864   BinOp = CandidateBinOp;
   31865   return Op;
   31866 }
   31867 
   31868 // Given a select, detect the following pattern:
   31869 // 1:    %2 = zext <N x i8> %0 to <N x i32>
   31870 // 2:    %3 = zext <N x i8> %1 to <N x i32>
   31871 // 3:    %4 = sub nsw <N x i32> %2, %3
   31872 // 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
   31873 // 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
   31874 // 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
   31875 // This is useful as it is the input into a SAD pattern.
   31876 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
   31877                               SDValue &Op1) {
   31878   // Check that the select condition is a greater-than/less-than comparison.
   31879   SDValue SetCC = Select->getOperand(0);
   31880   if (SetCC.getOpcode() != ISD::SETCC)
   31881     return false;
   31882   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
   31883   if (CC != ISD::SETGT && CC != ISD::SETLT)
   31884     return false;
   31885 
   31886   SDValue SelectOp1 = Select->getOperand(1);
   31887   SDValue SelectOp2 = Select->getOperand(2);
   31888 
   31889   // The following instructions assume SelectOp1 is the subtraction operand
   31890   // and SelectOp2 is the negation operand.
   31891   // In the case of SETLT this is the other way around.
   31892   if (CC == ISD::SETLT)
   31893     std::swap(SelectOp1, SelectOp2);
   31894 
   31895   // The second operand of the select should be the negation of the first
   31896   // operand, which is implemented as 0 - SelectOp1.
   31897   if (!(SelectOp2.getOpcode() == ISD::SUB &&
   31898         ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
   31899         SelectOp2.getOperand(1) == SelectOp1))
   31900     return false;
   31901 
   31902   // The first operand of SetCC is the first operand of the select, which is the
   31903   // difference between the two input vectors.
   31904   if (SetCC.getOperand(0) != SelectOp1)
   31905     return false;
   31906 
   31907   // In the SETLT case, the second comparison operand can be either 1 or 0.
   31908   APInt SplatVal;
   31909   if ((CC == ISD::SETLT) &&
   31910       !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
   31911          SplatVal.isOneValue()) ||
   31912         (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
   31913     return false;
   31914 
   31915   // In the SETGT case, the second comparison operand can be either -1 or 0.
   31916   if ((CC == ISD::SETGT) &&
   31917       !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
   31918         ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
   31919     return false;
   31920 
   31921   // The first operand of the select is the difference between the two input
   31922   // vectors.
   31923   if (SelectOp1.getOpcode() != ISD::SUB)
   31924     return false;
   31925 
   31926   Op0 = SelectOp1.getOperand(0);
   31927   Op1 = SelectOp1.getOperand(1);
   31928 
   31929   // Check if the operands of the sub are zero-extended from vectors of i8.
   31930   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
   31931       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
   31932       Op1.getOpcode() != ISD::ZERO_EXTEND ||
   31933       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
   31934     return false;
   31935 
   31936   return true;
   31937 }
   31938 
   31939 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
   31940 // to these zexts.
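         // E.g. (a sketch): for zext operands of type v16i8 this emits a single
         // v2i64 PSADBW whose i64 lanes hold the sums of absolute differences of
         // the corresponding groups of eight byte pairs.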
   31941 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
   31942                             const SDValue &Zext1, const SDLoc &DL,
   31943                             const X86Subtarget &Subtarget) {
   31944   // Find the appropriate width for the PSADBW.
   31945   EVT InVT = Zext0.getOperand(0).getValueType();
   31946   unsigned RegSize = std::max(128u, InVT.getSizeInBits());
   31947 
   31948   // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
   31949   // fill in the missing vector elements with 0.
   31950   unsigned NumConcat = RegSize / InVT.getSizeInBits();
   31951   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
   31952   Ops[0] = Zext0.getOperand(0);
   31953   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
   31954   SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
   31955   Ops[0] = Zext1.getOperand(0);
   31956   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
   31957 
   31958   // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
   31959   auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   31960                           ArrayRef<SDValue> Ops) {
   31961     MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
   31962     return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
   31963   };
   31964   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
   31965   return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
   31966                           PSADBWBuilder);
   31967 }
   31968 
   31969 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
   31970 // PHMINPOSUW.
   31971 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
   31972                                              const X86Subtarget &Subtarget) {
   31973   // Bail without SSE41.
   31974   if (!Subtarget.hasSSE41())
   31975     return SDValue();
   31976 
   31977   EVT ExtractVT = Extract->getValueType(0);
   31978   if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
   31979     return SDValue();
   31980 
   31981   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
   31982   unsigned BinOp;
   31983   SDValue Src = matchBinOpReduction(
   31984       Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
   31985   if (!Src)
   31986     return SDValue();
   31987 
   31988   EVT SrcVT = Src.getValueType();
   31989   EVT SrcSVT = SrcVT.getScalarType();
   31990   if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
   31991     return SDValue();
   31992 
   31993   SDLoc DL(Extract);
   31994   SDValue MinPos = Src;
   31995 
   31996   // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
   31997   while (SrcVT.getSizeInBits() > 128) {
   31998     unsigned NumElts = SrcVT.getVectorNumElements();
   31999     unsigned NumSubElts = NumElts / 2;
   32000     SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
   32001     unsigned SubSizeInBits = SrcVT.getSizeInBits();
   32002     SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
   32003     SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
   32004     MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
   32005   }
   32006   assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
   32007           (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
   32008          "Unexpected value type");
   32009 
   32010   // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
   32011   // to flip the value accordingly.
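           // E.g. (a sketch for i16 elements): smin(a, b) is computed as
           //   0x8000 ^ umin(a ^ 0x8000, b ^ 0x8000)
           // and similarly SMAX uses the 0x7FFF mask and UMAX the all-ones mask.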
   32012   SDValue Mask;
   32013   unsigned MaskEltsBits = ExtractVT.getSizeInBits();
   32014   if (BinOp == ISD::SMAX)
   32015     Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
   32016   else if (BinOp == ISD::SMIN)
   32017     Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
   32018   else if (BinOp == ISD::UMAX)
   32019     Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
   32020 
   32021   if (Mask)
   32022     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
   32023 
   32024   // For v16i8 cases we need to perform UMIN on pairs of byte elements,
   32025   // shuffling each upper element down and inserting zeros. This means that the
   32026   // v16i8 UMIN will leave the upper element as zero, performing zero-extension
   32027   // ready for the PHMINPOS.
   32028   if (ExtractVT == MVT::i8) {
   32029     SDValue Upper = DAG.getVectorShuffle(
   32030         SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
   32031         {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
   32032     MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
   32033   }
   32034 
   32035   // Perform the PHMINPOS on a v8i16 vector.
   32036   MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
   32037   MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
   32038   MinPos = DAG.getBitcast(SrcVT, MinPos);
   32039 
   32040   if (Mask)
   32041     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
   32042 
   32043   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
   32044                      DAG.getIntPtrConstant(0, DL));
   32045 }
   32046 
   32047 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
   32048 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
   32049                                                 SelectionDAG &DAG,
   32050                                                 const X86Subtarget &Subtarget) {
   32051   // Bail without SSE2 or with AVX512VL (which uses predicate registers).
   32052   if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
   32053     return SDValue();
   32054 
   32055   EVT ExtractVT = Extract->getValueType(0);
   32056   unsigned BitWidth = ExtractVT.getSizeInBits();
   32057   if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
   32058       ExtractVT != MVT::i8)
   32059     return SDValue();
   32060 
   32061   // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
   32062   unsigned BinOp = 0;
   32063   SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
   32064   if (!Match)
   32065     return SDValue();
   32066 
   32067   // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
   32068   // which we can't support here for now.
   32069   if (Match.getScalarValueSizeInBits() != BitWidth)
   32070     return SDValue();
   32071 
   32072   // We require AVX2 for PMOVMSKB for v16i16/v32i8.
   32073   unsigned MatchSizeInBits = Match.getValueSizeInBits();
   32074   if (!(MatchSizeInBits == 128 ||
   32075         (MatchSizeInBits == 256 &&
   32076          ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
   32077     return SDValue();
   32078 
   32079   // Don't bother performing this for 2-element vectors.
   32080   if (Match.getValueType().getVectorNumElements() <= 2)
   32081     return SDValue();
   32082 
   32083   // Check that we are extracting a reduction of all sign bits.
   32084   if (DAG.ComputeNumSignBits(Match) != BitWidth)
   32085     return SDValue();
   32086 
   32087   // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
   32088   MVT MaskVT;
   32089   if (64 == BitWidth || 32 == BitWidth)
   32090     MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
   32091                               MatchSizeInBits / BitWidth);
   32092   else
   32093     MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
   32094 
   32095   APInt CompareBits;
   32096   ISD::CondCode CondCode;
   32097   if (BinOp == ISD::OR) {
   32098     // any_of -> MOVMSK != 0
   32099     CompareBits = APInt::getNullValue(32);
   32100     CondCode = ISD::CondCode::SETNE;
   32101   } else {
   32102     // all_of -> MOVMSK == ((1 << NumElts) - 1)
   32103     CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
   32104     CondCode = ISD::CondCode::SETEQ;
   32105   }
   32106 
   32107   // Perform the select as i32/i64 and then truncate to avoid partial register
   32108   // stalls.
   32109   unsigned ResWidth = std::max(BitWidth, 32u);
   32110   EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
   32111   SDLoc DL(Extract);
   32112   SDValue Zero = DAG.getConstant(0, DL, ResVT);
   32113   SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
   32114   SDValue Res = DAG.getBitcast(MaskVT, Match);
   32115   Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
   32116   Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
   32117                         Ones, Zero, CondCode);
   32118   return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
   32119 }
   32120 
   32121 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   32122                                       const X86Subtarget &Subtarget) {
   32123   // PSADBW is only supported on SSE2 and up.
   32124   if (!Subtarget.hasSSE2())
   32125     return SDValue();
   32126 
   32127   // Verify that the type we're extracting from is an integer wider than i16.
   32128   EVT VT = Extract->getOperand(0).getValueType();
   32129   if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
   32130     return SDValue();
   32131 
   32132   unsigned RegSize = 128;
   32133   if (Subtarget.useBWIRegs())
   32134     RegSize = 512;
   32135   else if (Subtarget.hasAVX())
   32136     RegSize = 256;
   32137 
   32138   // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
   32139   // TODO: We should be able to handle larger vectors by splitting them before
   32140   // feeding them into several SADs, and then reducing over those.
   32141   if (RegSize / VT.getVectorNumElements() < 8)
   32142     return SDValue();
   32143 
   32144   // Match shuffle + add pyramid.
   32145   unsigned BinOp = 0;
   32146   SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
   32147 
   32148   // The operand is expected to be zero-extended from i8
   32149   // (verified in detectZextAbsDiff).
   32150   // In order to convert to i64 and above, an additional any/zero/sign
   32151   // extend is expected.
   32152   // The zero extend from 32 bits has no mathematical effect on the result.
   32153   // The sign extend also behaves like a zero extend here
   32154   // (it extends the sign bit, which is zero).
   32155   // So it is correct to skip the sign/zero-extend instruction.
   32156   if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
   32157     Root.getOpcode() == ISD::ZERO_EXTEND ||
   32158     Root.getOpcode() == ISD::ANY_EXTEND))
   32159     Root = Root.getOperand(0);
   32160 
   32161   // If there was a match, we want Root to be a select that is the root of an
   32162   // abs-diff pattern.
   32163   if (!Root || (Root.getOpcode() != ISD::VSELECT))
   32164     return SDValue();
   32165 
   32166   // Check whether we have an abs-diff pattern feeding into the select.
   32167   SDValue Zext0, Zext1;
   32168   if (!detectZextAbsDiff(Root, Zext0, Zext1))
   32169     return SDValue();
   32170 
   32171   // Create the SAD instruction.
   32172   SDLoc DL(Extract);
   32173   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
   32174 
   32175   // If the original vector was wider than 8 elements, sum over the results
   32176   // in the SAD vector.
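           // For example, a v16i8 source yields a v2i64 PSADBW result holding two
           // partial sums, so a single shuffle+add step below folds the high half
           // into element 0; wider sources need correspondingly more steps.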
   32177   unsigned Stages = Log2_32(VT.getVectorNumElements());
   32178   MVT SadVT = SAD.getSimpleValueType();
   32179   if (Stages > 3) {
   32180     unsigned SadElems = SadVT.getVectorNumElements();
   32181 
   32182     for(unsigned i = Stages - 3; i > 0; --i) {
   32183       SmallVector<int, 16> Mask(SadElems, -1);
   32184       for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
   32185         Mask[j] = MaskEnd + j;
   32186 
   32187       SDValue Shuffle =
   32188           DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
   32189       SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
   32190     }
   32191   }
   32192 
   32193   MVT Type = Extract->getSimpleValueType(0);
   32194   unsigned TypeSizeInBits = Type.getSizeInBits();
   32195   // Return the lowest TypeSizeInBits bits.
   32196   MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
   32197   SAD = DAG.getBitcast(ResVT, SAD);
   32198   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
   32199                      Extract->getOperand(1));
   32200 }
   32201 
   32202 // Attempt to peek through a target shuffle and extract the scalar from the
   32203 // source.
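         // For example, extracting element 0 through a PSHUFD with mask <2,3,0,1>
         // can be folded to extracting element 2 of the shuffle's source directly.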
   32204 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
   32205                                          TargetLowering::DAGCombinerInfo &DCI,
   32206                                          const X86Subtarget &Subtarget) {
   32207   if (DCI.isBeforeLegalizeOps())
   32208     return SDValue();
   32209 
   32210   SDValue Src = N->getOperand(0);
   32211   SDValue Idx = N->getOperand(1);
   32212 
   32213   EVT VT = N->getValueType(0);
   32214   EVT SrcVT = Src.getValueType();
   32215   EVT SrcSVT = SrcVT.getVectorElementType();
   32216   unsigned NumSrcElts = SrcVT.getVectorNumElements();
   32217 
   32218   // Don't attempt this for boolean mask vectors or unknown extraction indices.
   32219   if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
   32220     return SDValue();
   32221 
   32222   // Handle extract(broadcast(scalar_value)); it doesn't matter what the index is.
   32223   if (X86ISD::VBROADCAST == Src.getOpcode() &&
   32224       Src.getOperand(0).getValueType() == VT)
   32225     return Src.getOperand(0);
   32226 
   32227   // Resolve the target shuffle inputs and mask.
   32228   SmallVector<int, 16> Mask;
   32229   SmallVector<SDValue, 2> Ops;
   32230   if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
   32231     return SDValue();
   32232 
   32233   // Attempt to narrow/widen the shuffle mask to the correct size.
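           // E.g. (roughly) a 4-element mask <1,0,3,2> scaled by 2 for an 8-element
           // source becomes <2,3,0,1,6,7,4,5>; widening goes the other way when
           // adjacent mask elements form a coarser pattern.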
   32234   if (Mask.size() != NumSrcElts) {
   32235     if ((NumSrcElts % Mask.size()) == 0) {
   32236       SmallVector<int, 16> ScaledMask;
   32237       int Scale = NumSrcElts / Mask.size();
   32238       scaleShuffleMask<int>(Scale, Mask, ScaledMask);
   32239       Mask = std::move(ScaledMask);
   32240     } else if ((Mask.size() % NumSrcElts) == 0) {
   32241       SmallVector<int, 16> WidenedMask;
   32242       while (Mask.size() > NumSrcElts &&
   32243              canWidenShuffleElements(Mask, WidenedMask))
   32244         Mask = std::move(WidenedMask);
   32245       // TODO - investigate support for wider shuffle masks with known upper
   32246       // undef/zero elements for implicit zero-extension.
   32247     }
   32248   }
   32249 
   32250   // Check if narrowing/widening failed.
   32251   if (Mask.size() != NumSrcElts)
   32252     return SDValue();
   32253 
   32254   int SrcIdx = Mask[N->getConstantOperandVal(1)];
   32255   SDLoc dl(N);
   32256 
   32257   // If the shuffle source element is undef/zero then we can just accept it.
   32258   if (SrcIdx == SM_SentinelUndef)
   32259     return DAG.getUNDEF(VT);
   32260 
   32261   if (SrcIdx == SM_SentinelZero)
   32262     return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
   32263                                 : DAG.getConstant(0, dl, VT);
   32264 
   32265   SDValue SrcOp = Ops[SrcIdx / Mask.size()];
   32266   SrcOp = DAG.getBitcast(SrcVT, SrcOp);
   32267   SrcIdx = SrcIdx % Mask.size();
   32268 
   32269   // We can only extract other elements from 128-bit vectors and in certain
   32270   // circumstances, depending on SSE-level.
   32271   // TODO: Investigate using extract_subvector for larger vectors.
   32272   // TODO: Investigate float/double extraction if it will be just stored.
   32273   if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
   32274       ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
   32275     assert(SrcSVT == VT && "Unexpected extraction type");
   32276     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
   32277                        DAG.getIntPtrConstant(SrcIdx, dl));
   32278   }
   32279 
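           // E.g. extracting lane 5 of a v8i16 source becomes (PEXTRW src, 5), whose
           // i32 result is then zero-extended or truncated to the extract type.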
   32280   if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
   32281       (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
   32282     assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
   32283            "Unexpected extraction type");
   32284     unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
   32285     SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
   32286                                 DAG.getIntPtrConstant(SrcIdx, dl));
   32287     return DAG.getZExtOrTrunc(ExtOp, dl, VT);
   32288   }
   32289 
   32290   return SDValue();
   32291 }
   32292 
   32293 /// Detect vector gather/scatter index generation and convert it from being a
   32294 /// bunch of shuffles and extracts into a somewhat faster sequence.
   32295 /// For i686, the best sequence is apparently storing the value and loading
   32296 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
   32297 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   32298                                        TargetLowering::DAGCombinerInfo &DCI,
   32299                                        const X86Subtarget &Subtarget) {
   32300   if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
   32301     return NewOp;
   32302 
   32303   // TODO - Remove this once we can handle the implicit zero-extension of
   32304   // X86ISD::PEXTRW/X86ISD::PEXTRB in:
   32305   // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
   32306   // combineBasicSADPattern.
   32307   if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   32308     return SDValue();
   32309 
   32310   if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
   32311     return NewOp;
   32312 
   32313   SDValue InputVector = N->getOperand(0);
   32314   SDValue EltIdx = N->getOperand(1);
   32315 
   32316   EVT SrcVT = InputVector.getValueType();
   32317   EVT VT = N->getValueType(0);
   32318   SDLoc dl(InputVector);
   32319 
   32320   // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
   32321   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
   32322       VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
   32323     SDValue MMXSrc = InputVector.getOperand(0);
   32324 
   32325     // The bitcast source is a direct mmx result.
   32326     if (MMXSrc.getValueType() == MVT::x86mmx)
   32327       return DAG.getBitcast(VT, InputVector);
   32328   }
   32329 
   32330   // Detect mmx to i32 conversion through a v2i32 elt extract.
   32331   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
   32332       VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
   32333     SDValue MMXSrc = InputVector.getOperand(0);
   32334 
   32335     // The bitcast source is a direct mmx result.
   32336     if (MMXSrc.getValueType() == MVT::x86mmx)
   32337       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
   32338   }
   32339 
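           // Fold an i1 extract of a bitcast constant to the selected bit, e.g.
           // element 3 of (v8i1 (bitcast (i8 8))) is bit 3 of 0b00001000, i.e. 1.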
   32340   if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
   32341       isa<ConstantSDNode>(EltIdx) &&
   32342       isa<ConstantSDNode>(InputVector.getOperand(0))) {
   32343     uint64_t ExtractedElt = N->getConstantOperandVal(1);
   32344     auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
   32345     const APInt &InputValue = InputC->getAPIntValue();
   32346     uint64_t Res = InputValue[ExtractedElt];
   32347     return DAG.getConstant(Res, dl, MVT::i1);
   32348   }
   32349 
   32350   // Check whether this extract is the root of a sum of absolute differences
   32351   // pattern. This has to be done here because we really want it to happen
   32352   // pre-legalization.
   32353   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
   32354     return SAD;
   32355 
   32356   // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
   32357   if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
   32358     return Cmp;
   32359 
   32360   // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
   32361   if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
   32362     return MinMax;
   32363 
   32364   return SDValue();
   32365 }
   32366 
   32367 /// If a vector select has an operand that is -1 or 0, try to simplify the
   32368 /// select to a bitwise logic operation.
   32369 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
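         /// For example: (vselect M, -1, X) -> (or M, X), (vselect M, X, 0) -> (and M, X)
         /// and (vselect M, 0, X) -> (andnp M, X), provided M is a sign-splat mask whose
         /// elements are the same width as the result elements.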
   32370 static SDValue
   32371 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
   32372                                  TargetLowering::DAGCombinerInfo &DCI,
   32373                                  const X86Subtarget &Subtarget) {
   32374   SDValue Cond = N->getOperand(0);
   32375   SDValue LHS = N->getOperand(1);
   32376   SDValue RHS = N->getOperand(2);
   32377   EVT VT = LHS.getValueType();
   32378   EVT CondVT = Cond.getValueType();
   32379   SDLoc DL(N);
   32380   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   32381 
   32382   if (N->getOpcode() != ISD::VSELECT)
   32383     return SDValue();
   32384 
   32385   assert(CondVT.isVector() && "Vector select expects a vector selector!");
   32386 
   32387   bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
   32388   // Check if the first operand is all zeros and Cond type is vXi1.
   32389   // This situation only applies to AVX512.
   32390   if (TValIsAllZeros  && Subtarget.hasAVX512() && Cond.hasOneUse() &&
   32391       CondVT.getVectorElementType() == MVT::i1) {
   32392     // Invert the cond to not(cond) : xor(op,allones)=not(op)
   32393     SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
   32394     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
   32395     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
   32396   }
   32397 
   32398   // To use the condition operand as a bitwise mask, it must have elements that
   32399   // are the same size as the select elements. I.e., the condition operand must
   32400   // have already been promoted from the IR select condition type <N x i1>.
   32401   // Don't check if the types themselves are equal because that excludes
   32402   // vector floating-point selects.
   32403   if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
   32404     return SDValue();
   32405 
   32406   bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
   32407   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
   32408 
   32409   // Try to invert the condition if true value is not all 1s and false value is
   32410   // not all 0s.
   32411   if (!TValIsAllOnes && !FValIsAllZeros &&
   32412       // Check if the selector will be produced by CMPP*/PCMP*.
   32413       Cond.getOpcode() == ISD::SETCC &&
   32414       // Check if SETCC has already been promoted.
   32415       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
   32416           CondVT) {
   32417     bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
   32418 
   32419     if (TValIsAllZeros || FValIsAllOnes) {
   32420       SDValue CC = Cond.getOperand(2);
   32421       ISD::CondCode NewCC =
   32422           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
   32423                                Cond.getOperand(0).getValueType().isInteger());
   32424       Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
   32425                           NewCC);
   32426       std::swap(LHS, RHS);
   32427       TValIsAllOnes = FValIsAllOnes;
   32428       FValIsAllZeros = TValIsAllZeros;
   32429     }
   32430   }
   32431 
   32432   // Cond value must be 'sign splat' to be converted to a logical op.
   32433   if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
   32434     return SDValue();
   32435 
   32436   // vselect Cond, 111..., 000... -> Cond
   32437   if (TValIsAllOnes && FValIsAllZeros)
   32438     return DAG.getBitcast(VT, Cond);
   32439 
   32440   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
   32441     return SDValue();
   32442 
   32443   // vselect Cond, 111..., X -> or Cond, X
   32444   if (TValIsAllOnes) {
   32445     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
   32446     SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
   32447     return DAG.getBitcast(VT, Or);
   32448   }
   32449 
   32450   // vselect Cond, X, 000... -> and Cond, X
   32451   if (FValIsAllZeros) {
   32452     SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
   32453     SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
   32454     return DAG.getBitcast(VT, And);
   32455   }
   32456 
   32457   // vselect Cond, 000..., X -> andn Cond, X
   32458   if (TValIsAllZeros) {
   32459     MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
   32460     SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
   32461     SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
   32462     SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
   32463     return DAG.getBitcast(VT, AndN);
   32464   }
   32465 
   32466   return SDValue();
   32467 }
   32468 
   32469 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
   32470   SDValue Cond = N->getOperand(0);
   32471   SDValue LHS = N->getOperand(1);
   32472   SDValue RHS = N->getOperand(2);
   32473   SDLoc DL(N);
   32474 
   32475   auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
   32476   auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
   32477   if (!TrueC || !FalseC)
   32478     return SDValue();
   32479 
   32480   // Don't do this for crazy integer types.
   32481   EVT VT = N->getValueType(0);
   32482   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   32483     return SDValue();
   32484 
   32485   // We're going to use the condition bit in math or logic ops. We could allow
   32486   // this with a wider condition value (post-legalization it becomes an i8),
   32487   // but if nothing is creating selects that late, it doesn't matter.
   32488   if (Cond.getValueType() != MVT::i1)
   32489     return SDValue();
   32490 
   32491   // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
   32492   // 3, 5, or 9 with i32/i64, so those get transformed too.
   32493   // TODO: For constants that overflow or do not differ by power-of-2 or small
   32494   // multiplier, convert to 'and' + 'add'.
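           // For instance, (select C, 7, 3) becomes roughly (zext(C) << 2) + 3, and
           // (select C, 10, 1) becomes (zext(C) * 9) + 1, which fits an LEA.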
   32495   const APInt &TrueVal = TrueC->getAPIntValue();
   32496   const APInt &FalseVal = FalseC->getAPIntValue();
   32497   bool OV;
   32498   APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
   32499   if (OV)
   32500     return SDValue();
   32501 
   32502   APInt AbsDiff = Diff.abs();
   32503   if (AbsDiff.isPowerOf2() ||
   32504       ((VT == MVT::i32 || VT == MVT::i64) &&
   32505        (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
   32506 
   32507     // We need a positive multiplier constant for shift/LEA codegen. The 'not'
   32508     // of the condition can usually be folded into a compare predicate, but even
   32509     // without that, the sequence should be cheaper than a CMOV alternative.
   32510     if (TrueVal.slt(FalseVal)) {
   32511       Cond = DAG.getNOT(DL, Cond, MVT::i1);
   32512       std::swap(TrueC, FalseC);
   32513     }
   32514 
   32515     // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
   32516     SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
   32517 
   32518     // Multiply condition by the difference if non-one.
   32519     if (!AbsDiff.isOneValue())
   32520       R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
   32521 
   32522     // Add the base if non-zero.
   32523     if (!FalseC->isNullValue())
   32524       R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
   32525 
   32526     return R;
   32527   }
   32528 
   32529   return SDValue();
   32530 }
   32531 
   32532 /// If this is a *dynamic* select (non-constant condition) and we can match
   32533 /// this node with one of the variable blend instructions, restructure the
   32534 /// condition so that blends can use the high (sign) bit of each element.
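         /// Since a variable blend only reads the sign bit of each condition element,
         /// we ask SimplifyDemandedBits for just that bit below; work done solely to
         /// fill in the lower bits of the condition (e.g. a shift or mask used to
         /// splat a compare result across the element) can then be simplified away.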
   32535 static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
   32536                                            TargetLowering::DAGCombinerInfo &DCI,
   32537                                            const X86Subtarget &Subtarget) {
   32538   SDValue Cond = N->getOperand(0);
   32539   if (N->getOpcode() != ISD::VSELECT ||
   32540       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
   32541     return SDValue();
   32542 
   32543   // Don't optimize before the condition has been transformed to a legal type
   32544   // and don't ever optimize vector selects that map to AVX512 mask-registers.
   32545   unsigned BitWidth = Cond.getScalarValueSizeInBits();
   32546   if (BitWidth < 8 || BitWidth > 64)
   32547     return SDValue();
   32548 
   32549   // We can only handle the cases where VSELECT is directly legal on the
   32550   // subtarget. We custom lower VSELECT nodes with constant conditions and
   32551   // this makes it hard to see whether a dynamic VSELECT will correctly
   32552   // lower, so we both check the operation's status and explicitly handle the
   32553   // cases where a *dynamic* blend will fail even though a constant-condition
   32554   // blend could be custom lowered.
   32555   // FIXME: We should find a better way to handle this class of problems.
   32556   // Potentially, we should combine constant-condition vselect nodes
   32557   // pre-legalization into shuffles and not mark as many types as custom
   32558   // lowered.
   32559   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   32560   EVT VT = N->getValueType(0);
   32561   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
   32562     return SDValue();
   32563   // FIXME: We don't support i16-element blends currently. We could and
   32564   // should support them by making *all* the bits in the condition be set
   32565   // rather than just the high bit and using an i8-element blend.
   32566   if (VT.getVectorElementType() == MVT::i16)
   32567     return SDValue();
   32568   // Dynamic blending was only available from SSE4.1 onward.
   32569   if (VT.is128BitVector() && !Subtarget.hasSSE41())
   32570     return SDValue();
   32571   // Byte blends are only available in AVX2.
   32572   if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
   32573     return SDValue();
   32574   // There are no 512-bit blend instructions that use sign bits.
   32575   if (VT.is512BitVector())
   32576     return SDValue();
   32577 
   32578   // TODO: Add other opcodes eventually lowered into BLEND.
   32579   for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
   32580        UI != UE; ++UI)
   32581     if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
   32582       return SDValue();
   32583 
   32584   APInt DemandedMask(APInt::getSignMask(BitWidth));
   32585   KnownBits Known;
   32586   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   32587                                         !DCI.isBeforeLegalizeOps());
   32588   if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
   32589     return SDValue();
   32590 
   32591   // If we changed the computation somewhere in the DAG, this change will
   32592   // affect all users of Cond. Update all the nodes so that we do not use
   32593   // the generic VSELECT anymore. Otherwise, we may perform wrong
   32594   // optimizations as we messed with the actual expectation for the vector
   32595   // boolean values.
   32596   for (SDNode *U : Cond->uses()) {
   32597     SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0),
   32598                              Cond, U->getOperand(1), U->getOperand(2));
   32599     DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
   32600   }
   32601   DCI.CommitTargetLoweringOpt(TLO);
   32602   return SDValue(N, 0);
   32603 }
   32604 
   32605 /// Do target-specific dag combines on SELECT and VSELECT nodes.
   32606 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
   32607                              TargetLowering::DAGCombinerInfo &DCI,
   32608                              const X86Subtarget &Subtarget) {
   32609   SDLoc DL(N);
   32610   SDValue Cond = N->getOperand(0);
   32611   // Get the LHS/RHS of the select.
   32612   SDValue LHS = N->getOperand(1);
   32613   SDValue RHS = N->getOperand(2);
   32614   EVT VT = LHS.getValueType();
   32615   EVT CondVT = Cond.getValueType();
   32616   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   32617 
   32618   // Convert vselects with constant condition into shuffles.
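           // E.g. (vselect <-1,0,-1,0>, A, B) becomes (shuffle A, B, <0,5,2,7>).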
   32619   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
   32620       DCI.isBeforeLegalizeOps()) {
   32621     SmallVector<int, 64> Mask(VT.getVectorNumElements(), -1);
   32622     for (int i = 0, Size = Mask.size(); i != Size; ++i) {
   32623       SDValue CondElt = Cond->getOperand(i);
   32624       Mask[i] = i;
   32625       // Arbitrarily choose from the 2nd operand if the select condition element
   32626       // is undef.
   32627       // TODO: Can we do better by matching patterns such as even/odd?
   32628       if (CondElt.isUndef() || isNullConstant(CondElt))
   32629         Mask[i] += Size;
   32630     }
   32631 
   32632     return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
   32633   }
   32634 
   32635   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   32636   // instructions match the semantics of the common C idiom x<y?x:y but not
   32637   // x<=y?x:y, because of how they handle negative zero (which can be
   32638   // ignored in unsafe-math mode).
   32639   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
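           // E.g. (select (setolt x, y), x, y) maps to X86ISD::FMIN below, while the
           // unordered/reversed forms either swap operands or bail out to stay
           // correct for NaNs and signed zero.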
   32640   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
   32641       VT != MVT::f80 && VT != MVT::f128 &&
   32642       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
   32643       (Subtarget.hasSSE2() ||
   32644        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
   32645     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   32646 
   32647     unsigned Opcode = 0;
   32648     // Check for x CC y ? x : y.
   32649     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   32650         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   32651       switch (CC) {
   32652       default: break;
   32653       case ISD::SETULT:
   32654         // Converting this to a min would handle NaNs incorrectly, and swapping
   32655         // the operands would cause it to handle comparisons between positive
   32656         // and negative zero incorrectly.
   32657         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   32658           if (!DAG.getTarget().Options.UnsafeFPMath &&
   32659               !(DAG.isKnownNeverZeroFloat(LHS) ||
   32660                 DAG.isKnownNeverZeroFloat(RHS)))
   32661             break;
   32662           std::swap(LHS, RHS);
   32663         }
   32664         Opcode = X86ISD::FMIN;
   32665         break;
   32666       case ISD::SETOLE:
   32667         // Converting this to a min would handle comparisons between positive
   32668         // and negative zero incorrectly.
   32669         if (!DAG.getTarget().Options.UnsafeFPMath &&
   32670             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
   32671           break;
   32672         Opcode = X86ISD::FMIN;
   32673         break;
   32674       case ISD::SETULE:
   32675         // Converting this to a min would handle both negative zeros and NaNs
   32676         // incorrectly, but we can swap the operands to fix both.
   32677         std::swap(LHS, RHS);
   32678         LLVM_FALLTHROUGH;
   32679       case ISD::SETOLT:
   32680       case ISD::SETLT:
   32681       case ISD::SETLE:
   32682         Opcode = X86ISD::FMIN;
   32683         break;
   32684 
   32685       case ISD::SETOGE:
   32686         // Converting this to a max would handle comparisons between positive
   32687         // and negative zero incorrectly.
   32688         if (!DAG.getTarget().Options.UnsafeFPMath &&
   32689             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
   32690           break;
   32691         Opcode = X86ISD::FMAX;
   32692         break;
   32693       case ISD::SETUGT:
   32694         // Converting this to a max would handle NaNs incorrectly, and swapping
   32695         // the operands would cause it to handle comparisons between positive
   32696         // and negative zero incorrectly.
   32697         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
   32698           if (!DAG.getTarget().Options.UnsafeFPMath &&
   32699               !(DAG.isKnownNeverZeroFloat(LHS) ||
   32700                 DAG.isKnownNeverZeroFloat(RHS)))
   32701             break;
   32702           std::swap(LHS, RHS);
   32703         }
   32704         Opcode = X86ISD::FMAX;
   32705         break;
   32706       case ISD::SETUGE:
   32707         // Converting this to a max would handle both negative zeros and NaNs
   32708         // incorrectly, but we can swap the operands to fix both.
   32709         std::swap(LHS, RHS);
   32710         LLVM_FALLTHROUGH;
   32711       case ISD::SETOGT:
   32712       case ISD::SETGT:
   32713       case ISD::SETGE:
   32714         Opcode = X86ISD::FMAX;
   32715         break;
   32716       }
   32717     // Check for x CC y ? y : x -- a min/max with reversed arms.
   32718     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
   32719                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
   32720       switch (CC) {
   32721       default: break;
   32722       case ISD::SETOGE:
   32723         // Converting this to a min would handle comparisons between positive
   32724         // and negative zero incorrectly, and swapping the operands would
   32725         // cause it to handle NaNs incorrectly.
   32726         if (!DAG.getTarget().Options.UnsafeFPMath &&
   32727             !(DAG.isKnownNeverZeroFloat(LHS) ||
   32728               DAG.isKnownNeverZeroFloat(RHS))) {
   32729           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   32730             break;
   32731           std::swap(LHS, RHS);
   32732         }
   32733         Opcode = X86ISD::FMIN;
   32734         break;
   32735       case ISD::SETUGT:
   32736         // Converting this to a min would handle NaNs incorrectly.
   32737         if (!DAG.getTarget().Options.UnsafeFPMath &&
   32738             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
   32739           break;
   32740         Opcode = X86ISD::FMIN;
   32741         break;
   32742       case ISD::SETUGE:
   32743         // Converting this to a min would handle both negative zeros and NaNs
   32744         // incorrectly, but we can swap the operands to fix both.
   32745         std::swap(LHS, RHS);
   32746         LLVM_FALLTHROUGH;
   32747       case ISD::SETOGT:
   32748       case ISD::SETGT:
   32749       case ISD::SETGE:
   32750         Opcode = X86ISD::FMIN;
   32751         break;
   32752 
   32753       case ISD::SETULT:
   32754         // Converting this to a max would handle NaNs incorrectly.
   32755         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   32756           break;
   32757         Opcode = X86ISD::FMAX;
   32758         break;
   32759       case ISD::SETOLE:
   32760         // Converting this to a max would handle comparisons between positive
   32761         // and negative zero incorrectly, and swapping the operands would
   32762         // cause it to handle NaNs incorrectly.
   32763         if (!DAG.getTarget().Options.UnsafeFPMath &&
   32764             !DAG.isKnownNeverZeroFloat(LHS) &&
   32765             !DAG.isKnownNeverZeroFloat(RHS)) {
   32766           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
   32767             break;
   32768           std::swap(LHS, RHS);
   32769         }
   32770         Opcode = X86ISD::FMAX;
   32771         break;
   32772       case ISD::SETULE:
   32773         // Converting this to a max would handle both negative zeros and NaNs
   32774         // incorrectly, but we can swap the operands to fix both.
   32775         std::swap(LHS, RHS);
   32776         LLVM_FALLTHROUGH;
   32777       case ISD::SETOLT:
   32778       case ISD::SETLT:
   32779       case ISD::SETLE:
   32780         Opcode = X86ISD::FMAX;
   32781         break;
   32782       }
   32783     }
   32784 
   32785     if (Opcode)
   32786       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   32787   }
   32788 
   32789   // Some mask scalar intrinsics rely on checking if only one bit is set
   32790   // and implement it in C code like this:
   32791   // A[0] = (U & 1) ? A[0] : W[0];
   32792   // This creates some redundant instructions that break pattern matching.
   32793   // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
   32794   if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
   32795       Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
   32796     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   32797     SDValue AndNode = Cond.getOperand(0);
   32798     if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
   32799         isNullConstant(Cond.getOperand(1)) &&
   32800         isOneConstant(AndNode.getOperand(1))) {
   32801       // LHS and RHS are swapped because the setcc outputs 1 when the AND
   32802       // resulted in 0, and vice versa.
   32803       AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
   32804       return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
   32805     }
   32806   }
   32807 
   32808   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
   32809   // lowering on KNL. In this case we convert it to
   32810   // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
   32811   // The same applies to all vectors of i8 and i16 without BWI.
   32812   // Make sure we extend these even before type legalization gets a chance to
   32813   // split wide vectors.
   32814   // Since SKX these selects have a proper lowering.
   32815   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
   32816       CondVT.getVectorElementType() == MVT::i1 &&
   32817       VT.getVectorNumElements() > 4 &&
   32818       (VT.getVectorElementType() == MVT::i8 ||
   32819        VT.getVectorElementType() == MVT::i16)) {
   32820     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
   32821     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
   32822   }
   32823 
   32824   if (SDValue V = combineSelectOfTwoConstants(N, DAG))
   32825     return V;
   32826 
   32827   // Canonicalize max and min:
   32828   // (x > y) ? x : y -> (x >= y) ? x : y
   32829   // (x < y) ? x : y -> (x <= y) ? x : y
   32830   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
   32831   // the need for an extra compare
   32832   // against zero. e.g.
   32833   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
   32834   // subl   %esi, %edi
   32835   // testl  %edi, %edi
   32836   // movl   $0, %eax
   32837   // cmovgl %edi, %eax
   32838   // =>
   32839   // xorl   %eax, %eax
   32840   // subl   %esi, %edi
   32841   // cmovsl %eax, %edi
   32842   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
   32843       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
   32844       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
   32845     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   32846     switch (CC) {
   32847     default: break;
   32848     case ISD::SETLT:
   32849     case ISD::SETGT: {
   32850       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
   32851       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
   32852                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
   32853       return DAG.getSelect(DL, VT, Cond, LHS, RHS);
   32854     }
   32855     }
   32856   }
   32857 
   32858   // Early exit check
   32859   if (!TLI.isTypeLegal(VT))
   32860     return SDValue();
   32861 
   32862   // Match VSELECTs into subs with unsigned saturation.
   32863   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
   32864       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
   32865       ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
   32866        (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
   32867     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   32868 
   32869     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
   32870     // left side invert the predicate to simplify logic below.
   32871     SDValue Other;
   32872     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
   32873       Other = RHS;
   32874       CC = ISD::getSetCCInverse(CC, true);
   32875     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
   32876       Other = LHS;
   32877     }
   32878 
   32879     if (Other.getNode() && Other->getNumOperands() == 2 &&
   32880         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
   32881       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
   32882       SDValue CondRHS = Cond->getOperand(1);
   32883 
   32884       auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   32885                              ArrayRef<SDValue> Ops) {
   32886         return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
   32887       };
   32888 
   32889       // Look for a general sub with unsigned saturation first.
   32890       // x >= y ? x-y : 0 --> subus x, y
   32891       // x >  y ? x-y : 0 --> subus x, y
   32892       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
   32893           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
   32894         return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
   32895                                 SUBUSBuilder);
   32896 
   32897       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
   32898         if (isa<BuildVectorSDNode>(CondRHS)) {
   32899           // If the RHS is a constant we have to reverse the const
   32900           // canonicalization.
   32901           // x > C-1 ? x+(-C) : 0 --> subus x, C
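                   // E.g. with C == 42: x > 41 ? x + (-42) : 0 --> subus x, 42.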
   32902           auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
   32903             return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
   32904           };
   32905           if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
   32906               ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
   32907             OpRHS = DAG.getNode(ISD::SUB, DL, VT,
   32908                                 DAG.getConstant(0, DL, VT), OpRHS);
   32909             return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
   32910                                     SUBUSBuilder);
   32911           }
   32912 
   32913           // Another special case: If C was a sign bit, the sub has been
   32914           // canonicalized into a xor.
   32915           // FIXME: Would it be better to use computeKnownBits to determine
   32916           //        whether it's safe to decanonicalize the xor?
   32917           // x s< 0 ? x^C : 0 --> subus x, C
   32918           if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode())
   32919             if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
   32920                 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
   32921                 OpRHSConst->getAPIntValue().isSignMask()) {
   32922               OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
   32923               // Note that we have to rebuild the RHS constant here to ensure we
   32924               // don't rely on particular values of undef lanes.
   32925               return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
   32926                                       SUBUSBuilder);
   32927             }
   32928         }
   32929     }
   32930   }
   32931 
   32932   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
   32933     return V;
   32934 
   32935   if (SDValue V = combineVSelectToShrunkBlend(N, DAG, DCI, Subtarget))
   32936     return V;
   32937 
   32938   // Custom action for SELECT MMX
   32939   if (VT == MVT::x86mmx) {
   32940     LHS = DAG.getBitcast(MVT::i64, LHS);
   32941     RHS = DAG.getBitcast(MVT::i64, RHS);
   32942     SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
   32943     return DAG.getBitcast(VT, newSelect);
   32944   }
   32945 
   32946   return SDValue();
   32947 }
   32948 
   32949 /// Combine:
   32950 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
   32951 /// to:
   32952 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
   32953 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
   32954 /// Note that this is only legal for some op/cc combinations.
   32955 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
   32956                                        SelectionDAG &DAG,
   32957                                        const X86Subtarget &Subtarget) {
   32958   // This combine only operates on CMP-like nodes.
   32959   if (!(Cmp.getOpcode() == X86ISD::CMP ||
   32960         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
   32961     return SDValue();
   32962 
   32963   // Can't replace the cmp if it has more uses than the one we're looking at.
   32964   // FIXME: We would like to be able to handle this, but would need to make sure
   32965   // all uses were updated.
   32966   if (!Cmp.hasOneUse())
   32967     return SDValue();
   32968 
   32969   // This only applies to variations of the common case:
   32970   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
   32971   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
   32972   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
   32973   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
   32974   // Using the proper condcodes (see below), overflow is checked for.
   32975 
   32976   // FIXME: We can generalize both constraints:
   32977   // - XOR/OR/AND (if they were made to survive AtomicExpand)
   32978   // - LHS != 1
   32979   // if the result is compared.
   32980 
   32981   SDValue CmpLHS = Cmp.getOperand(0);
   32982   SDValue CmpRHS = Cmp.getOperand(1);
   32983 
   32984   if (!CmpLHS.hasOneUse())
   32985     return SDValue();
   32986 
   32987   unsigned Opc = CmpLHS.getOpcode();
   32988   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
   32989     return SDValue();
   32990 
   32991   SDValue OpRHS = CmpLHS.getOperand(2);
   32992   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
   32993   if (!OpRHSC)
   32994     return SDValue();
   32995 
   32996   APInt Addend = OpRHSC->getAPIntValue();
   32997   if (Opc == ISD::ATOMIC_LOAD_SUB)
   32998     Addend = -Addend;
   32999 
   33000   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
   33001   if (!CmpRHSC)
   33002     return SDValue();
   33003 
   33004   APInt Comparison = CmpRHSC->getAPIntValue();
   33005 
   33006   // If the addend is the negation of the comparison value, then we can do
   33007   // a full comparison by emitting the atomic arithmetic as a locked sub.
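           // E.g. (cmp (atomic_load_add x, -4), 4) becomes a LOCK SUB of 4 whose
           // EFLAGS feed the comparison directly, since old - 4 and the compare of
           // the old value against 4 set the flags identically.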
   33008   if (Comparison == -Addend) {
   33009     // The CC is fine, but we need to rewrite the LHS of the comparison as an
   33010     // atomic sub.
   33011     auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
   33012     auto AtomicSub = DAG.getAtomic(
   33013         ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
   33014         /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
   33015         /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
   33016         AN->getMemOperand());
   33017     // If the comparison uses the CF flag we can't use INC/DEC instructions.
   33018     bool NeedCF = false;
   33019     switch (CC) {
   33020     default: break;
   33021     case X86::COND_A: case X86::COND_AE:
   33022     case X86::COND_B: case X86::COND_BE:
   33023       NeedCF = true;
   33024       break;
   33025     }
   33026     auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
   33027     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
   33028                                   DAG.getUNDEF(CmpLHS.getValueType()));
   33029     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
   33030     return LockOp;
   33031   }
   33032 
   33033   // We can handle comparisons with zero in a number of cases by manipulating
   33034   // the CC used.
   33035   if (!Comparison.isNullValue())
   33036     return SDValue();
   33037 
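           // The LOCKed op sets EFLAGS from the value *after* the arithmetic, while
           // the original CMP tested the value *before* it, so shift the condition
           // by one: e.g. old < 0 (COND_S) holds iff old + 1 <= 0 (COND_LE); signed
           // overflow is handled because COND_LE reads SF^OF rather than SF alone.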
   33038   if (CC == X86::COND_S && Addend == 1)
   33039     CC = X86::COND_LE;
   33040   else if (CC == X86::COND_NS && Addend == 1)
   33041     CC = X86::COND_G;
   33042   else if (CC == X86::COND_G && Addend == -1)
   33043     CC = X86::COND_GE;
   33044   else if (CC == X86::COND_LE && Addend == -1)
   33045     CC = X86::COND_L;
   33046   else
   33047     return SDValue();
   33048 
   33049   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
   33050   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
   33051                                 DAG.getUNDEF(CmpLHS.getValueType()));
   33052   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
   33053   return LockOp;
   33054 }
   33055 
   33056 // Check whether a boolean test is testing a boolean value generated by
   33057 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition
   33058 // code.
   33059 //
   33060 // Simplify the following patterns:
   33061 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
   33062 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
   33063 // to (Op EFLAGS Cond)
   33064 //
   33065 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
   33066 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
   33067 // to (Op EFLAGS !Cond)
   33068 //
   33069 // where Op could be BRCOND or CMOV.
   33070 //
   33071 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   33072   // This combine only operates on CMP-like nodes.
   33073   if (!(Cmp.getOpcode() == X86ISD::CMP ||
   33074         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
   33075     return SDValue();
   33076 
   33077   // Quit if not used as a boolean value.
   33078   if (CC != X86::COND_E && CC != X86::COND_NE)
   33079     return SDValue();
   33080 
   33081   // Check CMP operands. One of them should be 0 or 1 and the other should be
   33082 // a SetCC or extended from it.
   33083   SDValue Op1 = Cmp.getOperand(0);
   33084   SDValue Op2 = Cmp.getOperand(1);
   33085 
   33086   SDValue SetCC;
   33087   const ConstantSDNode* C = nullptr;
   33088   bool needOppositeCond = (CC == X86::COND_E);
   33089   bool checkAgainstTrue = false; // Is it a comparison against 1?
   33090 
   33091   if ((C = dyn_cast<ConstantSDNode>(Op1)))
   33092     SetCC = Op2;
   33093   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
   33094     SetCC = Op1;
   33095   else // Quit if neither operand is a constant.
   33096     return SDValue();
   33097 
   33098   if (C->getZExtValue() == 1) {
   33099     needOppositeCond = !needOppositeCond;
   33100     checkAgainstTrue = true;
   33101   } else if (C->getZExtValue() != 0)
   33102     // Quit if the constant is neither 0 nor 1.
   33103     return SDValue();
   33104 
   33105   bool truncatedToBoolWithAnd = false;
   33106   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
   33107   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
   33108          SetCC.getOpcode() == ISD::TRUNCATE ||
   33109          SetCC.getOpcode() == ISD::AND) {
   33110     if (SetCC.getOpcode() == ISD::AND) {
   33111       int OpIdx = -1;
   33112       if (isOneConstant(SetCC.getOperand(0)))
   33113         OpIdx = 1;
   33114       if (isOneConstant(SetCC.getOperand(1)))
   33115         OpIdx = 0;
   33116       if (OpIdx < 0)
   33117         break;
   33118       SetCC = SetCC.getOperand(OpIdx);
   33119       truncatedToBoolWithAnd = true;
   33120     } else
   33121       SetCC = SetCC.getOperand(0);
   33122   }
   33123 
   33124   switch (SetCC.getOpcode()) {
   33125   case X86ISD::SETCC_CARRY:
   33126     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
   33127     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
   33128     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
   33129     // truncated to i1 using 'and'.
   33130     if (checkAgainstTrue && !truncatedToBoolWithAnd)
   33131       break;
   33132     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
   33133            "Invalid use of SETCC_CARRY!");
   33134     LLVM_FALLTHROUGH;
   33135   case X86ISD::SETCC:
   33136     // Set the condition code or opposite one if necessary.
   33137     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
   33138     if (needOppositeCond)
   33139       CC = X86::GetOppositeBranchCondition(CC);
   33140     return SetCC.getOperand(1);
   33141   case X86ISD::CMOV: {
   33142     // Check whether the false/true values are canonical, i.e. 0 or 1.
   33143     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
   33144     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
   33145     // Quit if true value is not a constant.
   33146     if (!TVal)
   33147       return SDValue();
   33148     // Quit if false value is not a constant.
   33149     if (!FVal) {
   33150       SDValue Op = SetCC.getOperand(0);
   33151       // Skip 'zext' or 'trunc' node.
   33152       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
   33153           Op.getOpcode() == ISD::TRUNCATE)
   33154         Op = Op.getOperand(0);
   33155       // A special case for rdrand/rdseed, where 0 is set if false cond is
   33156       // found.
   33157       if ((Op.getOpcode() != X86ISD::RDRAND &&
   33158            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
   33159         return SDValue();
   33160     }
   33161     // Quit if false value is not the constant 0 or 1.
   33162     bool FValIsFalse = true;
   33163     if (FVal && FVal->getZExtValue() != 0) {
   33164       if (FVal->getZExtValue() != 1)
   33165         return SDValue();
   33166       // If FVal is 1, opposite cond is needed.
   33167       needOppositeCond = !needOppositeCond;
   33168       FValIsFalse = false;
   33169     }
   33170     // Quit if TVal is not the constant opposite of FVal.
   33171     if (FValIsFalse && TVal->getZExtValue() != 1)
   33172       return SDValue();
   33173     if (!FValIsFalse && TVal->getZExtValue() != 0)
   33174       return SDValue();
   33175     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
   33176     if (needOppositeCond)
   33177       CC = X86::GetOppositeBranchCondition(CC);
   33178     return SetCC.getOperand(3);
   33179   }
   33180   }
   33181 
   33182   return SDValue();
   33183 }
   33184 
   33185 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
   33186 /// Match:
   33187 ///   (X86or (X86setcc) (X86setcc))
   33188 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
   33189 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
   33190                                            X86::CondCode &CC1, SDValue &Flags,
   33191                                            bool &isAnd) {
   33192   if (Cond->getOpcode() == X86ISD::CMP) {
   33193     if (!isNullConstant(Cond->getOperand(1)))
   33194       return false;
   33195 
   33196     Cond = Cond->getOperand(0);
   33197   }
   33198 
   33199   isAnd = false;
   33200 
   33201   SDValue SetCC0, SetCC1;
   33202   switch (Cond->getOpcode()) {
   33203   default: return false;
   33204   case ISD::AND:
   33205   case X86ISD::AND:
   33206     isAnd = true;
   33207     LLVM_FALLTHROUGH;
   33208   case ISD::OR:
   33209   case X86ISD::OR:
   33210     SetCC0 = Cond->getOperand(0);
   33211     SetCC1 = Cond->getOperand(1);
   33212     break;
   33213   }
   33214 
   33215   // Make sure we have SETCC nodes, using the same flags value.
   33216   if (SetCC0.getOpcode() != X86ISD::SETCC ||
   33217       SetCC1.getOpcode() != X86ISD::SETCC ||
   33218       SetCC0->getOperand(1) != SetCC1->getOperand(1))
   33219     return false;
   33220 
   33221   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
   33222   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
   33223   Flags = SetCC0->getOperand(1);
   33224   return true;
   33225 }
   33226 
   33227 // When legalizing carry, we create carries via add X, -1.
   33228 // If that comes from an actual carry, via setcc, we use the
   33229 // carry directly.
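         // E.g. legalization may produce (X86ISD::ADD (zext (setcc COND_B, flags)), -1)
         // purely to regenerate CF; here we return 'flags' and reuse that carry as-is.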
   33230 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
   33231   if (EFLAGS.getOpcode() == X86ISD::ADD) {
   33232     if (isAllOnesConstant(EFLAGS.getOperand(1))) {
   33233       SDValue Carry = EFLAGS.getOperand(0);
   33234       while (Carry.getOpcode() == ISD::TRUNCATE ||
   33235              Carry.getOpcode() == ISD::ZERO_EXTEND ||
   33236              Carry.getOpcode() == ISD::SIGN_EXTEND ||
   33237              Carry.getOpcode() == ISD::ANY_EXTEND ||
   33238              (Carry.getOpcode() == ISD::AND &&
   33239               isOneConstant(Carry.getOperand(1))))
   33240         Carry = Carry.getOperand(0);
   33241       if (Carry.getOpcode() == X86ISD::SETCC ||
   33242           Carry.getOpcode() == X86ISD::SETCC_CARRY) {
   33243         if (Carry.getConstantOperandVal(0) == X86::COND_B)
   33244           return Carry.getOperand(1);
   33245       }
   33246     }
   33247   }
   33248 
   33249   return SDValue();
   33250 }
   33251 
   33252 /// Optimize an EFLAGS definition used according to the condition code \p CC
   33253 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
   33254 /// uses of chain values.
   33255 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
   33256                                   SelectionDAG &DAG,
   33257                                   const X86Subtarget &Subtarget) {
   33258   if (CC == X86::COND_B)
   33259     if (SDValue Flags = combineCarryThroughADD(EFLAGS))
   33260       return Flags;
   33261 
   33262   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
   33263     return R;
   33264   return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
   33265 }
   33266 
   33267 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
   33268 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
   33269                            TargetLowering::DAGCombinerInfo &DCI,
   33270                            const X86Subtarget &Subtarget) {
   33271   SDLoc DL(N);
   33272 
   33273   SDValue FalseOp = N->getOperand(0);
   33274   SDValue TrueOp = N->getOperand(1);
   33275   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
   33276   SDValue Cond = N->getOperand(3);
   33277 
   33278   // Try to simplify the EFLAGS and condition code operands.
   33279   // We can't always do this as FCMOV only supports a subset of X86 cond.
   33280   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
   33281     if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
   33282       SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
   33283         Flags};
   33284       return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
   33285     }
   33286   }
   33287 
   33288   // If this is a select between two integer constants, try to do some
   33289   // optimizations.  Note that the operands are ordered the opposite of SELECT
   33290   // operands.
   33291   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
   33292     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
   33293       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
   33294       // larger than FalseC (the false value).
   33295       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
   33296         CC = X86::GetOppositeBranchCondition(CC);
   33297         std::swap(TrueC, FalseC);
   33298         std::swap(TrueOp, FalseOp);
   33299       }
   33300 
   33301       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
   33302       // This is efficient for any integer data type (including i8/i16) and
   33303       // shift amount.
   33304       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
   33305         Cond = getSETCC(CC, Cond, DL, DAG);
   33306 
   33307         // Zero extend the condition if needed.
   33308         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
   33309 
   33310         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
   33311         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
   33312                            DAG.getConstant(ShAmt, DL, MVT::i8));
   33313         return Cond;
   33314       }
   33315 
   33316       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is efficient
   33317       // for any integer data type, including i8/i16.
   33318       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
   33319         Cond = getSETCC(CC, Cond, DL, DAG);
   33320 
   33321         // Zero extend the condition if needed.
   33322         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
   33323                            FalseC->getValueType(0), Cond);
   33324         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   33325                            SDValue(FalseC, 0));
   33326         return Cond;
   33327       }
   33328 
   33329       // Optimize cases that will turn into an LEA instruction.  This requires
   33330       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   33331       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
   33332         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
   33333         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
   33334 
   33335         bool isFastMultiplier = false;
   33336         if (Diff < 10) {
   33337           switch ((unsigned char)Diff) {
   33338           default: break;
   33339           case 1:  // result = add base, cond
   33340           case 2:  // result = lea base(    , cond*2)
   33341           case 3:  // result = lea base(cond, cond*2)
   33342           case 4:  // result = lea base(    , cond*4)
   33343           case 5:  // result = lea base(cond, cond*4)
   33344           case 8:  // result = lea base(    , cond*8)
   33345           case 9:  // result = lea base(cond, cond*8)
   33346             isFastMultiplier = true;
   33347             break;
   33348           }
   33349         }
   33350 
   33351         if (isFastMultiplier) {
   33352           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
   33353           Cond = getSETCC(CC, Cond, DL ,DAG);
   33354           // Zero extend the condition if needed.
   33355           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
   33356                              Cond);
   33357           // Scale the condition by the difference.
   33358           if (Diff != 1)
   33359             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
   33360                                DAG.getConstant(Diff, DL, Cond.getValueType()));
   33361 
   33362           // Add the base if non-zero.
   33363           if (FalseC->getAPIntValue() != 0)
   33364             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
   33365                                SDValue(FalseC, 0));
   33366           return Cond;
   33367         }
   33368       }
   33369     }
   33370   }
   33371 
   33372   // Handle these cases:
    33373   //   (select (x != c), e, c) -> (select (x != c), e, x),
    33374   //   (select (x == c), c, e) -> (select (x == c), x, e)
    33375   // where c is an integer constant, and the "select" is the combination
   33376   // of CMOV and CMP.
   33377   //
    33378   // The rationale for this change is that a conditional-move from a constant
    33379   // needs two instructions, whereas a conditional-move from a register needs
    33380   // only one instruction.
   33381   //
    33382   // CAVEAT: Replacing a constant with a symbolic value may obscure some
    33383   //  instruction-combining opportunities, so this optimization needs to be
    33384   //  postponed as late as possible.
   33385   //
   33386   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
   33387     // the DCI.xxxx conditions are provided to postpone the optimization as
   33388     // late as possible.
   33389 
   33390     ConstantSDNode *CmpAgainst = nullptr;
   33391     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
   33392         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
   33393         !isa<ConstantSDNode>(Cond.getOperand(0))) {
   33394 
   33395       if (CC == X86::COND_NE &&
   33396           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
   33397         CC = X86::GetOppositeBranchCondition(CC);
   33398         std::swap(TrueOp, FalseOp);
   33399       }
   33400 
   33401       if (CC == X86::COND_E &&
   33402           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
   33403         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
   33404                           DAG.getConstant(CC, DL, MVT::i8), Cond };
   33405         return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
   33406       }
   33407     }
   33408   }
   33409 
   33410   // Fold and/or of setcc's to double CMOV:
   33411   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
   33412   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
   33413   //
   33414   // This combine lets us generate:
   33415   //   cmovcc1 (jcc1 if we don't have CMOV)
   33416   //   cmovcc2 (same)
   33417   // instead of:
   33418   //   setcc1
   33419   //   setcc2
   33420   //   and/or
   33421   //   cmovne (jne if we don't have CMOV)
   33422   // When we can't use the CMOV instruction, it might increase branch
   33423   // mispredicts.
   33424   // When we can use CMOV, or when there is no mispredict, this improves
   33425   // throughput and reduces register pressure.
   33426   //
   33427   if (CC == X86::COND_NE) {
   33428     SDValue Flags;
   33429     X86::CondCode CC0, CC1;
   33430     bool isAndSetCC;
   33431     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
   33432       if (isAndSetCC) {
   33433         std::swap(FalseOp, TrueOp);
   33434         CC0 = X86::GetOppositeBranchCondition(CC0);
   33435         CC1 = X86::GetOppositeBranchCondition(CC1);
   33436       }
   33437 
   33438       SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
   33439         Flags};
   33440       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
   33441       SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
   33442       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
   33443       return CMOV;
   33444     }
   33445   }
   33446 
   33447   // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
   33448   //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
   33449   // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
   33450   //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
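            // For example, with C1 == 64 and C2 == 32:
            //   (CMOV 64, (ADD (CTTZ X), 32), (X != 0)) ->
            //     (ADD (CMOV 32, (CTTZ X), (X != 0)), 32)
            // Both forms give CTTZ(X) + 32 for nonzero X and 64 for zero X.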
   33451   if ((CC == X86::COND_NE || CC == X86::COND_E) &&
   33452       Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
   33453     SDValue Add = TrueOp;
   33454     SDValue Const = FalseOp;
   33455     // Canonicalize the condition code for easier matching and output.
   33456     if (CC == X86::COND_E) {
   33457       std::swap(Add, Const);
   33458       CC = X86::COND_NE;
   33459     }
   33460 
   33461     // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
   33462     if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
   33463         Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
   33464         (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
   33465          Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
   33466         Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
   33467       EVT VT = N->getValueType(0);
   33468       // This should constant fold.
   33469       SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
   33470       SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
   33471                                  DAG.getConstant(CC, DL, MVT::i8), Cond);
   33472       return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
   33473     }
   33474   }
   33475 
   33476   return SDValue();
   33477 }
   33478 
   33479 /// Different mul shrinking modes.
   33480 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
   33481 
   33482 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
   33483   EVT VT = N->getOperand(0).getValueType();
   33484   if (VT.getScalarSizeInBits() != 32)
   33485     return false;
   33486 
   33487   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
   33488   unsigned SignBits[2] = {1, 1};
   33489   bool IsPositive[2] = {false, false};
   33490   for (unsigned i = 0; i < 2; i++) {
   33491     SDValue Opd = N->getOperand(i);
   33492 
    33493     // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
   33494     // compute signbits for it separately.
   33495     if (Opd.getOpcode() == ISD::ANY_EXTEND) {
   33496       // For anyextend, it is safe to assume an appropriate number of leading
   33497       // sign/zero bits.
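                // (An i8 source leaves at least 32 - 8 + 1 == 25 sign bits in the
                // i32 lanes under that assumption; an i16 source leaves at least 17.)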
   33498       if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
   33499         SignBits[i] = 25;
   33500       else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
   33501                MVT::i16)
   33502         SignBits[i] = 17;
   33503       else
   33504         return false;
   33505       IsPositive[i] = true;
   33506     } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
    33507       // All the operands of the BUILD_VECTOR need to be integer constants.
   33508       // Find the smallest value range which all the operands belong to.
   33509       SignBits[i] = 32;
   33510       IsPositive[i] = true;
   33511       for (const SDValue &SubOp : Opd.getNode()->op_values()) {
   33512         if (SubOp.isUndef())
   33513           continue;
   33514         auto *CN = dyn_cast<ConstantSDNode>(SubOp);
   33515         if (!CN)
   33516           return false;
   33517         APInt IntVal = CN->getAPIntValue();
   33518         if (IntVal.isNegative())
   33519           IsPositive[i] = false;
   33520         SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
   33521       }
   33522     } else {
   33523       SignBits[i] = DAG.ComputeNumSignBits(Opd);
   33524       if (Opd.getOpcode() == ISD::ZERO_EXTEND)
   33525         IsPositive[i] = true;
   33526     }
   33527   }
   33528 
   33529   bool AllPositive = IsPositive[0] && IsPositive[1];
   33530   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
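            // A value that fits in N signed bits has at least 33 - N sign bits in
            // an i32 lane, and one that fits in N unsigned bits has at least 32 - N;
            // hence the 25/24/17/16 thresholds below.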
   33531   // When ranges are from -128 ~ 127, use MULS8 mode.
   33532   if (MinSignBits >= 25)
   33533     Mode = MULS8;
   33534   // When ranges are from 0 ~ 255, use MULU8 mode.
   33535   else if (AllPositive && MinSignBits >= 24)
   33536     Mode = MULU8;
   33537   // When ranges are from -32768 ~ 32767, use MULS16 mode.
   33538   else if (MinSignBits >= 17)
   33539     Mode = MULS16;
   33540   // When ranges are from 0 ~ 65535, use MULU16 mode.
   33541   else if (AllPositive && MinSignBits >= 16)
   33542     Mode = MULU16;
   33543   else
   33544     return false;
   33545   return true;
   33546 }
   33547 
    33548 /// When the operands of a vector mul are extended from smaller-sized values,
    33549 /// like i8 and i16, the type of the mul may be shrunk to generate more
   33550 /// efficient code. Two typical patterns are handled:
   33551 /// Pattern1:
   33552 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
   33553 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
    33554 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
   33555 ///     %5 = mul <N x i32> %2, %4
   33556 ///
   33557 /// Pattern2:
   33558 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
   33559 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
   33560 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
   33561 ///     %5 = mul <N x i32> %2, %4
   33562 ///
   33563 /// There are four mul shrinking modes:
   33564 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
    33565 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
   33566 /// generate pmullw+sext32 for it (MULS8 mode).
   33567 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
   33568 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
   33569 /// generate pmullw+zext32 for it (MULU8 mode).
   33570 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
   33571 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
   33572 /// generate pmullw+pmulhw for it (MULS16 mode).
   33573 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
   33574 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
   33575 /// generate pmullw+pmulhuw for it (MULU16 mode).
   33576 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
   33577                                const X86Subtarget &Subtarget) {
   33578   // Check for legality
    33579   // pmullw/pmulhw on 128-bit vectors require SSE2.
   33580   if (!Subtarget.hasSSE2())
   33581     return SDValue();
   33582 
   33583   // Check for profitability
    33584   // pmulld has been available since SSE4.1. It is better to use pmulld
   33585   // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
   33586   // the expansion.
   33587   bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
   33588   if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
   33589     return SDValue();
   33590 
   33591   ShrinkMode Mode;
   33592   if (!canReduceVMulWidth(N, DAG, Mode))
   33593     return SDValue();
   33594 
   33595   SDLoc DL(N);
   33596   SDValue N0 = N->getOperand(0);
   33597   SDValue N1 = N->getOperand(1);
   33598   EVT VT = N->getOperand(0).getValueType();
   33599   unsigned NumElts = VT.getVectorNumElements();
   33600   if ((NumElts % 2) != 0)
   33601     return SDValue();
   33602 
   33603   unsigned RegSize = 128;
   33604   MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
   33605   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
   33606 
   33607   // Shrink the operands of mul.
   33608   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
   33609   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
   33610 
   33611   if (NumElts >= OpsVT.getVectorNumElements()) {
   33612     // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
   33613     // lower part is needed.
   33614     SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
   33615     if (Mode == MULU8 || Mode == MULS8) {
   33616       return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
   33617                          DL, VT, MulLo);
   33618     } else {
   33619       MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
   33620       // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
   33621       // the higher part is also needed.
   33622       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
   33623                                   ReducedVT, NewN0, NewN1);
   33624 
   33625       // Repack the lower part and higher part result of mul into a wider
   33626       // result.
   33627       // Generate shuffle functioning as punpcklwd.
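                // (For NumElts == 8 the mask is {0, 8, 1, 9, 2, 10, 3, 11}.)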
   33628       SmallVector<int, 16> ShuffleMask(NumElts);
   33629       for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
   33630         ShuffleMask[2 * i] = i;
   33631         ShuffleMask[2 * i + 1] = i + NumElts;
   33632       }
   33633       SDValue ResLo =
   33634           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
   33635       ResLo = DAG.getBitcast(ResVT, ResLo);
   33636       // Generate shuffle functioning as punpckhwd.
   33637       for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
   33638         ShuffleMask[2 * i] = i + NumElts / 2;
   33639         ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
   33640       }
   33641       SDValue ResHi =
   33642           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
   33643       ResHi = DAG.getBitcast(ResVT, ResHi);
   33644       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
   33645     }
   33646   } else {
   33647     // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
   33648     // to legalize the mul explicitly because implicit legalization for type
   33649     // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
   33650     // instructions which will not exist when we explicitly legalize it by
   33651     // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
   33652     // <4 x i16> undef).
   33653     //
   33654     // Legalize the operands of mul.
   33655     // FIXME: We may be able to handle non-concatenated vectors by insertion.
   33656     unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
   33657     if ((RegSize % ReducedSizeInBits) != 0)
   33658       return SDValue();
   33659 
   33660     SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
   33661                                  DAG.getUNDEF(ReducedVT));
   33662     Ops[0] = NewN0;
   33663     NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
   33664     Ops[0] = NewN1;
   33665     NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
   33666 
   33667     if (Mode == MULU8 || Mode == MULS8) {
   33668       // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
   33669       // part is needed.
   33670       SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
   33671 
   33672       // convert the type of mul result to VT.
   33673       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
   33674       SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
   33675                                               : ISD::SIGN_EXTEND_VECTOR_INREG,
   33676                                 DL, ResVT, Mul);
   33677       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
   33678                          DAG.getIntPtrConstant(0, DL));
   33679     } else {
   33680       // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
   33681       // MULU16/MULS16, both parts are needed.
   33682       SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
   33683       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
   33684                                   OpsVT, NewN0, NewN1);
   33685 
   33686       // Repack the lower part and higher part result of mul into a wider
   33687       // result. Make sure the type of mul result is VT.
   33688       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
   33689       SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
   33690       Res = DAG.getBitcast(ResVT, Res);
   33691       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
   33692                          DAG.getIntPtrConstant(0, DL));
   33693     }
   33694   }
   33695 }
   33696 
   33697 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
   33698                                  EVT VT, const SDLoc &DL) {
   33699 
   33700   auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
   33701     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
   33702                                  DAG.getConstant(Mult, DL, VT));
   33703     Result = DAG.getNode(ISD::SHL, DL, VT, Result,
   33704                          DAG.getConstant(Shift, DL, MVT::i8));
   33705     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
   33706                          N->getOperand(0));
   33707     return Result;
   33708   };
   33709 
   33710   auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
   33711     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
   33712                                  DAG.getConstant(Mul1, DL, VT));
   33713     Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
   33714                          DAG.getConstant(Mul2, DL, VT));
   33715     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
   33716                          N->getOperand(0));
   33717     return Result;
   33718   };
   33719 
   33720   switch (MulAmt) {
   33721   default:
   33722     break;
   33723   case 11:
   33724     // mul x, 11 => add ((shl (mul x, 5), 1), x)
   33725     return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
   33726   case 21:
   33727     // mul x, 21 => add ((shl (mul x, 5), 2), x)
   33728     return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
   33729   case 41:
   33730     // mul x, 41 => add ((shl (mul x, 5), 3), x)
   33731     return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
   33732   case 22:
   33733     // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
   33734     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
   33735                        combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
   33736   case 19:
   33737     // mul x, 19 => add ((shl (mul x, 9), 1), x)
   33738     return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
   33739   case 37:
   33740     // mul x, 37 => add ((shl (mul x, 9), 2), x)
   33741     return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
   33742   case 73:
   33743     // mul x, 73 => add ((shl (mul x, 9), 3), x)
   33744     return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
   33745   case 13:
   33746     // mul x, 13 => add ((shl (mul x, 3), 2), x)
   33747     return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
   33748   case 23:
   33749     // mul x, 23 => sub ((shl (mul x, 3), 3), x)
   33750     return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
   33751   case 26:
   33752     // mul x, 26 => add ((mul (mul x, 5), 5), x)
   33753     return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
   33754   case 28:
   33755     // mul x, 28 => add ((mul (mul x, 9), 3), x)
   33756     return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
   33757   case 29:
   33758     // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
   33759     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
   33760                        combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
   33761   }
   33762 
    33763   // Another trick. If this is a power of 2 plus 2, 4, or 8, we can use a shift
    33764   // followed by a single LEA.
    33765   // First check that this is a sum of two powers of 2 because that's easy. Then
    33766   // count the trailing zeros to find the smaller power of 2 (the LEA scale).
   33767   // TODO: We can do this even without LEA at a cost of two shifts and an add.
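            // For example, 20 == 16 + 4, so (mul x, 20) becomes
            // (add (shl x, 4), (shl x, 2)), and the smaller shift can fold into
            // the LEA's scaled index.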
   33768   if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
   33769     unsigned ScaleShift = countTrailingZeros(MulAmt);
   33770     if (ScaleShift >= 1 && ScaleShift < 4) {
   33771       unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
   33772       SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   33773                                    DAG.getConstant(ShiftAmt, DL, MVT::i8));
   33774       SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   33775                                    DAG.getConstant(ScaleShift, DL, MVT::i8));
   33776       return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
   33777     }
   33778   }
   33779 
   33780   return SDValue();
   33781 }
   33782 
   33783 // If the upper 17 bits of each element are zero then we can use PMADDWD,
    33784 // which is always at least as quick as PMULLD, except on KNL.
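          // With those bits clear, each 32-bit lane viewed as two i16 halves is
          // (a, 0) with 0 <= a < 2^15, so PMADDWD computes a*b + 0*0 == a*b per
          // lane with no signed overflow.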
   33785 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
   33786                                    const X86Subtarget &Subtarget) {
   33787   if (!Subtarget.hasSSE2())
   33788     return SDValue();
   33789 
   33790   if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
   33791     return SDValue();
   33792 
   33793   EVT VT = N->getValueType(0);
   33794 
   33795   // Only support vXi32 vectors.
   33796   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
   33797     return SDValue();
   33798 
   33799   // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
   33800   MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
   33801   if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
   33802     return SDValue();
   33803 
   33804   SDValue N0 = N->getOperand(0);
   33805   SDValue N1 = N->getOperand(1);
   33806   APInt Mask17 = APInt::getHighBitsSet(32, 17);
   33807   if (!DAG.MaskedValueIsZero(N1, Mask17) ||
   33808       !DAG.MaskedValueIsZero(N0, Mask17))
   33809     return SDValue();
   33810 
   33811   // Use SplitOpsAndApply to handle AVX splitting.
   33812   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   33813                            ArrayRef<SDValue> Ops) {
   33814     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
   33815     return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
   33816   };
   33817   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
   33818                           { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
   33819                           PMADDWDBuilder);
   33820 }
   33821 
   33822 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
   33823                                   const X86Subtarget &Subtarget) {
   33824   if (!Subtarget.hasSSE2())
   33825     return SDValue();
   33826 
   33827   EVT VT = N->getValueType(0);
   33828 
   33829   // Only support vXi64 vectors.
   33830   if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
   33831       !DAG.getTargetLoweringInfo().isTypeLegal(VT))
   33832     return SDValue();
   33833 
   33834   SDValue N0 = N->getOperand(0);
   33835   SDValue N1 = N->getOperand(1);
   33836 
    33837   // PMULDQ returns the 64-bit result of the signed multiplication of the lower
    33838   // 32 bits. We can lower with this if the sign bits stretch that far.
   33839   if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
   33840       DAG.ComputeNumSignBits(N1) > 32) {
   33841     auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   33842                             ArrayRef<SDValue> Ops) {
   33843       return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
   33844     };
   33845     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
   33846                             PMULDQBuilder, /*CheckBWI*/false);
   33847   }
   33848 
   33849   // If the upper bits are zero we can use a single pmuludq.
   33850   APInt Mask = APInt::getHighBitsSet(64, 32);
   33851   if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
   33852     auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   33853                              ArrayRef<SDValue> Ops) {
   33854       return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
   33855     };
   33856     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
   33857                             PMULUDQBuilder, /*CheckBWI*/false);
   33858   }
   33859 
   33860   return SDValue();
   33861 }
   33862 
   33863 /// Optimize a single multiply with constant into two operations in order to
   33864 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
   33865 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
   33866                           TargetLowering::DAGCombinerInfo &DCI,
   33867                           const X86Subtarget &Subtarget) {
   33868   EVT VT = N->getValueType(0);
   33869 
   33870   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
   33871     return V;
   33872 
   33873   if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
   33874     return V;
   33875 
   33876   if (DCI.isBeforeLegalize() && VT.isVector())
   33877     return reduceVMULWidth(N, DAG, Subtarget);
   33878 
   33879   if (!MulConstantOptimization)
   33880     return SDValue();
   33881   // An imul is usually smaller than the alternative sequence.
   33882   if (DAG.getMachineFunction().getFunction().optForMinSize())
   33883     return SDValue();
   33884 
   33885   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
   33886     return SDValue();
   33887 
   33888   if (VT != MVT::i64 && VT != MVT::i32)
   33889     return SDValue();
   33890 
   33891   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   33892   if (!C)
   33893     return SDValue();
   33894   if (isPowerOf2_64(C->getZExtValue()))
   33895     return SDValue();
   33896 
   33897   int64_t SignMulAmt = C->getSExtValue();
   33898   assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
   33899   uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
   33900 
   33901   SDLoc DL(N);
   33902   if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
   33903     SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
   33904                                  DAG.getConstant(AbsMulAmt, DL, VT));
   33905     if (SignMulAmt < 0)
   33906       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
   33907                            NewMul);
   33908 
   33909     return NewMul;
   33910   }
   33911 
   33912   uint64_t MulAmt1 = 0;
   33913   uint64_t MulAmt2 = 0;
   33914   if ((AbsMulAmt % 9) == 0) {
   33915     MulAmt1 = 9;
   33916     MulAmt2 = AbsMulAmt / 9;
   33917   } else if ((AbsMulAmt % 5) == 0) {
   33918     MulAmt1 = 5;
   33919     MulAmt2 = AbsMulAmt / 5;
   33920   } else if ((AbsMulAmt % 3) == 0) {
   33921     MulAmt1 = 3;
   33922     MulAmt2 = AbsMulAmt / 3;
   33923   }
   33924 
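            // For example, 45 factors as 9 * 5 and lowers to two LEAs; 40 factors
            // as 5 * 8 and lowers to a shift plus an LEA (ordered depending on the
            // use).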
   33925   SDValue NewMul;
   33926   // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
   33927   if (MulAmt2 &&
   33928       (isPowerOf2_64(MulAmt2) ||
   33929        (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
   33930 
   33931     if (isPowerOf2_64(MulAmt2) &&
   33932         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
    33933       // If the second multiplier is pow2, issue it first. We want the multiply by
   33934       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
   33935       // is an add.
   33936       std::swap(MulAmt1, MulAmt2);
   33937 
   33938     if (isPowerOf2_64(MulAmt1))
   33939       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   33940                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
   33941     else
   33942       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
   33943                            DAG.getConstant(MulAmt1, DL, VT));
   33944 
   33945     if (isPowerOf2_64(MulAmt2))
   33946       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
   33947                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
   33948     else
   33949       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
   33950                            DAG.getConstant(MulAmt2, DL, VT));
   33951 
   33952     // Negate the result.
   33953     if (SignMulAmt < 0)
   33954       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
   33955                            NewMul);
   33956   } else if (!Subtarget.slowLEA())
   33957     NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
   33958 
   33959   if (!NewMul) {
   33960     assert(C->getZExtValue() != 0 &&
   33961            C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
   33962            "Both cases that could cause potential overflows should have "
   33963            "already been handled.");
   33964     if (isPowerOf2_64(AbsMulAmt - 1)) {
   33965       // (mul x, 2^N + 1) => (add (shl x, N), x)
   33966       NewMul = DAG.getNode(
   33967           ISD::ADD, DL, VT, N->getOperand(0),
   33968           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   33969                       DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
   33970                                       MVT::i8)));
   33971       // To negate, subtract the number from zero
   33972       if (SignMulAmt < 0)
   33973         NewMul = DAG.getNode(ISD::SUB, DL, VT,
   33974                              DAG.getConstant(0, DL, VT), NewMul);
   33975     } else if (isPowerOf2_64(AbsMulAmt + 1)) {
   33976       // (mul x, 2^N - 1) => (sub (shl x, N), x)
   33977       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   33978                            DAG.getConstant(Log2_64(AbsMulAmt + 1),
   33979                                            DL, MVT::i8));
   33980       // To negate, reverse the operands of the subtract.
   33981       if (SignMulAmt < 0)
   33982         NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
   33983       else
   33984         NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
   33985     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
   33986       // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
   33987       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   33988                            DAG.getConstant(Log2_64(AbsMulAmt - 2),
   33989                                            DL, MVT::i8));
   33990       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
   33991       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
   33992     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
   33993       // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
   33994       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
   33995                            DAG.getConstant(Log2_64(AbsMulAmt + 2),
   33996                                            DL, MVT::i8));
   33997       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
   33998       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
   33999     }
   34000   }
   34001 
   34002   return NewMul;
   34003 }
   34004 
   34005 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
   34006   SDValue N0 = N->getOperand(0);
   34007   SDValue N1 = N->getOperand(1);
   34008   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   34009   EVT VT = N0.getValueType();
   34010 
   34011   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
    34012   // since the result of setcc_c is all zeros or all ones.
   34013   if (VT.isInteger() && !VT.isVector() &&
   34014       N1C && N0.getOpcode() == ISD::AND &&
   34015       N0.getOperand(1).getOpcode() == ISD::Constant) {
   34016     SDValue N00 = N0.getOperand(0);
   34017     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
   34018     Mask <<= N1C->getAPIntValue();
   34019     bool MaskOK = false;
   34020     // We can handle cases concerning bit-widening nodes containing setcc_c if
    34021     // we carefully interrogate the mask to make sure the transform is
    34022     // semantics preserving.
   34023     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
   34024     // of the underlying setcc_c operation if the setcc_c was zero extended.
   34025     // Consider the following example:
   34026     //   zext(setcc_c)                 -> i32 0x0000FFFF
   34027     //   c1                            -> i32 0x0000FFFF
   34028     //   c2                            -> i32 0x00000001
   34029     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
   34030     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
   34031     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   34032       MaskOK = true;
   34033     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
   34034                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   34035       MaskOK = true;
   34036     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
   34037                 N00.getOpcode() == ISD::ANY_EXTEND) &&
   34038                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
   34039       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
   34040     }
   34041     if (MaskOK && Mask != 0) {
   34042       SDLoc DL(N);
   34043       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
   34044     }
   34045   }
   34046 
    34047   // Hardware support for vector shifts is sparse, which makes us scalarize the
    34048   // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
    34049   // SHL.
   34050   // (shl V, 1) -> add V,V
   34051   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
   34052     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
   34053       assert(N0.getValueType().isVector() && "Invalid vector shift type");
   34054       // We shift all of the values by one. In many cases we do not have
   34055       // hardware support for this operation. This is better expressed as an ADD
   34056       // of two values.
   34057       if (N1SplatC->getAPIntValue() == 1)
   34058         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
   34059     }
   34060 
   34061   return SDValue();
   34062 }
   34063 
   34064 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
   34065   SDValue N0 = N->getOperand(0);
   34066   SDValue N1 = N->getOperand(1);
   34067   EVT VT = N0.getValueType();
   34068   unsigned Size = VT.getSizeInBits();
   34069 
   34070   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
   34071   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
    34072   // into (ashr, (sext (a), SarConst - [56,48,32,24,16]))
   34073   // depending on sign of (SarConst - [56,48,32,24,16])
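            // For example, on i64, (ashr (shl X, 56), 60) becomes
            // (ashr (sign_extend_inreg X, i8), 4).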
   34074 
   34075   // sexts in X86 are MOVs. The MOVs have the same code size
    34076   // sexts on X86 are MOVs. The MOVs have the same code size
    34077   // as the above SHIFTs (only a SHIFT by 1 has lower code size).
    34078   // However, the MOVs have two advantages over a SHIFT:
   34079   // 2. MOVs accept memory operands
   34080 
   34081   if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
   34082       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
   34083       N0.getOperand(1).getOpcode() != ISD::Constant)
   34084     return SDValue();
   34085 
   34086   SDValue N00 = N0.getOperand(0);
   34087   SDValue N01 = N0.getOperand(1);
   34088   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
   34089   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
   34090   EVT CVT = N1.getValueType();
   34091 
   34092   if (SarConst.isNegative())
   34093     return SDValue();
   34094 
   34095   for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
   34096     unsigned ShiftSize = SVT.getSizeInBits();
    34097     // Skip types without a corresponding sext/zext and ShlConst values
    34098     // that are not one of [56,48,32,24,16].
   34099     if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
   34100       continue;
   34101     SDLoc DL(N);
   34102     SDValue NN =
   34103         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
   34104     SarConst = SarConst - (Size - ShiftSize);
   34105     if (SarConst == 0)
   34106       return NN;
   34107     else if (SarConst.isNegative())
   34108       return DAG.getNode(ISD::SHL, DL, VT, NN,
   34109                          DAG.getConstant(-SarConst, DL, CVT));
   34110     else
   34111       return DAG.getNode(ISD::SRA, DL, VT, NN,
   34112                          DAG.getConstant(SarConst, DL, CVT));
   34113   }
   34114   return SDValue();
   34115 }
   34116 
   34117 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
   34118                                         TargetLowering::DAGCombinerInfo &DCI) {
   34119   SDValue N0 = N->getOperand(0);
   34120   SDValue N1 = N->getOperand(1);
   34121   EVT VT = N0.getValueType();
   34122 
   34123   // Only do this on the last DAG combine as it can interfere with other
   34124   // combines.
   34125   if (!DCI.isAfterLegalizeDAG())
   34126     return SDValue();
   34127 
   34128   // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
   34129   // TODO: This is a generic DAG combine that became an x86-only combine to
   34130   // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
   34131   // and-not ('andn').
   34132   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
   34133     return SDValue();
   34134 
   34135   auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
   34136   auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   34137   if (!ShiftC || !AndC)
   34138     return SDValue();
   34139 
   34140   // If we can shrink the constant mask below 8-bits or 32-bits, then this
   34141   // transform should reduce code size. It may also enable secondary transforms
   34142   // from improved known-bits analysis or instruction selection.
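            // For example, (srl (and X, 0x1F0), 4) becomes (and (srl X, 4), 0x1F),
            // and the new mask fits in an 8-bit immediate.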
   34143   APInt MaskVal = AndC->getAPIntValue();
   34144 
   34145   // If this can be matched by a zero extend, don't optimize.
   34146   if (MaskVal.isMask()) {
   34147     unsigned TO = MaskVal.countTrailingOnes();
   34148     if (TO >= 8 && isPowerOf2_32(TO))
   34149       return SDValue();
   34150   }
   34151 
   34152   APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
   34153   unsigned OldMaskSize = MaskVal.getMinSignedBits();
   34154   unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
   34155   if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
   34156       (OldMaskSize > 32 && NewMaskSize <= 32)) {
   34157     // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
   34158     SDLoc DL(N);
   34159     SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
   34160     SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
   34161     return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
   34162   }
   34163   return SDValue();
   34164 }
   34165 
   34166 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
   34167                             TargetLowering::DAGCombinerInfo &DCI,
   34168                             const X86Subtarget &Subtarget) {
   34169   if (N->getOpcode() == ISD::SHL)
   34170     if (SDValue V = combineShiftLeft(N, DAG))
   34171       return V;
   34172 
   34173   if (N->getOpcode() == ISD::SRA)
   34174     if (SDValue V = combineShiftRightArithmetic(N, DAG))
   34175       return V;
   34176 
   34177   if (N->getOpcode() == ISD::SRL)
   34178     if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
   34179       return V;
   34180 
   34181   return SDValue();
   34182 }
   34183 
   34184 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
   34185                                  TargetLowering::DAGCombinerInfo &DCI,
   34186                                  const X86Subtarget &Subtarget) {
   34187   unsigned Opcode = N->getOpcode();
   34188   assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
    34189          "Unexpected pack opcode");
   34190 
   34191   EVT VT = N->getValueType(0);
   34192   SDValue N0 = N->getOperand(0);
   34193   SDValue N1 = N->getOperand(1);
   34194   unsigned DstBitsPerElt = VT.getScalarSizeInBits();
   34195   unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
   34196   assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
   34197          N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
   34198          "Unexpected PACKSS/PACKUS input type");
   34199 
   34200   // Constant Folding.
   34201   APInt UndefElts0, UndefElts1;
   34202   SmallVector<APInt, 32> EltBits0, EltBits1;
   34203   if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
   34204       (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
   34205       getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
   34206       getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
   34207     unsigned NumLanes = VT.getSizeInBits() / 128;
   34208     unsigned NumDstElts = VT.getVectorNumElements();
   34209     unsigned NumSrcElts = NumDstElts / 2;
   34210     unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
   34211     unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
   34212     bool IsSigned = (X86ISD::PACKSS == Opcode);
   34213 
   34214     APInt Undefs(NumDstElts, 0);
   34215     SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
   34216     for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
   34217       for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
   34218         unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
   34219         auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
   34220         auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
   34221 
   34222         if (UndefElts[SrcIdx]) {
   34223           Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
   34224           continue;
   34225         }
   34226 
   34227         APInt &Val = EltBits[SrcIdx];
   34228         if (IsSigned) {
   34229           // PACKSS: Truncate signed value with signed saturation.
   34230           // Source values less than dst minint are saturated to minint.
   34231           // Source values greater than dst maxint are saturated to maxint.
   34232           if (Val.isSignedIntN(DstBitsPerElt))
   34233             Val = Val.trunc(DstBitsPerElt);
   34234           else if (Val.isNegative())
   34235             Val = APInt::getSignedMinValue(DstBitsPerElt);
   34236           else
   34237             Val = APInt::getSignedMaxValue(DstBitsPerElt);
   34238         } else {
   34239           // PACKUS: Truncate signed value with unsigned saturation.
   34240           // Source values less than zero are saturated to zero.
   34241           // Source values greater than dst maxuint are saturated to maxuint.
   34242           if (Val.isIntN(DstBitsPerElt))
   34243             Val = Val.trunc(DstBitsPerElt);
   34244           else if (Val.isNegative())
   34245             Val = APInt::getNullValue(DstBitsPerElt);
   34246           else
   34247             Val = APInt::getAllOnesValue(DstBitsPerElt);
   34248         }
   34249         Bits[Lane * NumDstEltsPerLane + Elt] = Val;
   34250       }
   34251     }
   34252 
   34253     return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
   34254   }
   34255 
   34256   // Attempt to combine as shuffle.
   34257   SDValue Op(N, 0);
   34258   if (SDValue Res =
   34259           combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
   34260                                         /*HasVarMask*/ false, DAG, Subtarget))
   34261     return Res;
   34262 
   34263   return SDValue();
   34264 }
   34265 
   34266 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
   34267                                      TargetLowering::DAGCombinerInfo &DCI,
   34268                                      const X86Subtarget &Subtarget) {
   34269   unsigned Opcode = N->getOpcode();
   34270   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
   34271           X86ISD::VSRLI == Opcode) &&
   34272          "Unexpected shift opcode");
   34273   bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
   34274   EVT VT = N->getValueType(0);
   34275   SDValue N0 = N->getOperand(0);
   34276   SDValue N1 = N->getOperand(1);
   34277   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
   34278   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
   34279          "Unexpected value type");
   34280 
   34281   // Out of range logical bit shifts are guaranteed to be zero.
   34282   // Out of range arithmetic bit shifts splat the sign bit.
   34283   APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
   34284   if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
   34285     if (LogicalShift)
   34286       return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
   34287     else
   34288       ShiftVal = NumBitsPerElt - 1;
   34289   }
   34290 
   34291   // Shift N0 by zero -> N0.
   34292   if (!ShiftVal)
   34293     return N0;
   34294 
   34295   // Shift zero -> zero.
   34296   if (ISD::isBuildVectorAllZeros(N0.getNode()))
   34297     return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
   34298 
   34299   // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
   34300   // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
   34301   // TODO - support other sra opcodes as needed.
   34302   if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
   34303       N0.getOpcode() == X86ISD::VSRAI)
   34304     return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
   34305 
   34306   // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
   34307   if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
   34308       N1 == N0.getOperand(1)) {
   34309     SDValue N00 = N0.getOperand(0);
   34310     unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
   34311     if (ShiftVal.ult(NumSignBits))
   34312       return N00;
   34313   }
   34314 
   34315   // We can decode 'whole byte' logical bit shifts as shuffles.
   34316   if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
   34317     SDValue Op(N, 0);
   34318     if (SDValue Res = combineX86ShufflesRecursively(
   34319             {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
   34320             /*HasVarMask*/ false, DAG, Subtarget))
   34321       return Res;
   34322   }
   34323 
   34324   // Constant Folding.
   34325   APInt UndefElts;
   34326   SmallVector<APInt, 32> EltBits;
   34327   if (N->isOnlyUserOf(N0.getNode()) &&
   34328       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
   34329     assert(EltBits.size() == VT.getVectorNumElements() &&
   34330            "Unexpected shift value type");
   34331     unsigned ShiftImm = ShiftVal.getZExtValue();
   34332     for (APInt &Elt : EltBits) {
   34333       if (X86ISD::VSHLI == Opcode)
   34334         Elt <<= ShiftImm;
   34335       else if (X86ISD::VSRAI == Opcode)
   34336         Elt.ashrInPlace(ShiftImm);
   34337       else
   34338         Elt.lshrInPlace(ShiftImm);
   34339     }
   34340     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
   34341   }
   34342 
   34343   return SDValue();
   34344 }
   34345 
   34346 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
   34347                                    TargetLowering::DAGCombinerInfo &DCI,
   34348                                    const X86Subtarget &Subtarget) {
   34349   assert(
   34350       ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
   34351        (N->getOpcode() == X86ISD::PINSRW &&
   34352         N->getValueType(0) == MVT::v8i16)) &&
   34353       "Unexpected vector insertion");
   34354 
   34355   // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
   34356   SDValue Op(N, 0);
   34357   if (SDValue Res =
   34358           combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
   34359                                         /*HasVarMask*/ false, DAG, Subtarget))
   34360     return Res;
   34361 
   34362   return SDValue();
   34363 }
   34364 
   34365 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
   34366 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
   34367 /// OR -> CMPNEQSS.
   34368 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
   34369                                    TargetLowering::DAGCombinerInfo &DCI,
   34370                                    const X86Subtarget &Subtarget) {
   34371   unsigned opcode;
   34372 
   34373   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
   34374   // we're requiring SSE2 for both.
   34375   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
   34376     SDValue N0 = N->getOperand(0);
   34377     SDValue N1 = N->getOperand(1);
   34378     SDValue CMP0 = N0->getOperand(1);
   34379     SDValue CMP1 = N1->getOperand(1);
   34380     SDLoc DL(N);
   34381 
   34382     // The SETCCs should both refer to the same CMP.
   34383     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
   34384       return SDValue();
   34385 
   34386     SDValue CMP00 = CMP0->getOperand(0);
   34387     SDValue CMP01 = CMP0->getOperand(1);
   34388     EVT     VT    = CMP00.getValueType();
   34389 
   34390     if (VT == MVT::f32 || VT == MVT::f64) {
   34391       bool ExpectingFlags = false;
   34392       // Check for any users that want flags:
   34393       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
   34394            !ExpectingFlags && UI != UE; ++UI)
   34395         switch (UI->getOpcode()) {
   34396         default:
   34397         case ISD::BR_CC:
   34398         case ISD::BRCOND:
   34399         case ISD::SELECT:
   34400           ExpectingFlags = true;
   34401           break;
   34402         case ISD::CopyToReg:
   34403         case ISD::SIGN_EXTEND:
   34404         case ISD::ZERO_EXTEND:
   34405         case ISD::ANY_EXTEND:
   34406           break;
   34407         }
   34408 
   34409       if (!ExpectingFlags) {
   34410         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
   34411         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
   34412 
   34413         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
   34414           X86::CondCode tmp = cc0;
   34415           cc0 = cc1;
   34416           cc1 = tmp;
   34417         }
   34418 
   34419         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
   34420             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
   34421           // FIXME: need symbolic constants for these magic numbers.
   34422           // See X86ATTInstPrinter.cpp:printSSECC().
   34423           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
   34424           if (Subtarget.hasAVX512()) {
   34425             SDValue FSetCC =
   34426                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
   34427                             DAG.getConstant(x86cc, DL, MVT::i8));
   34428             // Need to fill with zeros to ensure the bitcast will produce zeroes
   34429             // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
   34430             SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
   34431                                       DAG.getConstant(0, DL, MVT::v16i1),
   34432                                       FSetCC, DAG.getIntPtrConstant(0, DL));
   34433             return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
   34434                                       N->getSimpleValueType(0));
   34435           }
   34436           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
   34437                                               CMP00.getValueType(), CMP00, CMP01,
   34438                                               DAG.getConstant(x86cc, DL,
   34439                                                               MVT::i8));
   34440 
   34441           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
   34442           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
   34443 
   34444           if (is64BitFP && !Subtarget.is64Bit()) {
   34445             // On a 32-bit target, we cannot bitcast the 64-bit float to a
   34446             // 64-bit integer, since that's not a legal type. Since
    34447             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
   34448             // bits, but can do this little dance to extract the lowest 32 bits
   34449             // and work with those going forward.
   34450             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
   34451                                            OnesOrZeroesF);
   34452             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
   34453             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
   34454                                         Vector32, DAG.getIntPtrConstant(0, DL));
   34455             IntVT = MVT::i32;
   34456           }
   34457 
   34458           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
   34459           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
   34460                                       DAG.getConstant(1, DL, IntVT));
   34461           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
   34462                                               ANDed);
   34463           return OneBitOfTruth;
   34464         }
   34465       }
   34466     }
   34467   }
   34468   return SDValue();
   34469 }
   34470 
   34471 // Try to match (and (xor X, -1), Y) logic pattern for (andnp X, Y) combines.
   34472 static bool matchANDXORWithAllOnesAsANDNP(SDNode *N, SDValue &X, SDValue &Y) {
   34473   if (N->getOpcode() != ISD::AND)
   34474     return false;
   34475 
   34476   SDValue N0 = N->getOperand(0);
   34477   SDValue N1 = N->getOperand(1);
   34478   if (N0.getOpcode() == ISD::XOR &&
   34479       ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
   34480     X = N0.getOperand(0);
   34481     Y = N1;
   34482     return true;
   34483   }
   34484   if (N1.getOpcode() == ISD::XOR &&
   34485       ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
   34486     X = N1.getOperand(0);
   34487     Y = N0;
   34488     return true;
   34489   }
   34490 
   34491   return false;
   34492 }
   34493 
   34494 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
   34495 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
   34496   assert(N->getOpcode() == ISD::AND);
   34497 
   34498   EVT VT = N->getValueType(0);
   34499   if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
   34500     return SDValue();
   34501 
   34502   SDValue X, Y;
   34503   if (matchANDXORWithAllOnesAsANDNP(N, X, Y))
   34504     return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
   34505 
   34506   return SDValue();
   34507 }
   34508 
    34509 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
    34510 // register. In most cases we actually compare or select YMM-sized registers,
    34511 // and mixing the two types creates horrible code. This method optimizes
   34512 // some of the transition sequences.
   34513 // Even with AVX-512 this is still useful for removing casts around logical
   34514 // operations on vXi1 mask types.
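//
// Schematic example (hand-written, with placeholder inputs A and B of type
// v8i32 and a narrow type of v8i16):
//   sign_extend (and (truncate A), (truncate B)) : v8i32
// is rewritten as:
//   sign_extend_inreg (and A, B), v8i16
// so the logic op runs directly on the wide registers.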
   34515 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
   34516                                    const X86Subtarget &Subtarget) {
   34517   EVT VT = N->getValueType(0);
   34518   assert(VT.isVector() && "Expected vector type");
   34519 
   34520   assert((N->getOpcode() == ISD::ANY_EXTEND ||
   34521           N->getOpcode() == ISD::ZERO_EXTEND ||
   34522           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
   34523 
   34524   SDValue Narrow = N->getOperand(0);
   34525   EVT NarrowVT = Narrow.getValueType();
   34526 
   34527   if (Narrow->getOpcode() != ISD::XOR &&
   34528       Narrow->getOpcode() != ISD::AND &&
   34529       Narrow->getOpcode() != ISD::OR)
   34530     return SDValue();
   34531 
   34532   SDValue N0  = Narrow->getOperand(0);
   34533   SDValue N1  = Narrow->getOperand(1);
   34534   SDLoc DL(Narrow);
   34535 
    34536   // The left side has to be a trunc.
   34537   if (N0.getOpcode() != ISD::TRUNCATE)
   34538     return SDValue();
   34539 
    34540   // The type the LHS is truncated from must match the wide result type VT.
   34541   if (N0->getOperand(0).getValueType() != VT)
   34542     return SDValue();
   34543 
   34544   // The right side has to be a 'trunc' or a constant vector.
   34545   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
   34546                   N1.getOperand(0).getValueType() == VT;
   34547   if (!RHSTrunc &&
   34548       !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
   34549     return SDValue();
   34550 
   34551   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   34552 
   34553   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
   34554     return SDValue();
   34555 
   34556   // Set N0 and N1 to hold the inputs to the new wide operation.
   34557   N0 = N0->getOperand(0);
   34558   if (RHSTrunc)
   34559     N1 = N1->getOperand(0);
   34560   else
   34561     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
   34562 
   34563   // Generate the wide operation.
   34564   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
   34565   unsigned Opcode = N->getOpcode();
   34566   switch (Opcode) {
   34567   default: llvm_unreachable("Unexpected opcode");
   34568   case ISD::ANY_EXTEND:
   34569     return Op;
   34570   case ISD::ZERO_EXTEND:
   34571     return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
   34572   case ISD::SIGN_EXTEND:
   34573     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
   34574                        Op, DAG.getValueType(NarrowVT));
   34575   }
   34576 }
   34577 
   34578 /// If both input operands of a logic op are being cast from floating point
   34579 /// types, try to convert this into a floating point logic node to avoid
   34580 /// unnecessary moves from SSE to integer registers.
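/// For example (an illustrative sketch with f32 placeholders A and B):
///   (and (bitcast A : i32), (bitcast B : i32))
/// becomes:
///   (bitcast (X86ISD::FAND A, B) : i32)
/// keeping the values in SSE registers.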
   34581 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
   34582                                         const X86Subtarget &Subtarget) {
   34583   unsigned FPOpcode = ISD::DELETED_NODE;
   34584   if (N->getOpcode() == ISD::AND)
   34585     FPOpcode = X86ISD::FAND;
   34586   else if (N->getOpcode() == ISD::OR)
   34587     FPOpcode = X86ISD::FOR;
   34588   else if (N->getOpcode() == ISD::XOR)
   34589     FPOpcode = X86ISD::FXOR;
   34590 
   34591   assert(FPOpcode != ISD::DELETED_NODE &&
   34592          "Unexpected input node for FP logic conversion");
   34593 
   34594   EVT VT = N->getValueType(0);
   34595   SDValue N0 = N->getOperand(0);
   34596   SDValue N1 = N->getOperand(1);
   34597   SDLoc DL(N);
   34598   if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
   34599       ((Subtarget.hasSSE1() && VT == MVT::i32) ||
   34600        (Subtarget.hasSSE2() && VT == MVT::i64))) {
   34601     SDValue N00 = N0.getOperand(0);
   34602     SDValue N10 = N1.getOperand(0);
   34603     EVT N00Type = N00.getValueType();
   34604     EVT N10Type = N10.getValueType();
   34605     if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
   34606       SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
   34607       return DAG.getBitcast(VT, FPLogic);
   34608     }
   34609   }
   34610   return SDValue();
   34611 }
   34612 
    34613 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
    34614 /// mask (mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
    34615 /// with a shift-right to eliminate loading the vector constant mask value.
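/// Sketch (assuming each lane of Op0 is known to be all-ones or zero, e.g. a
/// vXi32 compare result, and the mask is a splat of 1):
///   (and Op0, <1,1,1,1>)  ->  (X86ISD::VSRLI Op0, 31)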
   34616 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
   34617                                      const X86Subtarget &Subtarget) {
   34618   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
   34619   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
   34620   EVT VT0 = Op0.getValueType();
   34621   EVT VT1 = Op1.getValueType();
   34622 
   34623   if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
   34624     return SDValue();
   34625 
   34626   APInt SplatVal;
   34627   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
   34628       !SplatVal.isMask())
   34629     return SDValue();
   34630 
   34631   if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
   34632     return SDValue();
   34633 
   34634   unsigned EltBitWidth = VT0.getScalarSizeInBits();
   34635   if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
   34636     return SDValue();
   34637 
   34638   SDLoc DL(N);
   34639   unsigned ShiftVal = SplatVal.countTrailingOnes();
   34640   SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
   34641   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
   34642   return DAG.getBitcast(N->getValueType(0), Shift);
   34643 }
   34644 
   34645 // Get the index node from the lowered DAG of a GEP IR instruction with one
   34646 // indexing dimension.
   34647 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
   34648   if (Ld->isIndexed())
   34649     return SDValue();
   34650 
   34651   SDValue Base = Ld->getBasePtr();
   34652 
   34653   if (Base.getOpcode() != ISD::ADD)
   34654     return SDValue();
   34655 
   34656   SDValue ShiftedIndex = Base.getOperand(0);
   34657 
   34658   if (ShiftedIndex.getOpcode() != ISD::SHL)
   34659     return SDValue();
   34660 
   34661   return ShiftedIndex.getOperand(0);
   34662 
   34663 }
   34664 
   34665 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
   34666   if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
   34667     switch (VT.getSizeInBits()) {
   34668     default: return false;
    34669     case 64: return Subtarget.is64Bit();
   34670     case 32: return true;
   34671     }
   34672   }
   34673   return false;
   34674 }
   34675 
    34676 // This function recognizes cases where the X86 bzhi instruction can replace an
    34677 // 'and-load' sequence.
    34678 // When an integer value is loaded from an array of constants defined as
    34679 // follows:
    34680 //
    34681 //   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
    34682 //
    34683 // and a bitwise AND is then applied to that loaded value and another input,
    34684 // the whole sequence is equivalent to performing bzhi (zero high bits) on the
    34685 // other input, using the same index as the load.
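//
// Rough C-level sketch of the kind of source this matches (names are
// hypothetical):
//
//   static const uint32_t arr[32] = {0x0, 0x1, 0x3, 0x7, /* ... */};
//   uint32_t f(uint32_t inp, unsigned idx) { return arr[idx] & inp; }
//
// With BMI2, the load+and can be selected as a single 'bzhi' of inp by idx.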
   34686 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
   34687                                     const X86Subtarget &Subtarget) {
   34688   MVT VT = Node->getSimpleValueType(0);
   34689   SDLoc dl(Node);
   34690 
   34691   // Check if subtarget has BZHI instruction for the node's type
   34692   if (!hasBZHI(Subtarget, VT))
   34693     return SDValue();
   34694 
   34695   // Try matching the pattern for both operands.
   34696   for (unsigned i = 0; i < 2; i++) {
   34697     SDValue N = Node->getOperand(i);
   34698     LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
   34699 
    34700     // Bail out if the operand is not a load instruction.
   34701     if (!Ld)
   34702       return SDValue();
   34703 
   34704     const Value *MemOp = Ld->getMemOperand()->getValue();
   34705 
   34706     if (!MemOp)
   34707       return SDValue();
   34708 
   34709     if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
   34710       if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
   34711         if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
   34712 
   34713           Constant *Init = GV->getInitializer();
   34714           Type *Ty = Init->getType();
   34715           if (!isa<ConstantDataArray>(Init) ||
   34716               !Ty->getArrayElementType()->isIntegerTy() ||
   34717               Ty->getArrayElementType()->getScalarSizeInBits() !=
   34718                   VT.getSizeInBits() ||
   34719               Ty->getArrayNumElements() >
   34720                   Ty->getArrayElementType()->getScalarSizeInBits())
   34721             continue;
   34722 
   34723           // Check if the array's constant elements are suitable to our case.
   34724           uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
   34725           bool ConstantsMatch = true;
   34726           for (uint64_t j = 0; j < ArrayElementCount; j++) {
   34727             ConstantInt *Elem =
   34728                 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
   34729             if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
   34730               ConstantsMatch = false;
   34731               break;
   34732             }
   34733           }
   34734           if (!ConstantsMatch)
   34735             continue;
   34736 
    34737           // Do the transformation (for a 32-bit type):
    34738           //   from: (and (load arr[idx]), inp)
    34739           //   to:   (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
    34740           // which will then be selected as a single bzhi instruction.
   34741           SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
   34742           SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
   34743 
   34744           // Get the Node which indexes into the array.
   34745           SDValue Index = getIndexFromUnindexedLoad(Ld);
   34746           if (!Index)
   34747             return SDValue();
   34748           Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
   34749 
   34750           SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
   34751           Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
   34752 
   34753           SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
   34754           SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
   34755 
   34756           return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
   34757         }
   34758       }
   34759     }
   34760   }
   34761   return SDValue();
   34762 }
   34763 
   34764 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   34765                           TargetLowering::DAGCombinerInfo &DCI,
   34766                           const X86Subtarget &Subtarget) {
   34767   EVT VT = N->getValueType(0);
   34768 
    34769   // If this is SSE1-only, convert to FAND to avoid scalarization.
   34770   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
   34771     return DAG.getBitcast(
   34772         MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
   34773                                 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
   34774                                 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
   34775   }
   34776 
   34777   // Use a 32-bit and+zext if upper bits known zero.
   34778   if (VT == MVT::i64 && Subtarget.is64Bit() &&
   34779       !isa<ConstantSDNode>(N->getOperand(1))) {
   34780     APInt HiMask = APInt::getHighBitsSet(64, 32);
   34781     if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
   34782         DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
   34783       SDLoc dl(N);
   34784       SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
   34785       SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
   34786       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
   34787                          DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
   34788     }
   34789   }
   34790 
   34791   if (DCI.isBeforeLegalizeOps())
   34792     return SDValue();
   34793 
   34794   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
   34795     return R;
   34796 
   34797   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
   34798     return FPLogic;
   34799 
   34800   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
   34801     return R;
   34802 
   34803   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
   34804     return ShiftRight;
   34805 
   34806   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
   34807     return R;
   34808 
   34809   // Attempt to recursively combine a bitmask AND with shuffles.
   34810   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
   34811     SDValue Op(N, 0);
   34812     if (SDValue Res = combineX86ShufflesRecursively(
   34813             {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
   34814             /*HasVarMask*/ false, DAG, Subtarget))
   34815       return Res;
   34816   }
   34817 
   34818   // Attempt to combine a scalar bitmask AND with an extracted shuffle.
   34819   if ((VT.getScalarSizeInBits() % 8) == 0 &&
   34820       N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   34821       isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
   34822     SDValue BitMask = N->getOperand(1);
   34823     SDValue SrcVec = N->getOperand(0).getOperand(0);
   34824     EVT SrcVecVT = SrcVec.getValueType();
   34825 
   34826     // Check that the constant bitmask masks whole bytes.
   34827     APInt UndefElts;
   34828     SmallVector<APInt, 64> EltBits;
   34829     if (VT == SrcVecVT.getScalarType() &&
   34830         N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
   34831         getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
   34832         llvm::all_of(EltBits, [](APInt M) {
   34833           return M.isNullValue() || M.isAllOnesValue();
   34834         })) {
   34835       unsigned NumElts = SrcVecVT.getVectorNumElements();
   34836       unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
   34837       unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
   34838 
   34839       // Create a root shuffle mask from the byte mask and the extracted index.
   34840       SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
   34841       for (unsigned i = 0; i != Scale; ++i) {
   34842         if (UndefElts[i])
   34843           continue;
   34844         int VecIdx = Scale * Idx + i;
   34845         ShuffleMask[VecIdx] =
   34846             EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
   34847       }
   34848 
   34849       if (SDValue Shuffle = combineX86ShufflesRecursively(
   34850               {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
   34851               /*HasVarMask*/ false, DAG, Subtarget))
   34852         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
   34853                            N->getOperand(0).getOperand(1));
   34854     }
   34855   }
   34856 
   34857   return SDValue();
   34858 }
   34859 
   34860 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
   34861 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
   34862   if (N->getOpcode() != ISD::OR)
   34863     return false;
   34864 
   34865   SDValue N0 = N->getOperand(0);
   34866   SDValue N1 = N->getOperand(1);
   34867 
   34868   // Canonicalize AND to LHS.
   34869   if (N1.getOpcode() == ISD::AND)
   34870     std::swap(N0, N1);
   34871 
   34872   // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
   34873   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
   34874     return false;
   34875 
   34876   Mask = N1.getOperand(0);
   34877   X = N1.getOperand(1);
   34878 
   34879   // Check to see if the mask appeared in both the AND and ANDNP.
   34880   if (N0.getOperand(0) == Mask)
   34881     Y = N0.getOperand(1);
   34882   else if (N0.getOperand(1) == Mask)
   34883     Y = N0.getOperand(0);
   34884   else
   34885     return false;
   34886 
    34887   // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
    34888   // ANDNP combine allows other combines to happen that prevent matching.
   34889   return true;
   34890 }
   34891 
   34892 // Try to fold:
   34893 //   (or (and (m, y), (pandn m, x)))
   34894 // into:
   34895 //   (vselect m, x, y)
   34896 // As a special case, try to fold:
   34897 //   (or (and (m, (sub 0, x)), (pandn m, x)))
   34898 // into:
   34899 //   (sub (xor X, M), M)
   34900 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
   34901                                             const X86Subtarget &Subtarget) {
   34902   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
   34903 
   34904   EVT VT = N->getValueType(0);
   34905   if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
   34906         (VT.is256BitVector() && Subtarget.hasInt256())))
   34907     return SDValue();
   34908 
   34909   SDValue X, Y, Mask;
   34910   if (!matchLogicBlend(N, X, Y, Mask))
   34911     return SDValue();
   34912 
   34913   // Validate that X, Y, and Mask are bitcasts, and see through them.
   34914   Mask = peekThroughBitcasts(Mask);
   34915   X = peekThroughBitcasts(X);
   34916   Y = peekThroughBitcasts(Y);
   34917 
   34918   EVT MaskVT = Mask.getValueType();
   34919   unsigned EltBits = MaskVT.getScalarSizeInBits();
   34920 
   34921   // TODO: Attempt to handle floating point cases as well?
   34922   if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
   34923     return SDValue();
   34924 
   34925   SDLoc DL(N);
   34926 
   34927   // Try to match:
   34928   //   (or (and (M, (sub 0, X)), (pandn M, X)))
   34929   // which is a special case of vselect:
   34930   //   (vselect M, (sub 0, X), X)
   34931   // Per:
   34932   // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
   34933   // We know that, if fNegate is 0 or 1:
   34934   //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
   34935   //
    34936   // Here, we have a mask, M (all 1s or all 0s), and, similarly, we know that:
   34937   //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
   34938   //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
   34939   // This lets us transform our vselect to:
   34940   //   (add (xor X, M), (and M, 1))
   34941   // And further to:
   34942   //   (sub (xor X, M), M)
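  // Worked single-lane i32 instance (values chosen for illustration only):
  //   M = 0xFFFFFFFF, X = 5:  (5 ^ M) - M = (-6) - (-1) = -5 = -X
  //   M = 0x00000000, X = 5:  (5 ^ M) - M =   5  -   0  =  5 =  X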
   34943   if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
   34944       DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
   34945     auto IsNegV = [](SDNode *N, SDValue V) {
   34946       return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
   34947         ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
   34948     };
   34949     SDValue V;
   34950     if (IsNegV(Y.getNode(), X))
   34951       V = X;
   34952     else if (IsNegV(X.getNode(), Y))
   34953       V = Y;
   34954 
   34955     if (V) {
   34956       SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
   34957       SDValue SubOp2 = Mask;
   34958 
   34959       // If the negate was on the false side of the select, then
   34960       // the operands of the SUB need to be swapped. PR 27251.
   34961       // This is because the pattern being matched above is
    34962       // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
    34963       // but if the pattern matched was
    34964       // (vselect M, X, (sub 0, X)), that is really a negation of the pattern
   34965       // above, -(vselect M, (sub 0, X), X), and therefore the replacement
   34966       // pattern also needs to be a negation of the replacement pattern above.
   34967       // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
   34968       // sub accomplishes the negation of the replacement pattern.
   34969       if (V == Y)
   34970          std::swap(SubOp1, SubOp2);
   34971 
   34972       SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
   34973       return DAG.getBitcast(VT, Res);
   34974     }
   34975   }
   34976 
   34977   // PBLENDVB is only available on SSE 4.1.
   34978   if (!Subtarget.hasSSE41())
   34979     return SDValue();
   34980 
   34981   MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
   34982 
   34983   X = DAG.getBitcast(BlendVT, X);
   34984   Y = DAG.getBitcast(BlendVT, Y);
   34985   Mask = DAG.getBitcast(BlendVT, Mask);
   34986   Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
   34987   return DAG.getBitcast(VT, Mask);
   34988 }
   34989 
   34990 // Helper function for combineOrCmpEqZeroToCtlzSrl
   34991 // Transforms:
   34992 //   seteq(cmp x, 0)
   34993 //   into:
   34994 //   srl(ctlz x), log2(bitsize(x))
   34995 // Input pattern is checked by caller.
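// For example, for an i32 x lowered with lzcnt: lzcnt(0) == 32, so
// (lzcnt(x) >> 5) is 1 exactly when x == 0 and 0 otherwise, matching
// seteq(cmp x, 0).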
   34996 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
   34997                                           SelectionDAG &DAG) {
   34998   SDValue Cmp = Op.getOperand(1);
   34999   EVT VT = Cmp.getOperand(0).getValueType();
   35000   unsigned Log2b = Log2_32(VT.getSizeInBits());
   35001   SDLoc dl(Op);
   35002   SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
   35003   // The result of the shift is true or false, and on X86, the 32-bit
   35004   // encoding of shr and lzcnt is more desirable.
   35005   SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
   35006   SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
   35007                             DAG.getConstant(Log2b, dl, MVT::i8));
   35008   return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
   35009 }
   35010 
   35011 // Try to transform:
   35012 //   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
   35013 //   into:
    35014 //   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
   35015 // Will also attempt to match more generic cases, eg:
   35016 //   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
   35017 // Only applies if the target supports the FastLZCNT feature.
   35018 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
   35019                                            TargetLowering::DAGCombinerInfo &DCI,
   35020                                            const X86Subtarget &Subtarget) {
   35021   if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
   35022     return SDValue();
   35023 
   35024   auto isORCandidate = [](SDValue N) {
   35025     return (N->getOpcode() == ISD::OR && N->hasOneUse());
   35026   };
   35027 
    35028   // Check that the zero extend is extending to 32 bits or more. The code
    35029   // generated by srl(ctlz) for 16-bit or smaller variants of the pattern would
    35030   // require extra instructions to clear the upper bits.
   35031   if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
   35032       !isORCandidate(N->getOperand(0)))
   35033     return SDValue();
   35034 
   35035   // Check the node matches: setcc(eq, cmp 0)
   35036   auto isSetCCCandidate = [](SDValue N) {
   35037     return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
   35038            X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
   35039            N->getOperand(1).getOpcode() == X86ISD::CMP &&
   35040            isNullConstant(N->getOperand(1).getOperand(1)) &&
   35041            N->getOperand(1).getValueType().bitsGE(MVT::i32);
   35042   };
   35043 
   35044   SDNode *OR = N->getOperand(0).getNode();
   35045   SDValue LHS = OR->getOperand(0);
   35046   SDValue RHS = OR->getOperand(1);
   35047 
   35048   // Save nodes matching or(or, setcc(eq, cmp 0)).
   35049   SmallVector<SDNode *, 2> ORNodes;
   35050   while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
   35051           (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
   35052     ORNodes.push_back(OR);
   35053     OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
   35054     LHS = OR->getOperand(0);
   35055     RHS = OR->getOperand(1);
   35056   }
   35057 
   35058   // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
   35059   if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
   35060       !isORCandidate(SDValue(OR, 0)))
   35061     return SDValue();
   35062 
    35063   // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
    35064   // to
    35065   // or(srl(ctlz), srl(ctlz)).
    35066   // The DAG combiner can then fold it into:
    35067   // srl(or(ctlz, ctlz)).
   35068   EVT VT = OR->getValueType(0);
   35069   SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
   35070   SDValue Ret, NewRHS;
   35071   if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
   35072     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
   35073 
   35074   if (!Ret)
   35075     return SDValue();
   35076 
   35077   // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
   35078   while (ORNodes.size() > 0) {
   35079     OR = ORNodes.pop_back_val();
   35080     LHS = OR->getOperand(0);
   35081     RHS = OR->getOperand(1);
   35082     // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
   35083     if (RHS->getOpcode() == ISD::OR)
   35084       std::swap(LHS, RHS);
   35085     EVT VT = OR->getValueType(0);
   35086     SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
   35087     if (!NewRHS)
   35088       return SDValue();
   35089     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
   35090   }
   35091 
   35092   if (Ret)
   35093     Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
   35094 
   35095   return Ret;
   35096 }
   35097 
   35098 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
   35099                          TargetLowering::DAGCombinerInfo &DCI,
   35100                          const X86Subtarget &Subtarget) {
   35101   SDValue N0 = N->getOperand(0);
   35102   SDValue N1 = N->getOperand(1);
   35103   EVT VT = N->getValueType(0);
   35104 
    35105   // If this is SSE1-only, convert to FOR to avoid scalarization.
   35106   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
   35107     return DAG.getBitcast(MVT::v4i32,
   35108                           DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
   35109                                       DAG.getBitcast(MVT::v4f32, N0),
   35110                                       DAG.getBitcast(MVT::v4f32, N1)));
   35111   }
   35112 
   35113   if (DCI.isBeforeLegalizeOps())
   35114     return SDValue();
   35115 
   35116   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
   35117     return R;
   35118 
   35119   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
   35120     return FPLogic;
   35121 
   35122   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
   35123     return R;
   35124 
   35125   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
   35126     return SDValue();
   35127 
   35128   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
   35129   bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
   35130 
   35131   // SHLD/SHRD instructions have lower register pressure, but on some
   35132   // platforms they have higher latency than the equivalent
   35133   // series of shifts/or that would otherwise be generated.
   35134   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
   35135   // have higher latencies and we are not optimizing for size.
   35136   if (!OptForSize && Subtarget.isSHLDSlow())
   35137     return SDValue();
   35138 
   35139   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
   35140     std::swap(N0, N1);
   35141   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
   35142     return SDValue();
   35143   if (!N0.hasOneUse() || !N1.hasOneUse())
   35144     return SDValue();
   35145 
   35146   SDValue ShAmt0 = N0.getOperand(1);
   35147   if (ShAmt0.getValueType() != MVT::i8)
   35148     return SDValue();
   35149   SDValue ShAmt1 = N1.getOperand(1);
   35150   if (ShAmt1.getValueType() != MVT::i8)
   35151     return SDValue();
   35152   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
   35153     ShAmt0 = ShAmt0.getOperand(0);
   35154   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
   35155     ShAmt1 = ShAmt1.getOperand(0);
   35156 
   35157   SDLoc DL(N);
   35158   unsigned Opc = X86ISD::SHLD;
   35159   SDValue Op0 = N0.getOperand(0);
   35160   SDValue Op1 = N1.getOperand(0);
   35161   if (ShAmt0.getOpcode() == ISD::SUB ||
   35162       ShAmt0.getOpcode() == ISD::XOR) {
   35163     Opc = X86ISD::SHRD;
   35164     std::swap(Op0, Op1);
   35165     std::swap(ShAmt0, ShAmt1);
   35166   }
   35167 
   35168   // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
   35169   // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
   35170   // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
   35171   // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
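  // Concrete i64 instance of the first form (illustrative, C = 7):
  //   (or (shl X, 7), (srl Y, 57))  ->  SHLD( X, Y, 7 )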
   35172   unsigned Bits = VT.getSizeInBits();
   35173   if (ShAmt1.getOpcode() == ISD::SUB) {
   35174     SDValue Sum = ShAmt1.getOperand(0);
   35175     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
   35176       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
   35177       if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
   35178         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
   35179       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
   35180         return DAG.getNode(Opc, DL, VT,
   35181                            Op0, Op1,
   35182                            DAG.getNode(ISD::TRUNCATE, DL,
   35183                                        MVT::i8, ShAmt0));
   35184     }
   35185   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
   35186     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
   35187     if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
   35188       return DAG.getNode(Opc, DL, VT,
   35189                          N0.getOperand(0), N1.getOperand(0),
   35190                          DAG.getNode(ISD::TRUNCATE, DL,
   35191                                        MVT::i8, ShAmt0));
   35192   } else if (ShAmt1.getOpcode() == ISD::XOR) {
   35193     SDValue Mask = ShAmt1.getOperand(1);
   35194     if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
   35195       unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
   35196       SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
   35197       if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
   35198         ShAmt1Op0 = ShAmt1Op0.getOperand(0);
   35199       if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
   35200         if (Op1.getOpcode() == InnerShift &&
   35201             isa<ConstantSDNode>(Op1.getOperand(1)) &&
   35202             Op1.getConstantOperandVal(1) == 1) {
   35203           return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
   35204                              DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
   35205         }
   35206         // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
   35207         if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
   35208             Op1.getOperand(0) == Op1.getOperand(1)) {
   35209           return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
   35210                      DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
   35211         }
   35212       }
   35213     }
   35214   }
   35215 
   35216   return SDValue();
   35217 }
   35218 
   35219 /// Try to turn tests against the signbit in the form of:
   35220 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
   35221 /// into:
   35222 ///   SETGT(X, -1)
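/// The reasoning for i32 (sketch): (X >> 31) is the sign bit, so
/// ((X >> 31) ^ 1) is 1 exactly when X >= 0, which is the same predicate as
/// X > -1.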
   35223 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
   35224   // This is only worth doing if the output type is i8 or i1.
   35225   EVT ResultType = N->getValueType(0);
   35226   if (ResultType != MVT::i8 && ResultType != MVT::i1)
   35227     return SDValue();
   35228 
   35229   SDValue N0 = N->getOperand(0);
   35230   SDValue N1 = N->getOperand(1);
   35231 
   35232   // We should be performing an xor against a truncated shift.
   35233   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
   35234     return SDValue();
   35235 
   35236   // Make sure we are performing an xor against one.
   35237   if (!isOneConstant(N1))
   35238     return SDValue();
   35239 
   35240   // SetCC on x86 zero extends so only act on this if it's a logical shift.
   35241   SDValue Shift = N0.getOperand(0);
   35242   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
   35243     return SDValue();
   35244 
   35245   // Make sure we are truncating from one of i16, i32 or i64.
   35246   EVT ShiftTy = Shift.getValueType();
   35247   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
   35248     return SDValue();
   35249 
   35250   // Make sure the shift amount extracts the sign bit.
   35251   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
   35252       Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
   35253     return SDValue();
   35254 
   35255   // Create a greater-than comparison against -1.
    35256   // N.B. Using SETGE against 0 works but we want a canonical-looking
    35257   // comparison; using SETGT matches up with what TranslateX86CC does.
   35258   SDLoc DL(N);
   35259   SDValue ShiftOp = Shift.getOperand(0);
   35260   EVT ShiftOpTy = ShiftOp.getValueType();
   35261   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   35262   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
   35263                                                *DAG.getContext(), ResultType);
   35264   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
   35265                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
   35266   if (SetCCResultType != ResultType)
   35267     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
   35268   return Cond;
   35269 }
   35270 
   35271 /// Turn vector tests of the signbit in the form of:
   35272 ///   xor (sra X, elt_size(X)-1), -1
   35273 /// into:
   35274 ///   pcmpgt X, -1
   35275 ///
   35276 /// This should be called before type legalization because the pattern may not
   35277 /// persist after that.
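/// For example, for v4i32 (schematic): each lane of (sra X, 31) is all-ones
/// when that lane of X is negative and zero otherwise, so xor'ing with -1
/// yields all-ones exactly for the non-negative lanes, which is what
/// (pcmpgt X, -1) computes directly.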
   35278 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
   35279                                          const X86Subtarget &Subtarget) {
   35280   EVT VT = N->getValueType(0);
   35281   if (!VT.isSimple())
   35282     return SDValue();
   35283 
   35284   switch (VT.getSimpleVT().SimpleTy) {
   35285   default: return SDValue();
   35286   case MVT::v16i8:
   35287   case MVT::v8i16:
   35288   case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
   35289   case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
   35290   case MVT::v32i8:
   35291   case MVT::v16i16:
   35292   case MVT::v8i32:
   35293   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
   35294   }
   35295 
   35296   // There must be a shift right algebraic before the xor, and the xor must be a
   35297   // 'not' operation.
   35298   SDValue Shift = N->getOperand(0);
   35299   SDValue Ones = N->getOperand(1);
   35300   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
   35301       !ISD::isBuildVectorAllOnes(Ones.getNode()))
   35302     return SDValue();
   35303 
   35304   // The shift should be smearing the sign bit across each vector element.
   35305   auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
   35306   if (!ShiftBV)
   35307     return SDValue();
   35308 
   35309   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
   35310   auto *ShiftAmt = ShiftBV->getConstantSplatNode();
   35311   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
   35312     return SDValue();
   35313 
   35314   // Create a greater-than comparison against -1. We don't use the more obvious
   35315   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
   35316   return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
   35317 }
   35318 
    35319 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
    35320 /// is valid for the given \p Subtarget.
   35321 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
   35322                                         const X86Subtarget &Subtarget) {
   35323   if (!Subtarget.hasAVX512())
   35324     return false;
   35325 
   35326   // FIXME: Scalar type may be supported if we move it to vector register.
   35327   if (!SrcVT.isVector())
   35328     return false;
   35329 
   35330   EVT SrcElVT = SrcVT.getScalarType();
   35331   EVT DstElVT = DstVT.getScalarType();
   35332   if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
   35333     return false;
   35334   if (SrcVT.is512BitVector() || Subtarget.hasVLX())
   35335     return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
   35336   return false;
   35337 }
   35338 
   35339 /// Detect patterns of truncation with unsigned saturation:
   35340 ///
   35341 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
   35342 ///   Return the source value x to be truncated or SDValue() if the pattern was
   35343 ///   not matched.
   35344 ///
   35345 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
   35346 ///   where C1 >= 0 and C2 is unsigned max of destination type.
   35347 ///
   35348 ///    (truncate (smax (smin (x, C2), C1)) to dest_type)
   35349 ///   where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
   35350 ///
   35351 ///   These two patterns are equivalent to:
   35352 ///   (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
   35353 ///   So return the smax(x, C1) value to be truncated or SDValue() if the
   35354 ///   pattern was not matched.
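///
/// For instance, when truncating i32 elements to i16 (a sketch), pattern 1 is
///   (truncate (umin X, 65535))
/// and the value returned for the saturating truncate is X.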
   35355 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
   35356                                  const SDLoc &DL) {
   35357   EVT InVT = In.getValueType();
   35358 
   35359   // Saturation with truncation. We truncate from InVT to VT.
   35360   assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
   35361          "Unexpected types for truncate operation");
   35362 
   35363   // Match min/max and return limit value as a parameter.
   35364   auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
   35365     if (V.getOpcode() == Opcode &&
   35366         ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
   35367       return V.getOperand(0);
   35368     return SDValue();
   35369   };
   35370 
   35371   APInt C1, C2;
   35372   if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
    35373     // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
    35374     // the element size of the destination type.
   35375     if (C2.isMask(VT.getScalarSizeInBits()))
   35376       return UMin;
   35377 
   35378   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
   35379     if (MatchMinMax(SMin, ISD::SMAX, C1))
   35380       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
   35381         return SMin;
   35382 
   35383   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
   35384     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
   35385       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
   35386           C2.uge(C1)) {
   35387         return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
   35388       }
   35389 
   35390   return SDValue();
   35391 }
   35392 
   35393 /// Detect patterns of truncation with signed saturation:
   35394 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
   35395 ///                  signed_max_of_dest_type)) to dest_type)
   35396 /// or:
   35397 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
   35398 ///                  signed_min_of_dest_type)) to dest_type).
   35399 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
   35400 /// Return the source value to be truncated or SDValue() if the pattern was not
   35401 /// matched.
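/// For instance, for an i32 -> i8 signed-saturating truncate (a sketch), this
/// matches
///   (truncate (smin (smax X, -128), 127))
/// and returns X.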
   35402 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
   35403   unsigned NumDstBits = VT.getScalarSizeInBits();
   35404   unsigned NumSrcBits = In.getScalarValueSizeInBits();
   35405   assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
   35406 
   35407   auto MatchMinMax = [](SDValue V, unsigned Opcode,
   35408                         const APInt &Limit) -> SDValue {
   35409     APInt C;
   35410     if (V.getOpcode() == Opcode &&
   35411         ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
   35412       return V.getOperand(0);
   35413     return SDValue();
   35414   };
   35415 
   35416   APInt SignedMax, SignedMin;
   35417   if (MatchPackUS) {
   35418     SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
   35419     SignedMin = APInt(NumSrcBits, 0);
   35420   } else {
   35421     SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
   35422     SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
   35423   }
   35424 
   35425   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
   35426     if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
   35427       return SMax;
   35428 
   35429   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
   35430     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
   35431       return SMin;
   35432 
   35433   return SDValue();
   35434 }
   35435 
   35436 /// Detect a pattern of truncation with signed saturation.
    35437 /// The types should allow using a VPMOVS* instruction on AVX512.
   35438 /// Return the source value to be truncated or SDValue() if the pattern was not
   35439 /// matched.
   35440 static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
   35441                                        const X86Subtarget &Subtarget,
   35442                                        const TargetLowering &TLI) {
   35443   if (!TLI.isTypeLegal(In.getValueType()))
   35444     return SDValue();
   35445   if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
   35446     return SDValue();
   35447   return detectSSatPattern(In, VT);
   35448 }
   35449 
   35450 /// Detect a pattern of truncation with saturation:
   35451 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
    35452 /// The types should allow using a VPMOVUS* instruction on AVX512.
   35453 /// Return the source value to be truncated or SDValue() if the pattern was not
   35454 /// matched.
   35455 static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
   35456                                        const SDLoc &DL,
   35457                                        const X86Subtarget &Subtarget,
   35458                                        const TargetLowering &TLI) {
   35459   if (!TLI.isTypeLegal(In.getValueType()))
   35460     return SDValue();
   35461   if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
   35462     return SDValue();
   35463   return detectUSatPattern(In, VT, DAG, DL);
   35464 }
   35465 
   35466 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
   35467                                       SelectionDAG &DAG,
   35468                                       const X86Subtarget &Subtarget) {
   35469   EVT SVT = VT.getScalarType();
   35470   EVT InVT = In.getValueType();
   35471   EVT InSVT = InVT.getScalarType();
   35472   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   35473   if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
   35474       isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
   35475     if (auto SSatVal = detectSSatPattern(In, VT))
   35476       return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
   35477     if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
   35478       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
   35479   }
   35480   if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
   35481       (SVT == MVT::i8 || SVT == MVT::i16) &&
   35482       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
   35483     if (auto USatVal = detectSSatPattern(In, VT, true)) {
   35484       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
   35485       if (SVT == MVT::i8 && InSVT == MVT::i32) {
   35486         EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
   35487                                      VT.getVectorNumElements());
   35488         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
   35489                                              DAG, Subtarget);
   35490         if (Mid)
   35491           return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
   35492                                         Subtarget);
   35493       } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
   35494         return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
   35495                                       Subtarget);
   35496     }
   35497     if (auto SSatVal = detectSSatPattern(In, VT))
   35498       return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
   35499                                     Subtarget);
   35500   }
   35501   return SDValue();
   35502 }
   35503 
   35504 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
    35505 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
   35506 /// X86ISD::AVG instruction.
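/// For example, with SSE2 a matched <16 x i8> pattern is emitted as a single
/// X86ISD::AVG node, which selects to pavgb.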
   35507 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
   35508                                 const X86Subtarget &Subtarget,
   35509                                 const SDLoc &DL) {
   35510   if (!VT.isVector())
   35511     return SDValue();
   35512   EVT InVT = In.getValueType();
   35513   unsigned NumElems = VT.getVectorNumElements();
   35514 
   35515   EVT ScalarVT = VT.getVectorElementType();
   35516   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
   35517         isPowerOf2_32(NumElems)))
   35518     return SDValue();
   35519 
    35520   // InScalarVT is the intermediate type in the AVG pattern and it should be
    35521   // wider than the destination element type (i8/i16).
   35522   EVT InScalarVT = InVT.getVectorElementType();
   35523   if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
   35524     return SDValue();
   35525 
   35526   if (!Subtarget.hasSSE2())
   35527     return SDValue();
   35528 
   35529   // Detect the following pattern:
   35530   //
   35531   //   %1 = zext <N x i8> %a to <N x i32>
   35532   //   %2 = zext <N x i8> %b to <N x i32>
   35533   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
   35534   //   %4 = add nuw nsw <N x i32> %3, %2
    35535   //   %5 = lshr <N x i32> %4, <i32 1 x N>
   35536   //   %6 = trunc <N x i32> %5 to <N x i8>
   35537   //
   35538   // In AVX512, the last instruction can also be a trunc store.
   35539   if (In.getOpcode() != ISD::SRL)
   35540     return SDValue();
   35541 
    35542   // A lambda checking that the given SDValue is a constant vector and that each
    35543   // element is in the range [Min, Max].
   35544   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
   35545     BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
   35546     if (!BV || !BV->isConstant())
   35547       return false;
   35548     for (SDValue Op : V->ops()) {
   35549       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
   35550       if (!C)
   35551         return false;
   35552       const APInt &Val = C->getAPIntValue();
   35553       if (Val.ult(Min) || Val.ugt(Max))
   35554         return false;
   35555     }
   35556     return true;
   35557   };
   35558 
    35559   // Check if each element of the vector is right-shifted by one.
   35560   auto LHS = In.getOperand(0);
   35561   auto RHS = In.getOperand(1);
   35562   if (!IsConstVectorInRange(RHS, 1, 1))
   35563     return SDValue();
   35564   if (LHS.getOpcode() != ISD::ADD)
   35565     return SDValue();
   35566 
   35567   // Detect a pattern of a + b + 1 where the order doesn't matter.
   35568   SDValue Operands[3];
   35569   Operands[0] = LHS.getOperand(0);
   35570   Operands[1] = LHS.getOperand(1);
   35571 
   35572   auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   35573                        ArrayRef<SDValue> Ops) {
   35574     return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
   35575   };
   35576 
    35577   // Take care of the case when one of the operands is a constant vector whose
    35578   // elements are in the range [1, 256] (or [1, 65536] for i16).
   35579   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
   35580       Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
   35581       Operands[0].getOperand(0).getValueType() == VT) {
   35582     // The pattern is detected. Subtract one from the constant vector, then
   35583     // demote it and emit X86ISD::AVG instruction.
   35584     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
   35585     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
   35586     Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
   35587     return SplitOpsAndApply(DAG, Subtarget, DL, VT,
   35588                             { Operands[0].getOperand(0), Operands[1] },
   35589                             AVGBuilder);
   35590   }
   35591 
   35592   if (Operands[0].getOpcode() == ISD::ADD)
   35593     std::swap(Operands[0], Operands[1]);
   35594   else if (Operands[1].getOpcode() != ISD::ADD)
   35595     return SDValue();
   35596   Operands[2] = Operands[1].getOperand(0);
   35597   Operands[1] = Operands[1].getOperand(1);
   35598 
   35599   // Now we have three operands of two additions. Check that one of them is a
   35600   // constant vector with ones, and the other two are promoted from i8/i16.
   35601   for (int i = 0; i < 3; ++i) {
   35602     if (!IsConstVectorInRange(Operands[i], 1, 1))
   35603       continue;
   35604     std::swap(Operands[i], Operands[2]);
   35605 
   35606     // Check if Operands[0] and Operands[1] are results of type promotion.
   35607     for (int j = 0; j < 2; ++j)
   35608       if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
   35609           Operands[j].getOperand(0).getValueType() != VT)
   35610         return SDValue();
   35611 
   35612     // The pattern is detected, emit X86ISD::AVG instruction(s).
   35613     return SplitOpsAndApply(DAG, Subtarget, DL, VT,
   35614                             { Operands[0].getOperand(0),
   35615                               Operands[1].getOperand(0) }, AVGBuilder);
   35616   }
   35617 
   35618   return SDValue();
   35619 }
   35620 
   35621 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
   35622                            TargetLowering::DAGCombinerInfo &DCI,
   35623                            const X86Subtarget &Subtarget) {
   35624   LoadSDNode *Ld = cast<LoadSDNode>(N);
   35625   EVT RegVT = Ld->getValueType(0);
   35626   EVT MemVT = Ld->getMemoryVT();
   35627   SDLoc dl(Ld);
   35628   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   35629 
   35630   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
   35631   // into two 16-byte operations. Also split non-temporal aligned loads on
   35632   // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
   35633   ISD::LoadExtType Ext = Ld->getExtensionType();
   35634   bool Fast;
   35635   unsigned AddressSpace = Ld->getAddressSpace();
   35636   unsigned Alignment = Ld->getAlignment();
   35637   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
   35638       Ext == ISD::NON_EXTLOAD &&
   35639       ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
   35640        (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
   35641                                AddressSpace, Alignment, &Fast) && !Fast))) {
   35642     unsigned NumElems = RegVT.getVectorNumElements();
   35643     if (NumElems < 2)
   35644       return SDValue();
   35645 
   35646     SDValue Ptr = Ld->getBasePtr();
   35647 
   35648     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
   35649                                   NumElems/2);
   35650     SDValue Load1 =
   35651         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
   35652                     Alignment, Ld->getMemOperand()->getFlags());
   35653 
   35654     Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
   35655     SDValue Load2 =
   35656         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
   35657                     Ld->getPointerInfo().getWithOffset(16),
   35658                     MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
   35659     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
   35660                              Load1.getValue(1),
   35661                              Load2.getValue(1));
   35662 
   35663     SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
   35664     return DCI.CombineTo(N, NewVec, TF, true);
   35665   }
   35666 
   35667   return SDValue();
   35668 }
   35669 
   35670 /// If V is a build vector of boolean constants and exactly one of those
   35671 /// constants is true, return the operand index of that true element.
   35672 /// Otherwise, return -1.
   35673 static int getOneTrueElt(SDValue V) {
   35674   // This needs to be a build vector of booleans.
   35675   // TODO: Checking for the i1 type matches the IR definition for the mask,
   35676   // but the mask check could be loosened to i8 or other types. That might
    35677   // also require checking more than 'allOnesValue'; e.g., the x86 HW
   35678   // instructions only require that the MSB is set for each mask element.
   35679   // The ISD::MSTORE comments/definition do not specify how the mask operand
   35680   // is formatted.
   35681   auto *BV = dyn_cast<BuildVectorSDNode>(V);
   35682   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
   35683     return -1;
   35684 
   35685   int TrueIndex = -1;
   35686   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
   35687   for (unsigned i = 0; i < NumElts; ++i) {
   35688     const SDValue &Op = BV->getOperand(i);
   35689     if (Op.isUndef())
   35690       continue;
   35691     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
   35692     if (!ConstNode)
   35693       return -1;
   35694     if (ConstNode->getAPIntValue().isAllOnesValue()) {
   35695       // If we already found a one, this is too many.
   35696       if (TrueIndex >= 0)
   35697         return -1;
   35698       TrueIndex = i;
   35699     }
   35700   }
   35701   return TrueIndex;
   35702 }
   35703 
    35704 /// Given a masked memory load/store operation, return true if it has exactly
    35705 /// one mask bit set. If so, also return the memory address of the scalar
    35706 /// element to load/store, the vector index to insert/extract that scalar
    35707 /// element, and the alignment for the scalar memory access.
   35708 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
   35709                                          SelectionDAG &DAG, SDValue &Addr,
   35710                                          SDValue &Index, unsigned &Alignment) {
   35711   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
   35712   if (TrueMaskElt < 0)
   35713     return false;
   35714 
   35715   // Get the address of the one scalar element that is specified by the mask
   35716   // using the appropriate offset from the base pointer.
   35717   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
   35718   Addr = MaskedOp->getBasePtr();
   35719   if (TrueMaskElt != 0) {
   35720     unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
   35721     Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
   35722   }
   35723 
   35724   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
   35725   Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
   35726   return true;
   35727 }
   35728 
   35729 /// If exactly one element of the mask is set for a non-extending masked load,
   35730 /// reduce it to a scalar load followed by a vector insert.
   35731 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
   35732 /// mask have already been optimized in IR, so we don't bother with those here.
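         /// A rough sketch of the rewrite (illustrative only):
         ///   (v4f32 (masked_load %ptr, <0,0,1,0>, %passthru))
         ///     --> (insert_vector_elt %passthru, (f32 (load %ptr + 8)), 2)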
   35733 static SDValue
   35734 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
   35735                              TargetLowering::DAGCombinerInfo &DCI) {
   35736   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
   35737   // However, some target hooks may need to be added to know when the transform
   35738   // is profitable. Endianness would also have to be considered.
   35739 
   35740   SDValue Addr, VecIndex;
   35741   unsigned Alignment;
   35742   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
   35743     return SDValue();
   35744 
   35745   // Load the one scalar element that is specified by the mask using the
   35746   // appropriate offset from the base pointer.
   35747   SDLoc DL(ML);
   35748   EVT VT = ML->getValueType(0);
   35749   EVT EltVT = VT.getVectorElementType();
   35750   SDValue Load =
   35751       DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
   35752                   Alignment, ML->getMemOperand()->getFlags());
   35753 
   35754   // Insert the loaded element into the appropriate place in the vector.
   35755   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
   35756                                Load, VecIndex);
   35757   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
   35758 }
   35759 
   35760 static SDValue
   35761 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
   35762                               TargetLowering::DAGCombinerInfo &DCI) {
   35763   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
   35764     return SDValue();
   35765 
   35766   SDLoc DL(ML);
   35767   EVT VT = ML->getValueType(0);
   35768 
   35769   // If we are loading the first and last elements of a vector, it is safe and
   35770   // always faster to load the whole vector. Replace the masked load with a
   35771   // vector load and select.
   35772   unsigned NumElts = VT.getVectorNumElements();
   35773   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
   35774   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
   35775   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
   35776   if (LoadFirstElt && LoadLastElt) {
   35777     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
   35778                                 ML->getMemOperand());
   35779     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
   35780     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
   35781   }
   35782 
   35783   // Convert a masked load with a constant mask into a masked load and a select.
   35784   // This allows the select operation to use a faster kind of select instruction
   35785   // (for example, vblendvps -> vblendps).
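           // A rough sketch of the rewrite (illustrative only):
           //   (masked_load %ptr, ConstMask, %passthru)
           //     --> (vselect ConstMask, (masked_load %ptr, ConstMask, undef), %passthru)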
   35786 
   35787   // Don't try this if the pass-through operand is already undefined. That would
   35788   // cause an infinite loop because that's what we're about to create.
   35789   if (ML->getSrc0().isUndef())
   35790     return SDValue();
   35791 
   35792   // The new masked load has an undef pass-through operand. The select uses the
   35793   // original pass-through operand.
   35794   SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
   35795                                     ML->getMask(), DAG.getUNDEF(VT),
   35796                                     ML->getMemoryVT(), ML->getMemOperand(),
   35797                                     ML->getExtensionType());
   35798   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
   35799 
   35800   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
   35801 }
   35802 
   35803 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
   35804                                  TargetLowering::DAGCombinerInfo &DCI,
   35805                                  const X86Subtarget &Subtarget) {
   35806   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
   35807 
   35808   // TODO: Expanding load with constant mask may be optimized as well.
   35809   if (Mld->isExpandingLoad())
   35810     return SDValue();
   35811 
   35812   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
   35813     if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
   35814       return ScalarLoad;
   35815     // TODO: Do some AVX512 subsets benefit from this transform?
   35816     if (!Subtarget.hasAVX512())
   35817       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
   35818         return Blend;
   35819   }
   35820 
   35821   if (Mld->getExtensionType() != ISD::SEXTLOAD)
   35822     return SDValue();
   35823 
   35824   // Resolve extending loads.
   35825   EVT VT = Mld->getValueType(0);
   35826   unsigned NumElems = VT.getVectorNumElements();
   35827   EVT LdVT = Mld->getMemoryVT();
   35828   SDLoc dl(Mld);
   35829 
   35830   assert(LdVT != VT && "Cannot extend to the same type");
   35831   unsigned ToSz = VT.getScalarSizeInBits();
   35832   unsigned FromSz = LdVT.getScalarSizeInBits();
   35833   // From/To sizes and ElemCount must be powers of two.
   35834   assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
   35835          "Unexpected size for extending masked load");
   35836 
   35837   unsigned SizeRatio  = ToSz / FromSz;
   35838   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
   35839 
   35840   // Create a type on which we perform the shuffle.
   35841   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   35842           LdVT.getScalarType(), NumElems*SizeRatio);
   35843   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   35844 
   35845   // Convert Src0 value.
   35846   SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
   35847   if (!Mld->getSrc0().isUndef()) {
   35848     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   35849     for (unsigned i = 0; i != NumElems; ++i)
   35850       ShuffleVec[i] = i * SizeRatio;
   35851 
   35852     // Can't shuffle using an illegal type.
   35853     assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
   35854            "WideVecVT should be legal");
   35855     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
   35856                                     DAG.getUNDEF(WideVecVT), ShuffleVec);
   35857   }
   35858 
   35859   // Prepare the new mask.
   35860   SDValue NewMask;
   35861   SDValue Mask = Mld->getMask();
   35862   if (Mask.getValueType() == VT) {
   35863     // Mask and original value have the same type.
   35864     NewMask = DAG.getBitcast(WideVecVT, Mask);
   35865     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   35866     for (unsigned i = 0; i != NumElems; ++i)
   35867       ShuffleVec[i] = i * SizeRatio;
   35868     for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
   35869       ShuffleVec[i] = NumElems * SizeRatio;
   35870     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
   35871                                    DAG.getConstant(0, dl, WideVecVT),
   35872                                    ShuffleVec);
   35873   } else {
   35874     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
   35875     unsigned WidenNumElts = NumElems*SizeRatio;
   35876     unsigned MaskNumElts = VT.getVectorNumElements();
   35877     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   35878                                      WidenNumElts);
   35879 
   35880     unsigned NumConcat = WidenNumElts / MaskNumElts;
   35881     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
   35882     SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
   35883     Ops[0] = Mask;
   35884     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
   35885   }
   35886 
   35887   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
   35888                                      Mld->getBasePtr(), NewMask, WideSrc0,
   35889                                      Mld->getMemoryVT(), Mld->getMemOperand(),
   35890                                      ISD::NON_EXTLOAD);
   35891   SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
   35892   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
   35893 }
   35894 
   35895 /// If exactly one element of the mask is set for a non-truncating masked store,
   35896 /// reduce it to a vector extract followed by a scalar store.
   35897 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
   35898 /// mask have already been optimized in IR, so we don't bother with those here.
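         /// A rough sketch of the rewrite (illustrative only):
         ///   (masked_store (v4f32 %val), %ptr, <0,0,1,0>)
         ///     --> (store (f32 (extract_vector_elt %val, 2)), %ptr + 8)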
   35899 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
   35900                                               SelectionDAG &DAG) {
   35901   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
   35902   // However, some target hooks may need to be added to know when the transform
   35903   // is profitable. Endianness would also have to be considered.
   35904 
   35905   SDValue Addr, VecIndex;
   35906   unsigned Alignment;
   35907   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
   35908     return SDValue();
   35909 
   35910   // Extract the one scalar element that is actually being stored.
   35911   SDLoc DL(MS);
   35912   EVT VT = MS->getValue().getValueType();
   35913   EVT EltVT = VT.getVectorElementType();
   35914   SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
   35915                                 MS->getValue(), VecIndex);
   35916 
   35917   // Store that element at the appropriate offset from the base pointer.
   35918   return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
   35919                       Alignment, MS->getMemOperand()->getFlags());
   35920 }
   35921 
   35922 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
   35923                                   const X86Subtarget &Subtarget) {
   35924   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
   35925 
   35926   if (Mst->isCompressingStore())
   35927     return SDValue();
   35928 
   35929   if (!Mst->isTruncatingStore()) {
   35930     if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
   35931       return ScalarStore;
   35932 
   35933     // If the mask is checking (0 > X), we're creating a vector with all-zeros
   35934     // or all-ones elements based on the sign bits of X. AVX1 masked store only
   35935     // cares about the sign bit of each mask element, so eliminate the compare:
   35936     // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
   35937     // Note that by waiting to match an x86-specific PCMPGT node, we're
   35938     // eliminating potentially more complex matching of a setcc node which has
   35939     // a full range of predicates.
   35940     SDValue Mask = Mst->getMask();
   35941     if (Mask.getOpcode() == X86ISD::PCMPGT &&
   35942         ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
   35943       assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
   35944              "Unexpected type for PCMPGT");
   35945       return DAG.getMaskedStore(
   35946           Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
   35947           Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
   35948     }
   35949 
   35950     // TODO: AVX512 targets should also be able to simplify something like the
   35951     // pattern above, but that pattern will be different. It will either need to
   35952     // match setcc more generally or match PCMPGTM later (in tablegen?).
   35953 
   35954     return SDValue();
   35955   }
   35956 
   35957   // Resolve truncating stores.
   35958   EVT VT = Mst->getValue().getValueType();
   35959   unsigned NumElems = VT.getVectorNumElements();
   35960   EVT StVT = Mst->getMemoryVT();
   35961   SDLoc dl(Mst);
   35962 
   35963   assert(StVT != VT && "Cannot truncate to the same type");
   35964   unsigned FromSz = VT.getScalarSizeInBits();
   35965   unsigned ToSz = StVT.getScalarSizeInBits();
   35966 
   35967   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   35968 
   35969   // The truncating store is legal in some cases. For example,
   35970   // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
   35971   // are truncating store instructions.
   35972   // In that case we don't need any further transformations.
   35973   if (TLI.isTruncStoreLegal(VT, StVT))
   35974     return SDValue();
   35975 
   35976   // From/To sizes and ElemCount must be powers of two.
   35977   assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
   35978          "Unexpected size for truncating masked store");
   35979   // We are going to use the original vector element type for storing.
   35980   // Accumulated smaller vector elements must be a multiple of the store size.
   35981   assert(((NumElems * FromSz) % ToSz) == 0 &&
   35982          "Unexpected ratio for truncating masked store");
   35983 
   35984   unsigned SizeRatio  = FromSz / ToSz;
   35985   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   35986 
   35987   // Create a type on which we perform the shuffle.
   35988   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   35989           StVT.getScalarType(), NumElems*SizeRatio);
   35990 
   35991   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   35992 
   35993   SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
   35994   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   35995   for (unsigned i = 0; i != NumElems; ++i)
   35996     ShuffleVec[i] = i * SizeRatio;
   35997 
   35998   // Can't shuffle using an illegal type.
   35999   assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
   36000          "WideVecVT should be legal");
   36001 
   36002   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   36003                                               DAG.getUNDEF(WideVecVT),
   36004                                               ShuffleVec);
   36005 
   36006   SDValue NewMask;
   36007   SDValue Mask = Mst->getMask();
   36008   if (Mask.getValueType() == VT) {
   36009     // Mask and original value have the same type.
   36010     NewMask = DAG.getBitcast(WideVecVT, Mask);
   36011     for (unsigned i = 0; i != NumElems; ++i)
   36012       ShuffleVec[i] = i * SizeRatio;
   36013     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
   36014       ShuffleVec[i] = NumElems*SizeRatio;
   36015     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
   36016                                    DAG.getConstant(0, dl, WideVecVT),
   36017                                    ShuffleVec);
   36018   } else {
   36019     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
   36020     unsigned WidenNumElts = NumElems*SizeRatio;
   36021     unsigned MaskNumElts = VT.getVectorNumElements();
   36022     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
   36023                                      WidenNumElts);
   36024 
   36025     unsigned NumConcat = WidenNumElts / MaskNumElts;
   36026     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
   36027     SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
   36028     Ops[0] = Mask;
   36029     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
   36030   }
   36031 
   36032   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
   36033                             Mst->getBasePtr(), NewMask, StVT,
   36034                             Mst->getMemOperand(), false);
   36035 }
   36036 
   36037 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   36038                             const X86Subtarget &Subtarget) {
   36039   StoreSDNode *St = cast<StoreSDNode>(N);
   36040   EVT VT = St->getValue().getValueType();
   36041   EVT StVT = St->getMemoryVT();
   36042   SDLoc dl(St);
   36043   SDValue StoredVal = St->getOperand(1);
   36044   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   36045 
   36046   // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
   36047   // This will avoid a copy to a k-register.
   36048   if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
   36049       StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   36050       StoredVal.getOperand(0).getValueType() == MVT::i8) {
   36051     return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
   36052                         St->getBasePtr(), St->getPointerInfo(),
   36053                         St->getAlignment(), St->getMemOperand()->getFlags());
   36054   }
   36055 
   36056   // Widen v2i1/v4i1 stores to v8i1.
   36057   if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
   36058       Subtarget.hasAVX512()) {
   36059     unsigned NumConcats = 8 / VT.getVectorNumElements();
   36060     SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
   36061     Ops[0] = StoredVal;
   36062     StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
   36063     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
   36064                         St->getPointerInfo(), St->getAlignment(),
   36065                         St->getMemOperand()->getFlags());
   36066   }
   36067 
   36068   // Turn vXi1 stores of constants into a scalar store.
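           // For example (illustrative only), a v8i1 store of a constant mask can be
           // emitted as a single i8 store of the bit-packed constant.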
   36069   if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
   36070        VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
   36071       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
   36072     // If it's a v64i1 store without 64-bit support, we need two stores.
   36073     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
   36074       SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
   36075                                       StoredVal->ops().slice(0, 32));
   36076       Lo = combinevXi1ConstantToInteger(Lo, DAG);
   36077       SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
   36078                                       StoredVal->ops().slice(32, 32));
   36079       Hi = combinevXi1ConstantToInteger(Hi, DAG);
   36080 
   36081       unsigned Alignment = St->getAlignment();
   36082 
   36083       SDValue Ptr0 = St->getBasePtr();
   36084       SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
   36085 
   36086       SDValue Ch0 =
   36087           DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
   36088                        Alignment, St->getMemOperand()->getFlags());
   36089       SDValue Ch1 =
   36090           DAG.getStore(St->getChain(), dl, Hi, Ptr1,
   36091                        St->getPointerInfo().getWithOffset(4),
   36092                        MinAlign(Alignment, 4U),
   36093                        St->getMemOperand()->getFlags());
   36094       return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   36095     }
   36096 
   36097     StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
   36098     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
   36099                         St->getPointerInfo(), St->getAlignment(),
   36100                         St->getMemOperand()->getFlags());
   36101   }
   36102 
   36103   // If we are saving a concatenation of two XMM registers and 32-byte stores
   36104   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
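           // A rough sketch (illustrative only): a v8f32 store becomes two v4f32 stores
           // at offsets 0 and 16 from the base pointer, tied together with a TokenFactor.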
   36105   bool Fast;
   36106   unsigned AddressSpace = St->getAddressSpace();
   36107   unsigned Alignment = St->getAlignment();
   36108   if (VT.is256BitVector() && StVT == VT &&
   36109       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
   36110                              AddressSpace, Alignment, &Fast) &&
   36111       !Fast) {
   36112     unsigned NumElems = VT.getVectorNumElements();
   36113     if (NumElems < 2)
   36114       return SDValue();
   36115 
   36116     SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
   36117     SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
   36118 
   36119     SDValue Ptr0 = St->getBasePtr();
   36120     SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
   36121 
   36122     SDValue Ch0 =
   36123         DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
   36124                      Alignment, St->getMemOperand()->getFlags());
   36125     SDValue Ch1 =
   36126         DAG.getStore(St->getChain(), dl, Value1, Ptr1,
   36127                      St->getPointerInfo().getWithOffset(16),
   36128                      MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
   36129     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   36130   }
   36131 
   36132   // Optimize a truncating store (of multiple scalars) to a shuffle and store.
   36133   // First, pack all of the elements into the low part of a wide vector. Next,
   36134   // store them to memory in fewer, larger chunks.
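           // A rough sketch (illustrative only), assuming the truncating store is not
           // already legal: a v8i32 -> v8i16 truncating store is rewritten as a shuffle
           // that packs the truncated 16-bit values into the low half of a v16i16,
           // followed by one or more stores of the widest legal scalar type.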
   36135   if (St->isTruncatingStore() && VT.isVector()) {
   36136     // Check if we can detect an AVG pattern from the truncation. If yes,
   36137     // replace the truncating store with a normal store of the result of the
   36138     // X86ISD::AVG node.
   36139     if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
   36140                                        Subtarget, dl))
   36141       return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
   36142                           St->getPointerInfo(), St->getAlignment(),
   36143                           St->getMemOperand()->getFlags());
   36144 
   36145     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   36146     if (SDValue Val =
   36147         detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
   36148                                 TLI))
   36149       return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
   36150                              dl, Val, St->getBasePtr(),
   36151                              St->getMemoryVT(), St->getMemOperand(), DAG);
   36152     if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
   36153                                               DAG, dl, Subtarget, TLI))
   36154       return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
   36155                              dl, Val, St->getBasePtr(),
   36156                              St->getMemoryVT(), St->getMemOperand(), DAG);
   36157 
   36158     unsigned NumElems = VT.getVectorNumElements();
   36159     assert(StVT != VT && "Cannot truncate to the same type");
   36160     unsigned FromSz = VT.getScalarSizeInBits();
   36161     unsigned ToSz = StVT.getScalarSizeInBits();
   36162 
   36163     // The truncating store is legal in some cases. For example,
   36164     // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
   36165     // are truncating store instructions.
   36166     // In that case we don't need any further transformations.
   36167     if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
   36168       return SDValue();
   36169 
   36170     // From/To sizes and ElemCount must be powers of two.
   36171     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
   36172     // We are going to use the original vector element type for storing.
   36173     // Accumulated smaller vector elements must be a multiple of the store size.
   36174     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
   36175 
   36176     unsigned SizeRatio  = FromSz / ToSz;
   36177 
   36178     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
   36179 
   36180     // Create a type on which we perform the shuffle
   36181     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
   36182             StVT.getScalarType(), NumElems*SizeRatio);
   36183 
   36184     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
   36185 
   36186     SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
   36187     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   36188     for (unsigned i = 0; i != NumElems; ++i)
   36189       ShuffleVec[i] = i * SizeRatio;
   36190 
   36191     // Can't shuffle using an illegal type.
   36192     if (!TLI.isTypeLegal(WideVecVT))
   36193       return SDValue();
   36194 
   36195     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
   36196                                          DAG.getUNDEF(WideVecVT),
   36197                                          ShuffleVec);
   36198     // At this point all of the data is stored at the bottom of the
   36199     // register. We now need to save it to memory.
   36200 
   36201     // Find the largest legal store unit.
   36202     MVT StoreType = MVT::i8;
   36203     for (MVT Tp : MVT::integer_valuetypes()) {
   36204       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
   36205         StoreType = Tp;
   36206     }
   36207 
   36208     // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
   36209     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
   36210         (64 <= NumElems * ToSz))
   36211       StoreType = MVT::f64;
   36212 
   36213     // Bitcast the original vector into a vector of store-size units
   36214     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
   36215             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
   36216     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
   36217     SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
   36218     SmallVector<SDValue, 8> Chains;
   36219     SDValue Ptr = St->getBasePtr();
   36220 
   36221     // Perform one or more big stores into memory.
   36222     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
   36223       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   36224                                    StoreType, ShuffWide,
   36225                                    DAG.getIntPtrConstant(i, dl));
   36226       SDValue Ch =
   36227           DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
   36228                        St->getAlignment(), St->getMemOperand()->getFlags());
   36229       Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
   36230       Chains.push_back(Ch);
   36231     }
   36232 
   36233     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   36234   }
   36235 
   36236   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   36237   // the FP state in cases where an emms may be missing.
   36238   // A preferable solution to the general problem is to figure out the right
   36239   // places to insert EMMS.  This qualifies as a quick hack.
   36240 
   36241   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
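           // A rough sketch (illustrative only): on a 32-bit target with SSE2,
           //   (store (i64 (load %p)), %q)
           // is rewritten as an f64 load/store pair; the MMX vector cases without f64
           // support fall back to two i32 load/store pairs at offsets 0 and 4.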
   36242   if (VT.getSizeInBits() != 64)
   36243     return SDValue();
   36244 
   36245   const Function &F = DAG.getMachineFunction().getFunction();
   36246   bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
   36247   bool F64IsLegal =
   36248       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
   36249   if ((VT.isVector() ||
   36250        (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
   36251       isa<LoadSDNode>(St->getValue()) &&
   36252       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
   36253       St->getChain().hasOneUse() && !St->isVolatile()) {
   36254     LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
   36255     SmallVector<SDValue, 8> Ops;
   36256 
   36257     if (!ISD::isNormalLoad(Ld))
   36258       return SDValue();
   36259 
   36260     // If this is not the MMX case, i.e. we are just turning i64 load/store
   36261     // into f64 load/store, avoid the transformation if there are multiple
   36262     // uses of the loaded value.
   36263     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
   36264       return SDValue();
   36265 
   36266     SDLoc LdDL(Ld);
   36267     SDLoc StDL(N);
   36268     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
   36269     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
   36270     // pair instead.
   36271     if (Subtarget.is64Bit() || F64IsLegal) {
   36272       MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
   36273       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
   36274                                   Ld->getMemOperand());
   36275 
   36276       // Make sure new load is placed in same chain order.
   36277       DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
   36278       return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
   36279                           St->getMemOperand());
   36280     }
   36281 
   36282     // Otherwise, lower to two pairs of 32-bit loads / stores.
   36283     SDValue LoAddr = Ld->getBasePtr();
   36284     SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
   36285 
   36286     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
   36287                                Ld->getPointerInfo(), Ld->getAlignment(),
   36288                                Ld->getMemOperand()->getFlags());
   36289     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
   36290                                Ld->getPointerInfo().getWithOffset(4),
   36291                                MinAlign(Ld->getAlignment(), 4),
   36292                                Ld->getMemOperand()->getFlags());
   36293     // Make sure new loads are placed in same chain order.
   36294     DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
   36295     DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
   36296 
   36297     LoAddr = St->getBasePtr();
   36298     HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
   36299 
   36300     SDValue LoSt =
   36301         DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
   36302                      St->getAlignment(), St->getMemOperand()->getFlags());
   36303     SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
   36304                                 St->getPointerInfo().getWithOffset(4),
   36305                                 MinAlign(St->getAlignment(), 4),
   36306                                 St->getMemOperand()->getFlags());
   36307     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   36308   }
   36309 
   36310   // This is similar to the above case, but here we handle a scalar 64-bit
   36311   // integer store that is extracted from a vector on a 32-bit target.
   36312   // If we have SSE2, then we can treat it like a floating-point double
   36313   // to get past legalization. The execution dependencies fixup pass will
   36314   // choose the optimal machine instruction for the store if this really is
   36315   // an integer or v2f32 rather than an f64.
   36316   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
   36317       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   36318     SDValue OldExtract = St->getOperand(1);
   36319     SDValue ExtOp0 = OldExtract.getOperand(0);
   36320     unsigned VecSize = ExtOp0.getValueSizeInBits();
   36321     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
   36322     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
   36323     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
   36324                                      BitCast, OldExtract.getOperand(1));
   36325     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
   36326                         St->getPointerInfo(), St->getAlignment(),
   36327                         St->getMemOperand()->getFlags());
   36328   }
   36329 
   36330   return SDValue();
   36331 }
   36332 
   36333 /// Return 'true' if this vector operation is "horizontal"
   36334 /// and return the operands for the horizontal operation in LHS and RHS.  A
   36335 /// horizontal operation performs the binary operation on successive elements
   36336 /// of its first operand, then on successive elements of its second operand,
   36337 /// returning the resulting values in a vector.  For example, if
   36338 ///   A = < float a0, float a1, float a2, float a3 >
   36339 /// and
   36340 ///   B = < float b0, float b1, float b2, float b3 >
   36341 /// then the result of doing a horizontal operation on A and B is
   36342 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
   36343 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
   36344 /// A horizontal-op B, for some already available A and B, and if so then LHS is
   36345 /// set to A, RHS to B, and the routine returns 'true'.
   36346 /// Note that the binary operation should have the property that if one of the
   36347 /// operands is UNDEF then the result is UNDEF.
   36348 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   36349   // Look for the following pattern: if
   36350   //   A = < float a0, float a1, float a2, float a3 >
   36351   //   B = < float b0, float b1, float b2, float b3 >
   36352   // and
   36353   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
   36354   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
   36355   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
   36356   // which is A horizontal-op B.
   36357 
   36358   // At least one of the operands should be a vector shuffle.
   36359   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
   36360       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
   36361     return false;
   36362 
   36363   MVT VT = LHS.getSimpleValueType();
   36364 
   36365   assert((VT.is128BitVector() || VT.is256BitVector()) &&
   36366          "Unsupported vector type for horizontal add/sub");
   36367 
   36368   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
   36369   // operate independently on 128-bit lanes.
   36370   unsigned NumElts = VT.getVectorNumElements();
   36371   unsigned NumLanes = VT.getSizeInBits()/128;
   36372   unsigned NumLaneElts = NumElts / NumLanes;
   36373   assert((NumLaneElts % 2 == 0) &&
   36374          "Vector type should have an even number of elements in each lane");
   36375   unsigned HalfLaneElts = NumLaneElts/2;
   36376 
   36377   // View LHS in the form
   36378   //   LHS = VECTOR_SHUFFLE A, B, LMask
   36379   // If LHS is not a shuffle then pretend it is the shuffle
   36380   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
   36381   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
   36382   // type VT.
   36383   SDValue A, B;
   36384   SmallVector<int, 16> LMask(NumElts);
   36385   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   36386     if (!LHS.getOperand(0).isUndef())
   36387       A = LHS.getOperand(0);
   36388     if (!LHS.getOperand(1).isUndef())
   36389       B = LHS.getOperand(1);
   36390     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
   36391     std::copy(Mask.begin(), Mask.end(), LMask.begin());
   36392   } else {
   36393     if (!LHS.isUndef())
   36394       A = LHS;
   36395     for (unsigned i = 0; i != NumElts; ++i)
   36396       LMask[i] = i;
   36397   }
   36398 
   36399   // Likewise, view RHS in the form
   36400   //   RHS = VECTOR_SHUFFLE C, D, RMask
   36401   SDValue C, D;
   36402   SmallVector<int, 16> RMask(NumElts);
   36403   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
   36404     if (!RHS.getOperand(0).isUndef())
   36405       C = RHS.getOperand(0);
   36406     if (!RHS.getOperand(1).isUndef())
   36407       D = RHS.getOperand(1);
   36408     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
   36409     std::copy(Mask.begin(), Mask.end(), RMask.begin());
   36410   } else {
   36411     if (!RHS.isUndef())
   36412       C = RHS;
   36413     for (unsigned i = 0; i != NumElts; ++i)
   36414       RMask[i] = i;
   36415   }
   36416 
   36417   // Check that the shuffles are both shuffling the same vectors.
   36418   if (!(A == C && B == D) && !(A == D && B == C))
   36419     return false;
   36420 
   36421   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
   36422   if (!A.getNode() && !B.getNode())
   36423     return false;
   36424 
   36425   // If A and B occur in reverse order in RHS, then "swap" them (which means
   36426   // rewriting the mask).
   36427   if (A != C)
   36428     ShuffleVectorSDNode::commuteMask(RMask);
   36429 
   36430   // At this point LHS and RHS are equivalent to
   36431   //   LHS = VECTOR_SHUFFLE A, B, LMask
   36432   //   RHS = VECTOR_SHUFFLE A, B, RMask
   36433   // Check that the masks correspond to performing a horizontal operation.
   36434   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
   36435     for (unsigned i = 0; i != NumLaneElts; ++i) {
   36436       int LIdx = LMask[i+l], RIdx = RMask[i+l];
   36437 
   36438       // Ignore any UNDEF components.
   36439       if (LIdx < 0 || RIdx < 0 ||
   36440           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
   36441           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
   36442         continue;
   36443 
   36444       // Check that successive elements are being operated on.  If not, this is
   36445       // not a horizontal operation.
   36446       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
   36447       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
   36448       if (!(LIdx == Index && RIdx == Index + 1) &&
   36449           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
   36450         return false;
   36451     }
   36452   }
   36453 
   36454   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   36455   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
   36456   return true;
   36457 }
   36458 
   36459 /// Do target-specific dag combines on floating-point adds/subs.
   36460 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   36461                                const X86Subtarget &Subtarget) {
   36462   EVT VT = N->getValueType(0);
   36463   SDValue LHS = N->getOperand(0);
   36464   SDValue RHS = N->getOperand(1);
   36465   bool IsFadd = N->getOpcode() == ISD::FADD;
   36466   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
   36467 
   36468   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
   36469   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
   36470        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
   36471       isHorizontalBinOp(LHS, RHS, IsFadd)) {
   36472     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
   36473     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
   36474   }
   36475   return SDValue();
   36476 }
   36477 
   36478 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
   36479 /// the codegen.
   36480 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
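         /// For example (illustrative only), with a repeated operand:
         ///   (v8i16 (trunc (and (v8i32 X), (v8i32 X))))
         ///     --> (and (v8i16 (trunc X)), (v8i16 (trunc X)))
         /// trades one output truncation for a single input truncation.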
   36481 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
   36482                                           const X86Subtarget &Subtarget,
   36483                                           const SDLoc &DL) {
   36484   assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
   36485   SDValue Src = N->getOperand(0);
   36486   unsigned Opcode = Src.getOpcode();
   36487   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   36488 
   36489   EVT VT = N->getValueType(0);
   36490   EVT SrcVT = Src.getValueType();
   36491 
   36492   auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
   36493     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
   36494 
   36495     // Repeated operand, so we are only trading one output truncation for
   36496     // one input truncation.
   36497     if (Op0 == Op1)
   36498       return true;
   36499 
   36500     // See if either operand has been extended from a smaller/equal size to
   36501     // the truncation size, allowing a truncation to combine with the extend.
   36502     unsigned Opcode0 = Op0.getOpcode();
   36503     if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
   36504          Opcode0 == ISD::ZERO_EXTEND) &&
   36505         Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
   36506       return true;
   36507 
   36508     unsigned Opcode1 = Op1.getOpcode();
   36509     if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
   36510          Opcode1 == ISD::ZERO_EXTEND) &&
   36511         Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
   36512       return true;
   36513 
   36514     // See if either operand is a single use constant which can be constant
   36515     // folded.
   36516     SDValue BC0 = peekThroughOneUseBitcasts(Op0);
   36517     SDValue BC1 = peekThroughOneUseBitcasts(Op1);
   36518     return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
   36519            ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
   36520   };
   36521 
   36522   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
   36523     SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
   36524     SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
   36525     return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
   36526   };
   36527 
   36528   // Don't combine if the operation has other uses.
   36529   if (!N->isOnlyUserOf(Src.getNode()))
   36530     return SDValue();
   36531 
   36532   // Only support vector truncation for now.
   36533   // TODO: i64 scalar math would benefit as well.
   36534   if (!VT.isVector())
   36535     return SDValue();
   36536 
   36537   // In most cases it's only worth pre-truncating if we're facing the cost of
   36538   // just one truncation,
   36539   // i.e. if one of the inputs will constant fold or the input is repeated.
   36540   switch (Opcode) {
   36541   case ISD::AND:
   36542   case ISD::XOR:
   36543   case ISD::OR: {
   36544     SDValue Op0 = Src.getOperand(0);
   36545     SDValue Op1 = Src.getOperand(1);
   36546     if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
   36547         IsRepeatedOpOrFreeTruncation(Op0, Op1))
   36548       return TruncateArithmetic(Op0, Op1);
   36549     break;
   36550   }
   36551 
   36552   case ISD::MUL:
   36553     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
   36554     // better to truncate if we have the chance.
   36555     if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
   36556         !TLI.isOperationLegal(Opcode, SrcVT))
   36557       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
   36558     LLVM_FALLTHROUGH;
   36559   case ISD::ADD: {
   36560     // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
   36561     SDValue Op0 = Src.getOperand(0);
   36562     SDValue Op1 = Src.getOperand(1);
   36563     if (TLI.isOperationLegal(Opcode, VT) &&
   36564         IsRepeatedOpOrFreeTruncation(Op0, Op1))
   36565       return TruncateArithmetic(Op0, Op1);
   36566     break;
   36567   }
   36568   }
   36569 
   36570   return SDValue();
   36571 }
   36572 
   36573 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
   36574 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
   36575                                                  const X86Subtarget &Subtarget,
   36576                                                  SelectionDAG &DAG) {
   36577   SDValue In = N->getOperand(0);
   36578   EVT InVT = In.getValueType();
   36579   EVT InSVT = InVT.getVectorElementType();
   36580   EVT OutVT = N->getValueType(0);
   36581   EVT OutSVT = OutVT.getVectorElementType();
   36582 
   36583   // Split a long vector into vectors of legal type, and mask off all bits
   36584   // that won't appear in the result to prevent saturation.
   36585   // TODO - we should be doing this at the maximum legal size, but this is
   36586   // causing regressions where we're concatenating back to max width just to
   36587   // perform the AND and then extracting back again.
   36588   unsigned NumSubRegs = InVT.getSizeInBits() / 128;
   36589   unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
   36590   EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
   36591   SmallVector<SDValue, 8> SubVecs(NumSubRegs);
   36592 
   36593   APInt Mask =
   36594       APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
   36595   SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
   36596 
   36597   for (unsigned i = 0; i < NumSubRegs; i++) {
   36598     SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
   36599                               DAG.getIntPtrConstant(i * NumSubRegElts, DL));
   36600     SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
   36601   }
   36602   In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
   36603 
   36604   return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
   36605 }
   36606 
   36607 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
   36608 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
   36609                                                  const X86Subtarget &Subtarget,
   36610                                                  SelectionDAG &DAG) {
   36611   SDValue In = N->getOperand(0);
   36612   EVT InVT = In.getValueType();
   36613   EVT OutVT = N->getValueType(0);
   36614   In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
   36615                    DAG.getValueType(OutVT));
   36616   return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
   36617 }
   36618 
   36619 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
   36620 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
   36621 /// legalization the truncation will be translated into a BUILD_VECTOR with each
   36622 /// element that is extracted from a vector and then truncated, and it is
   36623 /// difficult to do this optimization based on them.
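         /// For example (a rough illustration), a v16i32 -> v16i8 truncation can be
         /// handled by ANDing each 32-bit element with 255 and then narrowing in stages
         /// with X86ISD::PACKUS nodes, since the masked values can never saturate.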
   36624 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
   36625                                        const X86Subtarget &Subtarget) {
   36626   EVT OutVT = N->getValueType(0);
   36627   if (!OutVT.isVector())
   36628     return SDValue();
   36629 
   36630   SDValue In = N->getOperand(0);
   36631   if (!In.getValueType().isSimple())
   36632     return SDValue();
   36633 
   36634   EVT InVT = In.getValueType();
   36635   unsigned NumElems = OutVT.getVectorNumElements();
   36636 
   36637   // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
   36638   // SSE2, and we need to take care of it specially.
   36639   // AVX512 provides vpmovdb.
   36640   if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
   36641     return SDValue();
   36642 
   36643   EVT OutSVT = OutVT.getVectorElementType();
   36644   EVT InSVT = InVT.getVectorElementType();
   36645   if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
   36646         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
   36647         NumElems >= 8))
   36648     return SDValue();
   36649 
   36650   // SSSE3's pshufb results in fewer instructions in the cases below.
   36651   if (Subtarget.hasSSSE3() && NumElems == 8 &&
   36652       ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
   36653        (InSVT == MVT::i32 && OutSVT == MVT::i16)))
   36654     return SDValue();
   36655 
   36656   SDLoc DL(N);
   36657   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
   36658   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
   36659   // truncate 2 x v4i32 to v8i16.
   36660   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
   36661     return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
   36662   if (InSVT == MVT::i32)
   36663     return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
   36664 
   36665   return SDValue();
   36666 }
   36667 
   36668 /// This function transforms vector truncation of 'extended sign-bits' or
   36669 /// 'extended zero-bits' values, i.e. truncation from
   36670 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32, into X86ISD::PACKSS/PACKUS operations.
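         /// For example (illustrative only), truncating a vXi16 comparison result to
         /// vXi8 can use PACKSS, because every source element is 0 or -1 and so its
         /// sign bits extend all the way down to the packed value.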
   36671 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
   36672                                                SelectionDAG &DAG,
   36673                                                const X86Subtarget &Subtarget) {
   36674   // Requires SSE2 but AVX512 has fast truncate.
   36675   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
   36676     return SDValue();
   36677 
   36678   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
   36679     return SDValue();
   36680 
   36681   SDValue In = N->getOperand(0);
   36682   if (!In.getValueType().isSimple())
   36683     return SDValue();
   36684 
   36685   MVT VT = N->getValueType(0).getSimpleVT();
   36686   MVT SVT = VT.getScalarType();
   36687 
   36688   MVT InVT = In.getValueType().getSimpleVT();
   36689   MVT InSVT = InVT.getScalarType();
   36690 
   36691   // Check we have a truncation suited for PACKSS/PACKUS.
   36692   if (!VT.is128BitVector() && !VT.is256BitVector())
   36693     return SDValue();
   36694   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
   36695     return SDValue();
   36696   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
   36697     return SDValue();
   36698 
   36699   unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
   36700   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
   36701 
   36702   // Use PACKUS if the input has zero-bits that extend all the way to the
   36703   // packed/truncated value. e.g. masks, zext_in_reg, etc.
   36704   KnownBits Known;
   36705   DAG.computeKnownBits(In, Known);
   36706   unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
   36707   if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
   36708     return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
   36709 
   36710   // Use PACKSS if the input has sign-bits that extend all the way to the
   36711   // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
   36712   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
   36713   if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
   36714     return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
   36715 
   36716   return SDValue();
   36717 }
   36718 
   36719 // Try to form a MULHU or MULHS node by looking for
   36720 // (trunc (srl (mul ext, ext), 16))
   36721 // TODO: This is X86 specific because we want to be able to handle wide types
   36722 // before type legalization. But we can only do it if the vector will be
   36723 // legalized via widening/splitting. Type legalization can't handle promotion
   36724 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
   36725 // combiner.
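         // A rough sketch of the match (illustrative only):
         //   (v8i16 (trunc (srl (mul (zext (v8i16 X)), (zext (v8i16 Y))), 16)))
         //     --> (mulhu X, Y)
         // with the sign-extended form mapping to MULHS instead.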
   36726 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
   36727                             SelectionDAG &DAG, const X86Subtarget &Subtarget) {
   36728   // First instruction should be a right shift of a multiply.
   36729   if (Src.getOpcode() != ISD::SRL ||
   36730       Src.getOperand(0).getOpcode() != ISD::MUL)
   36731     return SDValue();
   36732 
   36733   if (!Subtarget.hasSSE2())
   36734     return SDValue();
   36735 
   36736   // Only handle vXi16 types that are at least 128-bits.
   36737   if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
   36738       VT.getVectorNumElements() < 8)
   36739     return SDValue();
   36740 
   36741   // Input type should be vXi32.
   36742   EVT InVT = Src.getValueType();
   36743   if (InVT.getVectorElementType() != MVT::i32)
   36744     return SDValue();
   36745 
   36746   // Need a shift by 16.
   36747   APInt ShiftAmt;
   36748   if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
   36749       ShiftAmt != 16)
   36750     return SDValue();
   36751 
   36752   SDValue LHS = Src.getOperand(0).getOperand(0);
   36753   SDValue RHS = Src.getOperand(0).getOperand(1);
   36754 
   36755   unsigned ExtOpc = LHS.getOpcode();
   36756   if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
   36757       RHS.getOpcode() != ExtOpc)
   36758     return SDValue();
   36759 
   36760   // Peek through the extends.
   36761   LHS = LHS.getOperand(0);
   36762   RHS = RHS.getOperand(0);
   36763 
   36764   // Ensure the input types match.
   36765   if (LHS.getValueType() != VT || RHS.getValueType() != VT)
   36766     return SDValue();
   36767 
   36768   unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
   36769   return DAG.getNode(Opc, DL, VT, LHS, RHS);
   36770 }
   36771 
   36772 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
   36773 // from one vector with signed bytes from another vector, adds together
   36774 // adjacent pairs of 16-bit products, and saturates the result before
   36775 // truncating to 16 bits.
   36776 //
   36777 // Which looks something like this:
   36778 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
   36779 //                 (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
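         //
         // If everything below matches, the whole expression can be replaced by a
         // single PMADDUBSW operation; the per-element loop is a structural check
         // that the build_vectors really interleave the even and odd elements of two
         // common source vectors.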
   36780 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
   36781                                const X86Subtarget &Subtarget,
   36782                                const SDLoc &DL) {
   36783   if (!VT.isVector() || !Subtarget.hasSSSE3())
   36784     return SDValue();
   36785 
   36786   unsigned NumElems = VT.getVectorNumElements();
   36787   EVT ScalarVT = VT.getVectorElementType();
   36788   if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
   36789     return SDValue();
   36790 
   36791   SDValue SSatVal = detectSSatPattern(In, VT);
   36792   if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
   36793     return SDValue();
   36794 
   36795   // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
   36796   // of multiplies from even/odd elements.
   36797   SDValue N0 = SSatVal.getOperand(0);
   36798   SDValue N1 = SSatVal.getOperand(1);
   36799 
   36800   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
   36801     return SDValue();
   36802 
   36803   SDValue N00 = N0.getOperand(0);
   36804   SDValue N01 = N0.getOperand(1);
   36805   SDValue N10 = N1.getOperand(0);
   36806   SDValue N11 = N1.getOperand(1);
   36807 
   36808   // TODO: Handle constant vectors and use knownbits/computenumsignbits?
   36809   // Canonicalize zero_extend to LHS.
   36810   if (N01.getOpcode() == ISD::ZERO_EXTEND)
   36811     std::swap(N00, N01);
   36812   if (N11.getOpcode() == ISD::ZERO_EXTEND)
   36813     std::swap(N10, N11);
   36814 
   36815   // Ensure we have a zero_extend and a sign_extend.
   36816   if (N00.getOpcode() != ISD::ZERO_EXTEND ||
   36817       N01.getOpcode() != ISD::SIGN_EXTEND ||
   36818       N10.getOpcode() != ISD::ZERO_EXTEND ||
   36819       N11.getOpcode() != ISD::SIGN_EXTEND)
   36820     return SDValue();
   36821 
   36822   // Peek through the extends.
   36823   N00 = N00.getOperand(0);
   36824   N01 = N01.getOperand(0);
   36825   N10 = N10.getOperand(0);
   36826   N11 = N11.getOperand(0);
   36827 
   36828   // Ensure the extend is from vXi8.
   36829   if (N00.getValueType().getVectorElementType() != MVT::i8 ||
   36830       N01.getValueType().getVectorElementType() != MVT::i8 ||
   36831       N10.getValueType().getVectorElementType() != MVT::i8 ||
   36832       N11.getValueType().getVectorElementType() != MVT::i8)
   36833     return SDValue();
   36834 
   36835   // All inputs should be build_vectors.
   36836   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
   36837       N01.getOpcode() != ISD::BUILD_VECTOR ||
   36838       N10.getOpcode() != ISD::BUILD_VECTOR ||
   36839       N11.getOpcode() != ISD::BUILD_VECTOR)
   36840     return SDValue();
   36841 
   36842   // N00/N10 are zero extended. N01/N11 are sign extended.
   36843 
    36844   // For each output element, we need the even element from one vector
    36845   // multiplied by the even element of the other vector, added to the
    36846   // product of the corresponding odd elements of the two vectors. That is,
    36847   // for each element i we must make sure the following operation is being
    36848   // performed:
    36849   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
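            // For example, output element 0 must be A[0]*B[0] + A[1]*B[1], output
            // element 1 must be A[2]*B[2] + A[3]*B[3], and so on.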
   36850   SDValue ZExtIn, SExtIn;
   36851   for (unsigned i = 0; i != NumElems; ++i) {
   36852     SDValue N00Elt = N00.getOperand(i);
   36853     SDValue N01Elt = N01.getOperand(i);
   36854     SDValue N10Elt = N10.getOperand(i);
   36855     SDValue N11Elt = N11.getOperand(i);
   36856     // TODO: Be more tolerant to undefs.
   36857     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   36858         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   36859         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   36860         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   36861       return SDValue();
   36862     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
   36863     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
   36864     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
   36865     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
   36866     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
   36867       return SDValue();
   36868     unsigned IdxN00 = ConstN00Elt->getZExtValue();
   36869     unsigned IdxN01 = ConstN01Elt->getZExtValue();
   36870     unsigned IdxN10 = ConstN10Elt->getZExtValue();
   36871     unsigned IdxN11 = ConstN11Elt->getZExtValue();
   36872     // Add is commutative so indices can be reordered.
   36873     if (IdxN00 > IdxN10) {
   36874       std::swap(IdxN00, IdxN10);
   36875       std::swap(IdxN01, IdxN11);
   36876     }
    36877     // N0 indices must be the even elements. N1 indices must be the next odd elements.
   36878     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
   36879         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
   36880       return SDValue();
   36881     SDValue N00In = N00Elt.getOperand(0);
   36882     SDValue N01In = N01Elt.getOperand(0);
   36883     SDValue N10In = N10Elt.getOperand(0);
   36884     SDValue N11In = N11Elt.getOperand(0);
    36885     // The first time we find an input, capture it.
   36886     if (!ZExtIn) {
   36887       ZExtIn = N00In;
   36888       SExtIn = N01In;
   36889     }
   36890     if (ZExtIn != N00In || SExtIn != N01In ||
   36891         ZExtIn != N10In || SExtIn != N11In)
   36892       return SDValue();
   36893   }
   36894 
   36895   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   36896                          ArrayRef<SDValue> Ops) {
   36897     // Shrink by adding truncate nodes and let DAGCombine fold with the
   36898     // sources.
   36899     EVT InVT = Ops[0].getValueType();
   36900     assert(InVT.getScalarType() == MVT::i8 &&
   36901            "Unexpected scalar element type");
   36902     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
   36903     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
   36904                                  InVT.getVectorNumElements() / 2);
   36905     return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
   36906   };
   36907   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
   36908                           PMADDBuilder);
   36909 }
   36910 
   36911 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   36912                                const X86Subtarget &Subtarget) {
   36913   EVT VT = N->getValueType(0);
   36914   SDValue Src = N->getOperand(0);
   36915   SDLoc DL(N);
   36916 
   36917   // Attempt to pre-truncate inputs to arithmetic ops instead.
   36918   if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
   36919     return V;
   36920 
   36921   // Try to detect AVG pattern first.
   36922   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
   36923     return Avg;
   36924 
   36925   // Try to detect PMADD
   36926   if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
   36927     return PMAdd;
   36928 
   36929   // Try to combine truncation with signed/unsigned saturation.
   36930   if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
   36931     return Val;
   36932 
   36933   // Try to combine PMULHUW/PMULHW for vXi16.
   36934   if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
   36935     return V;
   36936 
    36937   // Detect a truncate to i32 of a bitcast whose source is a direct MMX
    36938   // result, and use MMX_MOVD2W to read the low 32 bits instead.
   36939   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
   36940     SDValue BCSrc = Src.getOperand(0);
   36941     if (BCSrc.getValueType() == MVT::x86mmx)
   36942       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   36943   }
   36944 
   36945   // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
   36946   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
   36947     return V;
   36948 
   36949   return combineVectorTruncation(N, DAG, Subtarget);
   36950 }
   36951 
    36952 /// Returns the negated value if the node \p N flips the sign of an FP value.
    36953 ///
    36954 /// An FP-negation node may have different forms: FNEG(x) or FXOR(x, 0x80000000).
    36955 /// AVX512F does not have FXOR, so FNEG is lowered as
    36956 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
    36957 /// In this case we go through all bitcasts.
   36958 static SDValue isFNEG(SDNode *N) {
   36959   if (N->getOpcode() == ISD::FNEG)
   36960     return N->getOperand(0);
   36961 
   36962   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
   36963   if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
   36964     return SDValue();
   36965 
   36966   SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
   36967   if (!Op1.getValueType().isFloatingPoint())
   36968     return SDValue();
   36969 
   36970   // Extract constant bits and see if they are all sign bit masks.
   36971   APInt UndefElts;
   36972   SmallVector<APInt, 16> EltBits;
   36973   if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(),
   36974                                     UndefElts, EltBits, false, false))
   36975     if (llvm::all_of(EltBits, [](APInt &I) { return I.isSignMask(); }))
   36976       return peekThroughBitcasts(Op.getOperand(0));
   36977 
   36978   return SDValue();
   36979 }
   36980 
   36981 /// Do target-specific dag combines on floating point negations.
   36982 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
   36983                            const X86Subtarget &Subtarget) {
   36984   EVT OrigVT = N->getValueType(0);
   36985   SDValue Arg = isFNEG(N);
   36986   assert(Arg.getNode() && "N is expected to be an FNEG node");
   36987 
   36988   EVT VT = Arg.getValueType();
   36989   EVT SVT = VT.getScalarType();
   36990   SDLoc DL(N);
   36991 
   36992   // Let legalize expand this if it isn't a legal type yet.
   36993   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   36994     return SDValue();
   36995 
   36996   // If we're negating a FMUL node on a target with FMA, then we can avoid the
   36997   // use of a constant by performing (-0 - A*B) instead.
   36998   // FIXME: Check rounding control flags as well once it becomes available.
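            // Illustratively, assuming the usual X86 FNMSUB semantics of -(A*B) - C,
            //   FNEG(FMUL(A, B)) --> FNMSUB(A, B, +0.0) == -(A*B) - 0.0
            // so no sign-mask constant needs to be materialized.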
   36999   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
   37000       Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
   37001     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
   37002     SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
   37003                                   Arg.getOperand(1), Zero);
   37004     return DAG.getBitcast(OrigVT, NewNode);
   37005   }
   37006 
   37007   // If we're negating an FMA node, then we can adjust the
   37008   // instruction to include the extra negation.
   37009   unsigned NewOpcode = 0;
   37010   if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
   37011     switch (Arg.getOpcode()) {
   37012     case ISD::FMA:             NewOpcode = X86ISD::FNMSUB;       break;
   37013     case X86ISD::FMSUB:        NewOpcode = X86ISD::FNMADD;       break;
   37014     case X86ISD::FNMADD:       NewOpcode = X86ISD::FMSUB;        break;
   37015     case X86ISD::FNMSUB:       NewOpcode = ISD::FMA;             break;
   37016     case X86ISD::FMADD_RND:    NewOpcode = X86ISD::FNMSUB_RND;   break;
   37017     case X86ISD::FMSUB_RND:    NewOpcode = X86ISD::FNMADD_RND;   break;
   37018     case X86ISD::FNMADD_RND:   NewOpcode = X86ISD::FMSUB_RND;    break;
   37019     case X86ISD::FNMSUB_RND:   NewOpcode = X86ISD::FMADD_RND;    break;
    37020     // We can't handle a scalar intrinsic node here because it would only
    37021     // invert one element and not the whole vector. But we could try to
    37022     // handle a negation of the lower element only.
   37023     }
   37024   }
   37025   if (NewOpcode)
   37026     return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
   37027                                               Arg.getNode()->ops()));
   37028 
   37029   return SDValue();
   37030 }
   37031 
   37032 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
   37033                                  const X86Subtarget &Subtarget) {
   37034   MVT VT = N->getSimpleValueType(0);
   37035   // If we have integer vector types available, use the integer opcodes.
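            // For example, (v4f32 FAND X, Y) becomes
            //   (v4f32 bitcast (v2i64 and (v2i64 bitcast X), (v2i64 bitcast Y))).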
   37036   if (VT.isVector() && Subtarget.hasSSE2()) {
   37037     SDLoc dl(N);
   37038 
   37039     MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
   37040 
   37041     SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
   37042     SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
   37043     unsigned IntOpcode;
   37044     switch (N->getOpcode()) {
   37045     default: llvm_unreachable("Unexpected FP logic op");
   37046     case X86ISD::FOR: IntOpcode = ISD::OR; break;
   37047     case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
   37048     case X86ISD::FAND: IntOpcode = ISD::AND; break;
   37049     case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
   37050     }
   37051     SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
   37052     return DAG.getBitcast(VT, IntOp);
   37053   }
   37054   return SDValue();
   37055 }
   37056 
   37057 
   37058 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
   37059 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
   37060   if (N->getOpcode() != ISD::XOR)
   37061     return SDValue();
   37062 
   37063   SDValue LHS = N->getOperand(0);
   37064   auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
   37065   if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
   37066     return SDValue();
   37067 
   37068   X86::CondCode NewCC = X86::GetOppositeBranchCondition(
   37069       X86::CondCode(LHS->getConstantOperandVal(0)));
   37070   SDLoc DL(N);
   37071   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
   37072 }
   37073 
   37074 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
   37075                           TargetLowering::DAGCombinerInfo &DCI,
   37076                           const X86Subtarget &Subtarget) {
   37077   // If this is SSE1 only convert to FXOR to avoid scalarization.
   37078   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
   37079       N->getValueType(0) == MVT::v4i32) {
   37080     return DAG.getBitcast(
   37081         MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
   37082                                 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
   37083                                 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
   37084   }
   37085 
   37086   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
   37087     return Cmp;
   37088 
   37089   if (DCI.isBeforeLegalizeOps())
   37090     return SDValue();
   37091 
   37092   if (SDValue SetCC = foldXor1SetCC(N, DAG))
   37093     return SetCC;
   37094 
   37095   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
   37096     return RV;
   37097 
   37098   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
   37099     return FPLogic;
   37100 
   37101   if (isFNEG(N))
   37102     return combineFneg(N, DAG, Subtarget);
   37103   return SDValue();
   37104 }
   37105 
   37106 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
   37107                             TargetLowering::DAGCombinerInfo &DCI,
   37108                             const X86Subtarget &Subtarget) {
   37109   SDValue Op0 = N->getOperand(0);
   37110   SDValue Op1 = N->getOperand(1);
   37111   EVT VT = N->getValueType(0);
   37112   unsigned NumBits = VT.getSizeInBits();
   37113 
   37114   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   37115   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   37116                                         !DCI.isBeforeLegalizeOps());
   37117 
   37118   // TODO - Constant Folding.
   37119   if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
   37120     // Reduce Cst1 to the bottom 16-bits.
   37121     // NOTE: SimplifyDemandedBits won't do this for constants.
   37122     const APInt &Val1 = Cst1->getAPIntValue();
   37123     APInt MaskedVal1 = Val1 & 0xFFFF;
   37124     if (MaskedVal1 != Val1)
   37125       return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
   37126                          DAG.getConstant(MaskedVal1, SDLoc(N), VT));
   37127   }
   37128 
    37129   // Only the bottom 16 bits of the control operand are required.
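            // (The BEXTR control word encodes the start bit in bits [7:0] and the
            // length in bits [15:8]; bits above 15 are ignored by the instruction.)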
   37130   KnownBits Known;
   37131   APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
   37132   if (TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) {
   37133     DCI.CommitTargetLoweringOpt(TLO);
   37134     return SDValue(N, 0);
   37135   }
   37136 
   37137   return SDValue();
   37138 }
   37139 
   37140 static bool isNullFPScalarOrVectorConst(SDValue V) {
   37141   return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
   37142 }
   37143 
   37144 /// If a value is a scalar FP zero or a vector FP zero (potentially including
   37145 /// undefined elements), return a zero constant that may be used to fold away
   37146 /// that value. In the case of a vector, the returned constant will not contain
   37147 /// undefined elements even if the input parameter does. This makes it suitable
    37148 /// to be used as a replacement operand with operations (e.g., bitwise-and) where
   37149 /// an undef should not propagate.
   37150 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
   37151                                         const X86Subtarget &Subtarget) {
   37152   if (!isNullFPScalarOrVectorConst(V))
   37153     return SDValue();
   37154 
   37155   if (V.getValueType().isVector())
   37156     return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
   37157 
   37158   return V;
   37159 }
   37160 
   37161 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
   37162                                       const X86Subtarget &Subtarget) {
   37163   SDValue N0 = N->getOperand(0);
   37164   SDValue N1 = N->getOperand(1);
   37165   EVT VT = N->getValueType(0);
   37166   SDLoc DL(N);
   37167 
   37168   // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
   37169   if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
   37170         (VT == MVT::f64 && Subtarget.hasSSE2()) ||
   37171         (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
   37172     return SDValue();
   37173 
   37174   auto isAllOnesConstantFP = [](SDValue V) {
   37175     if (V.getSimpleValueType().isVector())
   37176       return ISD::isBuildVectorAllOnes(V.getNode());
   37177     auto *C = dyn_cast<ConstantFPSDNode>(V);
   37178     return C && C->getConstantFPValue()->isAllOnesValue();
   37179   };
   37180 
   37181   // fand (fxor X, -1), Y --> fandn X, Y
   37182   if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
   37183     return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
   37184 
   37185   // fand X, (fxor Y, -1) --> fandn Y, X
   37186   if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
   37187     return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
   37188 
   37189   return SDValue();
   37190 }
   37191 
   37192 /// Do target-specific dag combines on X86ISD::FAND nodes.
   37193 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
   37194                            const X86Subtarget &Subtarget) {
   37195   // FAND(0.0, x) -> 0.0
   37196   if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
   37197     return V;
   37198 
   37199   // FAND(x, 0.0) -> 0.0
   37200   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
   37201     return V;
   37202 
   37203   if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
   37204     return V;
   37205 
   37206   return lowerX86FPLogicOp(N, DAG, Subtarget);
   37207 }
   37208 
   37209 /// Do target-specific dag combines on X86ISD::FANDN nodes.
   37210 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
   37211                             const X86Subtarget &Subtarget) {
   37212   // FANDN(0.0, x) -> x
   37213   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
   37214     return N->getOperand(1);
   37215 
   37216   // FANDN(x, 0.0) -> 0.0
   37217   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
   37218     return V;
   37219 
   37220   return lowerX86FPLogicOp(N, DAG, Subtarget);
   37221 }
   37222 
   37223 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
   37224 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
   37225                           const X86Subtarget &Subtarget) {
   37226   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
   37227 
   37228   // F[X]OR(0.0, x) -> x
   37229   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
   37230     return N->getOperand(1);
   37231 
   37232   // F[X]OR(x, 0.0) -> x
   37233   if (isNullFPScalarOrVectorConst(N->getOperand(1)))
   37234     return N->getOperand(0);
   37235 
   37236   if (isFNEG(N))
   37237     if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
   37238       return NewVal;
   37239 
   37240   return lowerX86FPLogicOp(N, DAG, Subtarget);
   37241 }
   37242 
   37243 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
   37244 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
   37245   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
   37246 
   37247   // Only perform optimizations if UnsafeMath is used.
   37248   if (!DAG.getTarget().Options.UnsafeFPMath)
   37249     return SDValue();
   37250 
    37251   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
    37252   // into FMAXC and FMINC, which are commutative operations.
   37253   unsigned NewOp = 0;
   37254   switch (N->getOpcode()) {
   37255     default: llvm_unreachable("unknown opcode");
   37256     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
   37257     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
   37258   }
   37259 
   37260   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
   37261                      N->getOperand(0), N->getOperand(1));
   37262 }
   37263 
   37264 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   37265                                      const X86Subtarget &Subtarget) {
   37266   if (Subtarget.useSoftFloat())
   37267     return SDValue();
   37268 
   37269   // TODO: If an operand is already known to be a NaN or not a NaN, this
   37270   //       should be an optional swap and FMAX/FMIN.
   37271 
   37272   EVT VT = N->getValueType(0);
   37273   if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
   37274         (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
   37275         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
   37276     return SDValue();
   37277 
   37278   SDValue Op0 = N->getOperand(0);
   37279   SDValue Op1 = N->getOperand(1);
   37280   SDLoc DL(N);
   37281   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
   37282 
   37283   // If we don't have to respect NaN inputs, this is a direct translation to x86
   37284   // min/max instructions.
   37285   if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
   37286     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
   37287 
   37288   // If we have to respect NaN inputs, this takes at least 3 instructions.
   37289   // Favor a library call when operating on a scalar and minimizing code size.
   37290   if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
   37291     return SDValue();
   37292 
   37293   EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
   37294       DAG.getDataLayout(), *DAG.getContext(), VT);
   37295 
   37296   // There are 4 possibilities involving NaN inputs, and these are the required
   37297   // outputs:
   37298   //                   Op1
   37299   //               Num     NaN
   37300   //            ----------------
   37301   //       Num  |  Max  |  Op0 |
   37302   // Op0        ----------------
   37303   //       NaN  |  Op1  |  NaN |
   37304   //            ----------------
   37305   //
   37306   // The SSE FP max/min instructions were not designed for this case, but rather
   37307   // to implement:
   37308   //   Min = Op1 < Op0 ? Op1 : Op0
   37309   //   Max = Op1 > Op0 ? Op1 : Op0
   37310   //
   37311   // So they always return Op0 if either input is a NaN. However, we can still
   37312   // use those instructions for fmaxnum by selecting away a NaN input.
   37313 
   37314   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
   37315   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
   37316   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
   37317 
   37318   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
   37319   // are NaN, the NaN value of Op1 is the result.
   37320   return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
   37321 }
   37322 
   37323 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
   37324 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
   37325                             TargetLowering::DAGCombinerInfo &DCI,
   37326                             const X86Subtarget &Subtarget) {
   37327   // ANDNP(0, x) -> x
   37328   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
   37329     return N->getOperand(1);
   37330 
   37331   // ANDNP(x, 0) -> 0
   37332   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
   37333     return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
   37334 
   37335   EVT VT = N->getValueType(0);
   37336 
   37337   // Attempt to recursively combine a bitmask ANDNP with shuffles.
   37338   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
   37339     SDValue Op(N, 0);
   37340     if (SDValue Res = combineX86ShufflesRecursively(
   37341             {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
   37342             /*HasVarMask*/ false, DAG, Subtarget))
   37343       return Res;
   37344   }
   37345 
   37346   return SDValue();
   37347 }
   37348 
   37349 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
   37350                          TargetLowering::DAGCombinerInfo &DCI) {
   37351   SDValue N0 = N->getOperand(0);
   37352   SDValue N1 = N->getOperand(1);
   37353 
   37354   // BT ignores high bits in the bit index operand.
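            // For example, with an i64 bit index only the low 6 bits matter, so a
            // (and y, 63) feeding the index can be simplified back to y.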
   37355   unsigned BitWidth = N1.getValueSizeInBits();
   37356   APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
   37357   if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
   37358     return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
   37359 
   37360   return SDValue();
   37361 }
   37362 
   37363 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
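          // For instance, (sext_in_reg (cmov C0, C1, cc, flags), i16) can become
          // (cmov (sext_in_reg C0, i16), (sext_in_reg C1, i16), cc, flags), where
          // the sign extensions of the constant operands fold away when built.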
   37364 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
   37365   EVT VT = N->getValueType(0);
   37366 
   37367   SDValue N0 = N->getOperand(0);
   37368   SDValue N1 = N->getOperand(1);
   37369   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
   37370 
   37371   if (ExtraVT != MVT::i16)
   37372     return SDValue();
   37373 
   37374   // Look through single use any_extends.
   37375   if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
   37376     N0 = N0.getOperand(0);
   37377 
   37378   // See if we have a single use cmov.
   37379   if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
   37380     return SDValue();
   37381 
   37382   SDValue CMovOp0 = N0.getOperand(0);
   37383   SDValue CMovOp1 = N0.getOperand(1);
   37384 
   37385   // Make sure both operands are constants.
   37386   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
   37387       !isa<ConstantSDNode>(CMovOp1.getNode()))
   37388     return SDValue();
   37389 
   37390   SDLoc DL(N);
   37391 
    37392   // If we looked through an any_extend above, apply the same extension to the constants.
   37393   if (N0.getValueType() != VT) {
   37394     CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
   37395     CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
   37396   }
   37397 
   37398   CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
   37399   CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
   37400 
   37401   return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
   37402                      N0.getOperand(2), N0.getOperand(3));
   37403 }
   37404 
   37405 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
   37406                                       const X86Subtarget &Subtarget) {
   37407   if (SDValue V = combineSextInRegCmov(N, DAG))
   37408     return V;
   37409 
   37410   EVT VT = N->getValueType(0);
   37411   SDValue N0 = N->getOperand(0);
   37412   SDValue N1 = N->getOperand(1);
   37413   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
   37414   SDLoc dl(N);
   37415 
    37416   // A SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
    37417   // AVX2 since there is no sign-extended shift right operation on a vector
    37418   // with 64-bit elements.
    37419   // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
    37420   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
   37421   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
   37422       N0.getOpcode() == ISD::SIGN_EXTEND)) {
   37423     SDValue N00 = N0.getOperand(0);
   37424 
   37425     // EXTLOAD has a better solution on AVX2,
   37426     // it may be replaced with X86ISD::VSEXT node.
   37427     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
   37428       if (!ISD::isNormalLoad(N00.getNode()))
   37429         return SDValue();
   37430 
   37431     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
   37432         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
   37433                                   N00, N1);
   37434       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
   37435     }
   37436   }
   37437   return SDValue();
   37438 }
   37439 
   37440 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
   37441 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
   37442 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
   37443 /// opportunities to combine math ops, use an LEA, or use a complex addressing
   37444 /// mode. This can eliminate extend, add, and shift instructions.
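          /// As an illustration, (i64 sext (add nsw i32 x, 42)) becomes
          /// (i64 add (i64 sext x), 42), and a subsequent add or shl user can then
          /// potentially be folded together with it into a single LEA.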
   37445 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
   37446                                    const X86Subtarget &Subtarget) {
   37447   if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
   37448       Ext->getOpcode() != ISD::ZERO_EXTEND)
   37449     return SDValue();
   37450 
   37451   // TODO: This should be valid for other integer types.
   37452   EVT VT = Ext->getValueType(0);
   37453   if (VT != MVT::i64)
   37454     return SDValue();
   37455 
   37456   SDValue Add = Ext->getOperand(0);
   37457   if (Add.getOpcode() != ISD::ADD)
   37458     return SDValue();
   37459 
   37460   bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
   37461   bool NSW = Add->getFlags().hasNoSignedWrap();
   37462   bool NUW = Add->getFlags().hasNoUnsignedWrap();
   37463 
   37464   // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
   37465   // into the 'zext'
   37466   if ((Sext && !NSW) || (!Sext && !NUW))
   37467     return SDValue();
   37468 
   37469   // Having a constant operand to the 'add' ensures that we are not increasing
   37470   // the instruction count because the constant is extended for free below.
   37471   // A constant operand can also become the displacement field of an LEA.
   37472   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
   37473   if (!AddOp1)
   37474     return SDValue();
   37475 
   37476   // Don't make the 'add' bigger if there's no hope of combining it with some
   37477   // other 'add' or 'shl' instruction.
   37478   // TODO: It may be profitable to generate simpler LEA instructions in place
   37479   // of single 'add' instructions, but the cost model for selecting an LEA
   37480   // currently has a high threshold.
   37481   bool HasLEAPotential = false;
   37482   for (auto *User : Ext->uses()) {
   37483     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
   37484       HasLEAPotential = true;
   37485       break;
   37486     }
   37487   }
   37488   if (!HasLEAPotential)
   37489     return SDValue();
   37490 
   37491   // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
   37492   int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
   37493   SDValue AddOp0 = Add.getOperand(0);
   37494   SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
   37495   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
   37496 
    37497   // The wider add is guaranteed not to wrap because both operands are
    37498   // sign-extended (or zero-extended in the 'nuw' case).
   37499   SDNodeFlags Flags;
   37500   Flags.setNoSignedWrap(NSW);
   37501   Flags.setNoUnsignedWrap(NUW);
   37502   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
   37503 }
   37504 
    37505 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
    37506 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
   37507 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
   37508 /// extends from AH (which we otherwise need to do contortions to access).
   37509 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
   37510   SDValue N0 = N->getOperand(0);
   37511   auto OpcodeN = N->getOpcode();
   37512   auto OpcodeN0 = N0.getOpcode();
   37513   if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
   37514         (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
   37515     return SDValue();
   37516 
   37517   EVT VT = N->getValueType(0);
   37518   EVT InVT = N0.getValueType();
   37519   if (N0.getResNo() != 1 || InVT != MVT::i8 ||
   37520       !(VT == MVT::i32 || VT == MVT::i64))
   37521     return SDValue();
   37522 
   37523   SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
   37524   auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
   37525                                                : X86ISD::UDIVREM8_ZEXT_HREG;
   37526   SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
   37527                           N0.getOperand(1));
   37528   DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
   37529   // If this was a 64-bit extend, complete it.
   37530   if (VT == MVT::i64)
   37531     return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
   37532   return R.getValue(1);
   37533 }
   37534 
   37535 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
   37536 // operands and the result of CMOV is not used anywhere else - promote CMOV
   37537 // itself instead of promoting its result. This could be beneficial, because:
   37538 //     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
   37539 //        (or more) pseudo-CMOVs only when they go one-after-another and
   37540 //        getting rid of result extension code after CMOV will help that.
   37541 //     2) Promotion of constant CMOV arguments is free, hence the
   37542 //        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
    37543 //     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
    37544 //        promotion is also good in terms of code-size.
    37545 //        (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
    37546 //         promotion).
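          // For example, (i32 zext (i16 cmov C0, C1, cc, flags)) becomes
          // (i32 cmov (i32 zext C0), (i32 zext C1), cc, flags), and the extended
          // constant operands cost nothing.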
   37547 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
   37548   SDValue CMovN = Extend->getOperand(0);
   37549   if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
   37550     return SDValue();
   37551 
   37552   EVT TargetVT = Extend->getValueType(0);
   37553   unsigned ExtendOpcode = Extend->getOpcode();
   37554   SDLoc DL(Extend);
   37555 
   37556   EVT VT = CMovN.getValueType();
   37557   SDValue CMovOp0 = CMovN.getOperand(0);
   37558   SDValue CMovOp1 = CMovN.getOperand(1);
   37559 
   37560   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
   37561       !isa<ConstantSDNode>(CMovOp1.getNode()))
   37562     return SDValue();
   37563 
   37564   // Only extend to i32 or i64.
   37565   if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
   37566     return SDValue();
   37567 
    37568   // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
   37569   // are free.
   37570   if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
   37571     return SDValue();
   37572 
    37573   // If this is a zero extend to i64, we should only extend to i32 and use a free
   37574   // zero extend to finish.
   37575   EVT ExtendVT = TargetVT;
   37576   if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
   37577     ExtendVT = MVT::i32;
   37578 
   37579   CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
   37580   CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
   37581 
   37582   SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
   37583                             CMovN.getOperand(2), CMovN.getOperand(3));
   37584 
   37585   // Finish extending if needed.
   37586   if (ExtendVT != TargetVT)
   37587     Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
   37588 
   37589   return Res;
   37590 }
   37591 
   37592 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
   37593 // This is more or less the reverse of combineBitcastvxi1.
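          // Sketching the v8i16 case: (v8i16 sext (v8i1 bitcast (i8 X))) is rebuilt
          // by broadcasting X to every lane, ANDing with <1,2,4,...,128>, comparing
          // the result for equality against that same bit mask, and sign-extending
          // the i1 compare results (zero-extension adds a logical shift right).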
   37594 static SDValue
   37595 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
   37596                                TargetLowering::DAGCombinerInfo &DCI,
   37597                                const X86Subtarget &Subtarget) {
   37598   unsigned Opcode = N->getOpcode();
   37599   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
   37600       Opcode != ISD::ANY_EXTEND)
   37601     return SDValue();
   37602   if (!DCI.isBeforeLegalizeOps())
   37603     return SDValue();
   37604   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
   37605     return SDValue();
   37606 
   37607   SDValue N0 = N->getOperand(0);
   37608   EVT VT = N->getValueType(0);
   37609   EVT SVT = VT.getScalarType();
   37610   EVT InSVT = N0.getValueType().getScalarType();
   37611   unsigned EltSizeInBits = SVT.getSizeInBits();
   37612 
    37613   // We must be extending a bool vector (bit-cast from a scalar integer) to
    37614   // legal integer vector types.
   37615   if (!VT.isVector())
   37616     return SDValue();
   37617   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
   37618     return SDValue();
   37619   if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
   37620     return SDValue();
   37621 
   37622   SDValue N00 = N0.getOperand(0);
   37623   EVT SclVT = N0.getOperand(0).getValueType();
   37624   if (!SclVT.isScalarInteger())
   37625     return SDValue();
   37626 
   37627   SDLoc DL(N);
   37628   SDValue Vec;
   37629   SmallVector<int, 32> ShuffleMask;
   37630   unsigned NumElts = VT.getVectorNumElements();
   37631   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
   37632 
   37633   // Broadcast the scalar integer to the vector elements.
   37634   if (NumElts > EltSizeInBits) {
   37635     // If the scalar integer is greater than the vector element size, then we
   37636     // must split it down into sub-sections for broadcasting. For example:
   37637     //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
   37638     //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
   37639     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
   37640     unsigned Scale = NumElts / EltSizeInBits;
   37641     EVT BroadcastVT =
   37642         EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
   37643     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
   37644     Vec = DAG.getBitcast(VT, Vec);
   37645 
   37646     for (unsigned i = 0; i != Scale; ++i)
   37647       ShuffleMask.append(EltSizeInBits, i);
   37648   } else {
    37649     // For a smaller scalar integer, we can simply any-extend it to the vector
    37650     // element size (we don't care about the upper bits) and broadcast it to all
    37651     // elements.
   37652     SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
   37653     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
   37654     ShuffleMask.append(NumElts, 0);
   37655   }
   37656   Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
   37657 
   37658   // Now, mask the relevant bit in each element.
   37659   SmallVector<SDValue, 32> Bits;
   37660   for (unsigned i = 0; i != NumElts; ++i) {
   37661     int BitIdx = (i % EltSizeInBits);
   37662     APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
   37663     Bits.push_back(DAG.getConstant(Bit, DL, SVT));
   37664   }
   37665   SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
   37666   Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
   37667 
   37668   // Compare against the bitmask and extend the result.
   37669   EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
   37670   Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
   37671   Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
   37672 
   37673   // For SEXT, this is now done, otherwise shift the result down for
   37674   // zero-extension.
   37675   if (Opcode == ISD::SIGN_EXTEND)
   37676     return Vec;
   37677   return DAG.getNode(ISD::SRL, DL, VT, Vec,
   37678                      DAG.getConstant(EltSizeInBits - 1, DL, VT));
   37679 }
   37680 
    37681 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
    37682 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting the input (or
    37683 /// concatenating it with UNDEFs) into vectors of the same size as the target
    37684 /// type, which then extend only their lowest elements.
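          /// For example, on an SSE4.1 target without AVX2, (v4i64 sext (v4i32 x))
          /// is split into two 128-bit SIGN_EXTEND_VECTOR_INREG nodes (one per
          /// v2i32 half, each widened with UNDEFs) whose results are concatenated.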
   37685 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
   37686                                           TargetLowering::DAGCombinerInfo &DCI,
   37687                                           const X86Subtarget &Subtarget) {
   37688   unsigned Opcode = N->getOpcode();
   37689   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
   37690     return SDValue();
   37691   if (!DCI.isBeforeLegalizeOps())
   37692     return SDValue();
   37693   if (!Subtarget.hasSSE2())
   37694     return SDValue();
   37695 
   37696   SDValue N0 = N->getOperand(0);
   37697   EVT VT = N->getValueType(0);
   37698   EVT SVT = VT.getScalarType();
   37699   EVT InVT = N0.getValueType();
   37700   EVT InSVT = InVT.getScalarType();
   37701 
   37702   // Input type must be a vector and we must be extending legal integer types.
   37703   if (!VT.isVector())
   37704     return SDValue();
   37705   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
   37706     return SDValue();
   37707   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
   37708     return SDValue();
   37709 
   37710   // On AVX2+ targets, if the input/output types are both legal then we will be
   37711   // able to use SIGN_EXTEND/ZERO_EXTEND directly.
   37712   if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
   37713       DAG.getTargetLoweringInfo().isTypeLegal(InVT))
   37714     return SDValue();
   37715 
   37716   SDLoc DL(N);
   37717 
   37718   auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
   37719     EVT InVT = N.getValueType();
   37720     EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
   37721                                  Size / InVT.getScalarSizeInBits());
   37722     SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
   37723                                   DAG.getUNDEF(InVT));
   37724     Opnds[0] = N;
   37725     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
   37726   };
   37727 
    37728   // If the target size is less than 128 bits, widen the input to a type that
    37729   // would extend to 128 bits, extend that, and extract the original target vector.
   37730   if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
   37731     unsigned Scale = 128 / VT.getSizeInBits();
   37732     EVT ExVT =
   37733         EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
   37734     SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
   37735     SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
   37736     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
   37737                        DAG.getIntPtrConstant(0, DL));
   37738   }
   37739 
   37740   // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
   37741   // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
    37742   // Also use this if we don't have SSE41 to allow the legalizer to do its job.
   37743   if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
   37744       (VT.is256BitVector() && Subtarget.hasInt256()) ||
   37745       (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
   37746     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
   37747     return Opcode == ISD::SIGN_EXTEND
   37748                ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
   37749                : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
   37750   }
   37751 
   37752   auto SplitAndExtendInReg = [&](unsigned SplitSize) {
   37753     unsigned NumVecs = VT.getSizeInBits() / SplitSize;
   37754     unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
   37755     EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
   37756     EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
   37757 
   37758     SmallVector<SDValue, 8> Opnds;
   37759     for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
   37760       SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
   37761                                    DAG.getIntPtrConstant(Offset, DL));
   37762       SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
   37763       SrcVec = Opcode == ISD::SIGN_EXTEND
   37764                    ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
   37765                    : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
   37766       Opnds.push_back(SrcVec);
   37767     }
   37768     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
   37769   };
   37770 
   37771   // On pre-AVX2 targets, split into 128-bit nodes of
   37772   // ISD::*_EXTEND_VECTOR_INREG.
   37773   if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
   37774     return SplitAndExtendInReg(128);
   37775 
   37776   // On pre-AVX512 targets, split into 256-bit nodes of
   37777   // ISD::*_EXTEND_VECTOR_INREG.
   37778   if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
   37779     return SplitAndExtendInReg(256);
   37780 
   37781   return SDValue();
   37782 }
   37783 
   37784 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
   37785 // result type.
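          // For example, with AVX512, (v8i32 sext (setcc (v8i32 a, v8i32 b, SETGT)))
          // can simply become (v8i32 setcc (a, b, SETGT)); the zext case additionally
          // masks each lane back down to 0 or 1.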
   37786 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
   37787                                const X86Subtarget &Subtarget) {
   37788   SDValue N0 = N->getOperand(0);
   37789   EVT VT = N->getValueType(0);
   37790   SDLoc dl(N);
   37791 
   37792   // Only do this combine with AVX512 for vector extends.
   37793   if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
   37794     return SDValue();
   37795 
   37796   // Only combine legal element types.
   37797   EVT SVT = VT.getVectorElementType();
   37798   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
   37799       SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
   37800     return SDValue();
   37801 
    37802   // We can only do this if the vector size is 256 bits or less.
   37803   unsigned Size = VT.getSizeInBits();
   37804   if (Size > 256)
   37805     return SDValue();
   37806 
   37807   // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
    37808   // those are the only integer compares we have.
   37809   ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
   37810   if (ISD::isUnsignedIntSetCC(CC))
   37811     return SDValue();
   37812 
   37813   // Only do this combine if the extension will be fully consumed by the setcc.
   37814   EVT N00VT = N0.getOperand(0).getValueType();
   37815   EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
   37816   if (Size != MatchingVecType.getSizeInBits())
   37817     return SDValue();
   37818 
   37819   SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
   37820 
   37821   if (N->getOpcode() == ISD::ZERO_EXTEND)
   37822     Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
   37823 
   37824   return Res;
   37825 }
   37826 
   37827 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
   37828                            TargetLowering::DAGCombinerInfo &DCI,
   37829                            const X86Subtarget &Subtarget) {
   37830   SDValue N0 = N->getOperand(0);
   37831   EVT VT = N->getValueType(0);
   37832   EVT InVT = N0.getValueType();
   37833   SDLoc DL(N);
   37834 
   37835   if (SDValue DivRem8 = getDivRem8(N, DAG))
   37836     return DivRem8;
   37837 
   37838   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
   37839     return NewCMov;
   37840 
   37841   if (!DCI.isBeforeLegalizeOps())
   37842     return SDValue();
   37843 
   37844   if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
   37845     return V;
   37846 
   37847   if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
   37848       isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    37849     // Inverting and sign-extending a boolean is the same as zero-extending it and
    37850     // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
   37851     // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
   37852     // sext (xor Bool, -1) --> sub (zext Bool), 1
   37853     SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
   37854     return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
   37855   }
   37856 
   37857   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
   37858     return V;
   37859 
   37860   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
   37861     return V;
   37862 
   37863   if (VT.isVector())
   37864     if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
   37865       return R;
   37866 
   37867   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
   37868     return NewAdd;
   37869 
   37870   return SDValue();
   37871 }
   37872 
   37873 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
   37874   if (NegMul) {
   37875     switch (Opcode) {
   37876     default: llvm_unreachable("Unexpected opcode");
   37877     case ISD::FMA:             Opcode = X86ISD::FNMADD;       break;
   37878     case X86ISD::FMADD_RND:    Opcode = X86ISD::FNMADD_RND;   break;
   37879     case X86ISD::FMSUB:        Opcode = X86ISD::FNMSUB;       break;
   37880     case X86ISD::FMSUB_RND:    Opcode = X86ISD::FNMSUB_RND;   break;
   37881     case X86ISD::FNMADD:       Opcode = ISD::FMA;             break;
   37882     case X86ISD::FNMADD_RND:   Opcode = X86ISD::FMADD_RND;    break;
   37883     case X86ISD::FNMSUB:       Opcode = X86ISD::FMSUB;        break;
   37884     case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FMSUB_RND;    break;
   37885     }
   37886   }
   37887 
   37888   if (NegAcc) {
   37889     switch (Opcode) {
   37890     default: llvm_unreachable("Unexpected opcode");
   37891     case ISD::FMA:             Opcode = X86ISD::FMSUB;        break;
   37892     case X86ISD::FMADD_RND:    Opcode = X86ISD::FMSUB_RND;    break;
   37893     case X86ISD::FMSUB:        Opcode = ISD::FMA;             break;
   37894     case X86ISD::FMSUB_RND:    Opcode = X86ISD::FMADD_RND;    break;
   37895     case X86ISD::FNMADD:       Opcode = X86ISD::FNMSUB;       break;
   37896     case X86ISD::FNMADD_RND:   Opcode = X86ISD::FNMSUB_RND;   break;
   37897     case X86ISD::FNMSUB:       Opcode = X86ISD::FNMADD;       break;
   37898     case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FNMADD_RND;   break;
   37899     }
   37900   }
   37901 
   37902   return Opcode;
   37903 }
   37904 
   37905 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
   37906                           const X86Subtarget &Subtarget) {
   37907   SDLoc dl(N);
   37908   EVT VT = N->getValueType(0);
   37909 
   37910   // Let legalize expand this if it isn't a legal type yet.
   37911   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
   37912     return SDValue();
   37913 
   37914   EVT ScalarVT = VT.getScalarType();
   37915   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
   37916     return SDValue();
   37917 
   37918   SDValue A = N->getOperand(0);
   37919   SDValue B = N->getOperand(1);
   37920   SDValue C = N->getOperand(2);
   37921 
   37922   auto invertIfNegative = [&DAG](SDValue &V) {
   37923     if (SDValue NegVal = isFNEG(V.getNode())) {
   37924       V = DAG.getBitcast(V.getValueType(), NegVal);
   37925       return true;
   37926     }
   37927     // Look through extract_vector_elts. If it comes from an FNEG, create a
   37928     // new extract from the FNEG input.
   37929     if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   37930         isNullConstant(V.getOperand(1))) {
   37931       if (SDValue NegVal = isFNEG(V.getOperand(0).getNode())) {
   37932         NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
   37933         V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
   37934                         NegVal, V.getOperand(1));
   37935         return true;
   37936       }
   37937     }
   37938 
   37939     return false;
   37940   };
   37941 
   37942   // Do not convert the passthru input of scalar intrinsics.
   37943   // FIXME: We could allow negations of the lower element only.
   37944   bool NegA = invertIfNegative(A);
   37945   bool NegB = invertIfNegative(B);
   37946   bool NegC = invertIfNegative(C);
   37947 
   37948   if (!NegA && !NegB && !NegC)
   37949     return SDValue();
   37950 
   37951   unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
   37952 
   37953   if (N->getNumOperands() == 4)
   37954     return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
   37955   return DAG.getNode(NewOpcode, dl, VT, A, B, C);
   37956 }
   37957 
   37958 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
   37959 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
   37960                                const X86Subtarget &Subtarget) {
   37961   SDLoc dl(N);
   37962   EVT VT = N->getValueType(0);
   37963 
   37964   SDValue NegVal = isFNEG(N->getOperand(2).getNode());
   37965   if (!NegVal)
   37966     return SDValue();
   37967 
   37968   unsigned NewOpcode;
   37969   switch (N->getOpcode()) {
   37970   default: llvm_unreachable("Unexpected opcode!");
   37971   case X86ISD::FMADDSUB:     NewOpcode = X86ISD::FMSUBADD;     break;
   37972   case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
   37973   case X86ISD::FMSUBADD:     NewOpcode = X86ISD::FMADDSUB;     break;
   37974   case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
   37975   }
   37976 
   37977   if (N->getNumOperands() == 4)
   37978     return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
   37979                        NegVal, N->getOperand(3));
   37980   return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
   37981                      NegVal);
   37982 }
   37983 
   37984 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
   37985                            TargetLowering::DAGCombinerInfo &DCI,
   37986                            const X86Subtarget &Subtarget) {
   37987   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
   37988   //           (and (i32 x86isd::setcc_carry), 1)
   37989   // This eliminates the zext. This transformation is necessary because
   37990   // ISD::SETCC is always legalized to i8.
   37991   SDLoc dl(N);
   37992   SDValue N0 = N->getOperand(0);
   37993   EVT VT = N->getValueType(0);
   37994 
   37995   if (N0.getOpcode() == ISD::AND &&
   37996       N0.hasOneUse() &&
   37997       N0.getOperand(0).hasOneUse()) {
   37998     SDValue N00 = N0.getOperand(0);
   37999     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   38000       if (!isOneConstant(N0.getOperand(1)))
   38001         return SDValue();
   38002       return DAG.getNode(ISD::AND, dl, VT,
   38003                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   38004                                      N00.getOperand(0), N00.getOperand(1)),
   38005                          DAG.getConstant(1, dl, VT));
   38006     }
   38007   }
   38008 
   38009   if (N0.getOpcode() == ISD::TRUNCATE &&
   38010       N0.hasOneUse() &&
   38011       N0.getOperand(0).hasOneUse()) {
   38012     SDValue N00 = N0.getOperand(0);
   38013     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
   38014       return DAG.getNode(ISD::AND, dl, VT,
   38015                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
   38016                                      N00.getOperand(0), N00.getOperand(1)),
   38017                          DAG.getConstant(1, dl, VT));
   38018     }
   38019   }
   38020 
   38021   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
   38022     return NewCMov;
   38023 
   38024   if (DCI.isBeforeLegalizeOps())
   38025     if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
   38026       return V;
   38027 
   38028   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
   38029     return V;
   38030 
   38031   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
   38032     return V;
   38033 
   38034   if (VT.isVector())
   38035     if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
   38036       return R;
   38037 
   38038   if (SDValue DivRem8 = getDivRem8(N, DAG))
   38039     return DivRem8;
   38040 
   38041   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
   38042     return NewAdd;
   38043 
   38044   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
   38045     return R;
   38046 
   38047   return SDValue();
   38048 }
   38049 
   38050 /// Try to map a 128-bit or larger integer comparison to vector instructions
   38051 /// before type legalization splits it up into chunks.
   38052 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
   38053                                                const X86Subtarget &Subtarget) {
   38054   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
   38055   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
   38056 
   38057   // We're looking for an oversized integer equality comparison.
   38058   SDValue X = SetCC->getOperand(0);
   38059   SDValue Y = SetCC->getOperand(1);
   38060   EVT OpVT = X.getValueType();
   38061   unsigned OpSize = OpVT.getSizeInBits();
   38062   if (!OpVT.isScalarInteger() || OpSize < 128)
   38063     return SDValue();
   38064 
   38065   // Ignore a comparison with zero because that gets special treatment in
   38066   // EmitTest(). But make an exception for the special case of a pair of
   38067   // logically-combined vector-sized operands compared to zero. This pattern may
   38068   // be generated by the memcmp expansion pass with oversized integer compares
   38069   // (see PR33325).
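            // For example, the expansion of memcmp(a, b, 32) == 0 may produce
            //   setcc (or (xor (i128 load a), (i128 load b)),
            //             (xor (i128 load a+16), (i128 load b+16))), 0, eq
            // which is the or-of-xors form matched here.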
   38070   bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
   38071                           X.getOperand(0).getOpcode() == ISD::XOR &&
   38072                           X.getOperand(1).getOpcode() == ISD::XOR;
   38073   if (isNullConstant(Y) && !IsOrXorXorCCZero)
   38074     return SDValue();
   38075 
   38076   // Bail out if we know that this is not really just an oversized integer.
   38077   if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
   38078       peekThroughBitcasts(Y).getValueType() == MVT::f128)
   38079     return SDValue();
   38080 
   38081   // TODO: Use PXOR + PTEST for SSE4.1 or later?
   38082   // TODO: Add support for AVX-512.
   38083   EVT VT = SetCC->getValueType(0);
   38084   SDLoc DL(SetCC);
   38085   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
   38086       (OpSize == 256 && Subtarget.hasAVX2())) {
   38087     EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
   38088     SDValue Cmp;
   38089     if (IsOrXorXorCCZero) {
   38090       // This is a bitwise-combined equality comparison of 2 pairs of vectors:
   38091       // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
   38092       // Use 2 vector equality compares and 'and' the results before doing a
   38093       // MOVMSK.
   38094       SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
   38095       SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
   38096       SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
   38097       SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
   38098       SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
   38099       SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
   38100       Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
   38101     } else {
   38102       SDValue VecX = DAG.getBitcast(VecVT, X);
   38103       SDValue VecY = DAG.getBitcast(VecVT, Y);
   38104       Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
   38105     }
   38106     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
   38107     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
   38108     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
   38109     // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
   38110     // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
   38111     SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
   38112     SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
   38113                                     MVT::i32);
   38114     return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
   38115   }
   38116 
   38117   return SDValue();
   38118 }
   38119 
   38120 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
   38121                             const X86Subtarget &Subtarget) {
   38122   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
   38123   SDValue LHS = N->getOperand(0);
   38124   SDValue RHS = N->getOperand(1);
   38125   EVT VT = N->getValueType(0);
   38126   EVT OpVT = LHS.getValueType();
   38127   SDLoc DL(N);
   38128 
   38129   if (CC == ISD::SETNE || CC == ISD::SETEQ) {
   38130     // 0-x == y --> x+y == 0
   38131     // 0-x != y --> x+y != 0
   38132     if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
   38133         LHS.hasOneUse()) {
   38134       SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
   38135       return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
   38136     }
   38137     // x == 0-y --> x+y == 0
   38138     // x != 0-y --> x+y != 0
   38139     if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
   38140         RHS.hasOneUse()) {
   38141       SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
   38142       return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
   38143     }
   38144 
   38145     if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
   38146       return V;
   38147   }
   38148 
   38149   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
   38150       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
   38151     // Put build_vectors on the right.
   38152     if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
   38153       std::swap(LHS, RHS);
   38154       CC = ISD::getSetCCSwappedOperands(CC);
   38155     }
   38156 
   38157     bool IsSEXT0 =
   38158         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
   38159         (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
   38160     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
   38161 
   38162     if (IsSEXT0 && IsVZero1) {
   38163       assert(VT == LHS.getOperand(0).getValueType() &&
    38164              "Unexpected operand type");
   38165       if (CC == ISD::SETGT)
   38166         return DAG.getConstant(0, DL, VT);
   38167       if (CC == ISD::SETLE)
   38168         return DAG.getConstant(1, DL, VT);
   38169       if (CC == ISD::SETEQ || CC == ISD::SETGE)
   38170         return DAG.getNOT(DL, LHS.getOperand(0), VT);
   38171 
   38172       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
   38173              "Unexpected condition code!");
   38174       return LHS.getOperand(0);
   38175     }
   38176   }
   38177 
   38178   // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
   38179   // pre-promote its result type since vXi1 vectors don't get promoted
   38180   // during type legalization.
   38181   // NOTE: The element count check is to ignore operand types that need to
   38182   // go through type promotion to a 128-bit vector.
   38183   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
   38184       VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
   38185       (OpVT.getVectorElementType() == MVT::i8 ||
   38186        OpVT.getVectorElementType() == MVT::i16)) {
   38187     SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
   38188                                 N->getOperand(2));
   38189     return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
   38190   }
   38191 
   38192   // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
   38193   // to avoid scalarization via legalization because v4i32 is not a legal type.
   38194   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
   38195       LHS.getValueType() == MVT::v4f32)
   38196     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
   38197 
   38198   return SDValue();
   38199 }
   38200 
   38201 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
   38202                              TargetLowering::DAGCombinerInfo &DCI) {
   38203   SDValue Src = N->getOperand(0);
   38204   MVT SrcVT = Src.getSimpleValueType();
   38205 
   38206   // Perform constant folding.
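            // For example, (movmsk (v4i32 <-1, 0, -1, 0>)) folds to the i32
            // constant 0b0101 (= 5), since only elements 0 and 2 have their
            // sign bit set.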
   38207   if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
   38208     assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
   38209     APInt Imm(32, 0);
   38210     for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
   38211       SDValue In = Src.getOperand(Idx);
   38212       if (!In.isUndef() &&
   38213           cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
   38214         Imm.setBit(Idx);
   38215     }
   38216     return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
   38217   }
   38218 
   38219   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   38220   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   38221                                         !DCI.isBeforeLegalizeOps());
   38222 
   38223   // MOVMSK only uses the MSB from each vector element.
   38224   KnownBits Known;
   38225   APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
   38226   if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
   38227     DCI.AddToWorklist(Src.getNode());
   38228     DCI.CommitTargetLoweringOpt(TLO);
   38229     return SDValue(N, 0);
   38230   }
   38231 
   38232   return SDValue();
   38233 }
   38234 
   38235 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
   38236                                     TargetLowering::DAGCombinerInfo &DCI,
   38237                                     const X86Subtarget &Subtarget) {
   38238   SDLoc DL(N);
   38239 
   38240   if (DCI.isBeforeLegalizeOps()) {
   38241     SDValue Index = N->getOperand(4);
   38242     // Remove any sign extends from 32 or smaller to larger than 32.
   38243     // Only do this before LegalizeOps in case we need the sign extend for
   38244     // legalization.
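              // For example, an index of (v4i64 sign_extend (v4i32 X)) can be
              // replaced by X itself: the x86 gather/scatter forms with dword
              // indices sign-extend each 32-bit index when forming the address.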
   38245     if (Index.getOpcode() == ISD::SIGN_EXTEND) {
   38246       if (Index.getScalarValueSizeInBits() > 32 &&
   38247           Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
   38248         SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
   38249         NewOps[4] = Index.getOperand(0);
   38250         SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
   38251         if (Res == N) {
    38252           // The original sign extend has fewer users; add it back to the
    38253           // worklist in case it needs to be removed.
   38254           DCI.AddToWorklist(Index.getNode());
   38255           DCI.AddToWorklist(N);
   38256         }
   38257         return SDValue(Res, 0);
   38258       }
   38259     }
   38260 
   38261     // Make sure the index is either i32 or i64
   38262     unsigned ScalarSize = Index.getScalarValueSizeInBits();
   38263     if (ScalarSize != 32 && ScalarSize != 64) {
   38264       MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
   38265       EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
   38266                                    Index.getValueType().getVectorNumElements());
   38267       Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
   38268       SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
   38269       NewOps[4] = Index;
   38270       SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
   38271       if (Res == N)
   38272         DCI.AddToWorklist(N);
   38273       return SDValue(Res, 0);
   38274     }
   38275 
   38276     // Try to remove zero extends from 32->64 if we know the sign bit of
   38277     // the input is zero.
   38278     if (Index.getOpcode() == ISD::ZERO_EXTEND &&
   38279         Index.getScalarValueSizeInBits() == 64 &&
   38280         Index.getOperand(0).getScalarValueSizeInBits() == 32) {
   38281       if (DAG.SignBitIsZero(Index.getOperand(0))) {
   38282         SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
   38283         NewOps[4] = Index.getOperand(0);
   38284         SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
   38285         if (Res == N) {
    38286           // The original zero extend has fewer users; add it back to the
    38287           // worklist in case it needs to be removed.
   38288           DCI.AddToWorklist(Index.getNode());
   38289           DCI.AddToWorklist(N);
   38290         }
   38291         return SDValue(Res, 0);
   38292       }
   38293     }
   38294   }
   38295 
   38296   // With AVX2 we only demand the upper bit of the mask.
   38297   if (!Subtarget.hasAVX512()) {
   38298     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   38299     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   38300                                           !DCI.isBeforeLegalizeOps());
   38301     SDValue Mask = N->getOperand(2);
   38302     KnownBits Known;
   38303     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
   38304     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
   38305       DCI.AddToWorklist(Mask.getNode());
   38306       DCI.CommitTargetLoweringOpt(TLO);
   38307       return SDValue(N, 0);
   38308     }
   38309   }
   38310 
   38311   return SDValue();
   38312 }
   38313 
   38314 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
   38315 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
   38316                                const X86Subtarget &Subtarget) {
   38317   SDLoc DL(N);
   38318   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
   38319   SDValue EFLAGS = N->getOperand(1);
   38320 
   38321   // Try to simplify the EFLAGS and condition code operands.
   38322   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
   38323     return getSETCC(CC, Flags, DL, DAG);
   38324 
   38325   return SDValue();
   38326 }
   38327 
   38328 /// Optimize branch condition evaluation.
   38329 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
   38330                              const X86Subtarget &Subtarget) {
   38331   SDLoc DL(N);
   38332   SDValue EFLAGS = N->getOperand(3);
   38333   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
   38334 
   38335   // Try to simplify the EFLAGS and condition code operands.
   38336   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
   38337   // RAUW them under us.
   38338   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
   38339     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
   38340     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
   38341                        N->getOperand(1), Cond, Flags);
   38342   }
   38343 
   38344   return SDValue();
   38345 }
   38346 
   38347 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
   38348                                                   SelectionDAG &DAG) {
   38349   // Take advantage of vector comparisons producing 0 or -1 in each lane to
   38350   // optimize away operation when it's from a constant.
   38351   //
   38352   // The general transformation is:
   38353   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
   38354   //       AND(VECTOR_CMP(x,y), constant2)
   38355   //    constant2 = UNARYOP(constant)
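            //
            // For example, with sint_to_fp as the unary op and a v4i32 splat of 1:
            //    sint_to_fp(AND(VECTOR_CMP(x,y), <1,1,1,1>)) -->
            //       bitcast(AND(VECTOR_CMP(x,y), bitcast(<1.0,1.0,1.0,1.0>)))
            // because each compare lane is either all-zeros or all-ones.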
   38356 
   38357   // Early exit if this isn't a vector operation, the operand of the
   38358   // unary operation isn't a bitwise AND, or if the sizes of the operations
   38359   // aren't the same.
   38360   EVT VT = N->getValueType(0);
   38361   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
   38362       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
   38363       VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
   38364     return SDValue();
   38365 
   38366   // Now check that the other operand of the AND is a constant. We could
   38367   // make the transformation for non-constant splats as well, but it's unclear
   38368   // that would be a benefit as it would not eliminate any operations, just
   38369   // perform one more step in scalar code before moving to the vector unit.
   38370   if (BuildVectorSDNode *BV =
   38371           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
   38372     // Bail out if the vector isn't a constant.
   38373     if (!BV->isConstant())
   38374       return SDValue();
   38375 
   38376     // Everything checks out. Build up the new and improved node.
   38377     SDLoc DL(N);
   38378     EVT IntVT = BV->getValueType(0);
   38379     // Create a new constant of the appropriate type for the transformed
   38380     // DAG.
   38381     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
   38382     // The AND node needs bitcasts to/from an integer vector type around it.
   38383     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
   38384     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
   38385                                  N->getOperand(0)->getOperand(0), MaskConst);
   38386     SDValue Res = DAG.getBitcast(VT, NewAnd);
   38387     return Res;
   38388   }
   38389 
   38390   return SDValue();
   38391 }
   38392 
   38393 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
   38394                                const X86Subtarget &Subtarget) {
   38395   SDValue Op0 = N->getOperand(0);
   38396   EVT VT = N->getValueType(0);
   38397   EVT InVT = Op0.getValueType();
   38398 
   38399   // UINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
   38400   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
   38401   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
   38402   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
   38403     SDLoc dl(N);
   38404     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   38405                                  InVT.getVectorNumElements());
   38406     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
   38407 
   38408     // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
   38409     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
   38410   }
   38411 
   38412   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
   38413   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
   38414   // the optimization here.
   38415   if (DAG.SignBitIsZero(Op0))
   38416     return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
   38417 
   38418   return SDValue();
   38419 }
   38420 
   38421 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
   38422                                const X86Subtarget &Subtarget) {
   38423   // First try to optimize away the conversion entirely when it's
   38424   // conditionally from a constant. Vectors only.
   38425   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
   38426     return Res;
   38427 
   38428   // Now move on to more general possibilities.
   38429   SDValue Op0 = N->getOperand(0);
   38430   EVT VT = N->getValueType(0);
   38431   EVT InVT = Op0.getValueType();
   38432 
   38433   // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
   38434   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
   38435   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
   38436   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
   38437     SDLoc dl(N);
   38438     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   38439                                  InVT.getVectorNumElements());
   38440     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
   38441     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
   38442   }
   38443 
   38444   // Without AVX512DQ we only support i64 to float scalar conversion. For both
   38445   // vectors and scalars, see if we know that the upper bits are all the sign
   38446   // bit, in which case we can truncate the input to i32 and convert from that.
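            // For example, (sint_to_fp (i64 (sext i32 X))) has at least 33 sign
            // bits, so it can become (sint_to_fp (i32 X)).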
   38447   if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
   38448     unsigned BitWidth = InVT.getScalarSizeInBits();
   38449     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
   38450     if (NumSignBits >= (BitWidth - 31)) {
   38451       EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
   38452       if (InVT.isVector())
   38453         TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
   38454                                    InVT.getVectorNumElements());
   38455       SDLoc dl(N);
   38456       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
   38457       return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
   38458     }
   38459   }
   38460 
   38461   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
   38462   // a 32-bit target where SSE doesn't support i64->FP operations.
   38463   if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
   38464     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
   38465     EVT LdVT = Ld->getValueType(0);
   38466 
   38467     // This transformation is not supported if the result type is f16 or f128.
   38468     if (VT == MVT::f16 || VT == MVT::f128)
   38469       return SDValue();
   38470 
   38471     // If we have AVX512DQ we can use packed conversion instructions unless
   38472     // the VT is f80.
   38473     if (Subtarget.hasDQI() && VT != MVT::f80)
   38474       return SDValue();
   38475 
   38476     if (!Ld->isVolatile() && !VT.isVector() &&
   38477         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
   38478         !Subtarget.is64Bit() && LdVT == MVT::i64) {
   38479       SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
   38480           SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
   38481       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
   38482       return FILDChain;
   38483     }
   38484   }
   38485   return SDValue();
   38486 }
   38487 
   38488 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
   38489   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
   38490     MVT VT = N->getSimpleValueType(0);
   38491     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   38492     return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
   38493                        N->getOperand(0), N->getOperand(1),
   38494                        Flags);
   38495   }
   38496 
   38497   return SDValue();
   38498 }
   38499 
   38500 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
   38501 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
   38502                           TargetLowering::DAGCombinerInfo &DCI) {
   38503   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
   38504   // the result is either zero or one (depending on the input carry bit).
   38505   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
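            // For example, (adc 0, 0, EFLAGS) becomes
            //   (and (setcc_carry COND_B, EFLAGS), 1)
            // i.e. 1 if the carry flag is set and 0 otherwise, with the carry-out
            // result replaced by the constant 0.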
   38506   if (X86::isZeroNode(N->getOperand(0)) &&
   38507       X86::isZeroNode(N->getOperand(1)) &&
   38508       // We don't have a good way to replace an EFLAGS use, so only do this when
   38509       // dead right now.
   38510       SDValue(N, 1).use_empty()) {
   38511     SDLoc DL(N);
   38512     EVT VT = N->getValueType(0);
   38513     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
   38514     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
   38515                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   38516                                            DAG.getConstant(X86::COND_B, DL,
   38517                                                            MVT::i8),
   38518                                            N->getOperand(2)),
   38519                                DAG.getConstant(1, DL, VT));
   38520     return DCI.CombineTo(N, Res1, CarryOut);
   38521   }
   38522 
   38523   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
   38524     MVT VT = N->getSimpleValueType(0);
   38525     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   38526     return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
   38527                        N->getOperand(0), N->getOperand(1),
   38528                        Flags);
   38529   }
   38530 
   38531   return SDValue();
   38532 }
   38533 
   38534 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
   38535 /// which is more useful than 0/1 in some cases.
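          /// For example, "sbb %eax, %eax" yields 0 when CF is clear and -1 (all
          /// ones) when CF is set; an 'and' with 1 recovers the plain 0/1 setb
          /// value when an i8 result is required.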
   38536 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
   38537   SDLoc DL(N);
   38538   // "Condition code B" is also known as "the carry flag" (CF).
   38539   SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
   38540   SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
   38541   MVT VT = N->getSimpleValueType(0);
   38542   if (VT == MVT::i8)
   38543     return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
   38544 
   38545   assert(VT == MVT::i1 && "Unexpected type for SETCC node");
   38546   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
   38547 }
   38548 
   38549 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
   38550 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
   38551 /// with CMP+{ADC, SBB}.
   38552 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
   38553   bool IsSub = N->getOpcode() == ISD::SUB;
   38554   SDValue X = N->getOperand(0);
   38555   SDValue Y = N->getOperand(1);
   38556 
   38557   // If this is an add, canonicalize a zext operand to the RHS.
   38558   // TODO: Incomplete? What if both sides are zexts?
   38559   if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
   38560       Y.getOpcode() != ISD::ZERO_EXTEND)
   38561     std::swap(X, Y);
   38562 
   38563   // Look through a one-use zext.
   38564   bool PeekedThroughZext = false;
   38565   if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
   38566     Y = Y.getOperand(0);
   38567     PeekedThroughZext = true;
   38568   }
   38569 
   38570   // If this is an add, canonicalize a setcc operand to the RHS.
   38571   // TODO: Incomplete? What if both sides are setcc?
   38572   // TODO: Should we allow peeking through a zext of the other operand?
   38573   if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
   38574       Y.getOpcode() != X86ISD::SETCC)
   38575     std::swap(X, Y);
   38576 
   38577   if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
   38578     return SDValue();
   38579 
   38580   SDLoc DL(N);
   38581   EVT VT = N->getValueType(0);
   38582   X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
   38583 
   38584   // If X is -1 or 0, then we have an opportunity to avoid constants required in
   38585   // the general case below.
   38586   auto *ConstantX = dyn_cast<ConstantSDNode>(X);
   38587   if (ConstantX) {
   38588     if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
   38589         (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
   38590       // This is a complicated way to get -1 or 0 from the carry flag:
   38591       // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
   38592       //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
   38593       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   38594                          DAG.getConstant(X86::COND_B, DL, MVT::i8),
   38595                          Y.getOperand(1));
   38596     }
   38597 
   38598     if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
   38599         (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
   38600       SDValue EFLAGS = Y->getOperand(1);
   38601       if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
   38602           EFLAGS.getValueType().isInteger() &&
   38603           !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
   38604         // Swap the operands of a SUB, and we have the same pattern as above.
   38605         // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
   38606         //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
   38607         SDValue NewSub = DAG.getNode(
   38608             X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
   38609             EFLAGS.getOperand(1), EFLAGS.getOperand(0));
   38610         SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
   38611         return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   38612                            DAG.getConstant(X86::COND_B, DL, MVT::i8),
   38613                            NewEFLAGS);
   38614       }
   38615     }
   38616   }
   38617 
   38618   if (CC == X86::COND_B) {
   38619     // X + SETB Z --> X + (mask SBB Z, Z)
   38620     // X - SETB Z --> X - (mask SBB Z, Z)
   38621     // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
   38622     SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
   38623     if (SBB.getValueSizeInBits() != VT.getSizeInBits())
   38624       SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
   38625     return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
   38626   }
   38627 
   38628   if (CC == X86::COND_A) {
   38629     SDValue EFLAGS = Y->getOperand(1);
   38630     // Try to convert COND_A into COND_B in an attempt to facilitate
   38631     // materializing "setb reg".
   38632     //
   38633     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
   38634     // cannot take an immediate as its first operand.
   38635     //
   38636     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
   38637         EFLAGS.getValueType().isInteger() &&
   38638         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
   38639       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
   38640                                    EFLAGS.getNode()->getVTList(),
   38641                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
   38642       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
   38643       SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
   38644       if (SBB.getValueSizeInBits() != VT.getSizeInBits())
   38645         SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
   38646       return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
   38647     }
   38648   }
   38649 
   38650   if (CC != X86::COND_E && CC != X86::COND_NE)
   38651     return SDValue();
   38652 
   38653   SDValue Cmp = Y.getOperand(1);
   38654   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
   38655       !X86::isZeroNode(Cmp.getOperand(1)) ||
   38656       !Cmp.getOperand(0).getValueType().isInteger())
   38657     return SDValue();
   38658 
   38659   SDValue Z = Cmp.getOperand(0);
   38660   EVT ZVT = Z.getValueType();
   38661 
   38662   // If X is -1 or 0, then we have an opportunity to avoid constants required in
   38663   // the general case below.
   38664   if (ConstantX) {
   38665     // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
   38666     // fake operands:
   38667     //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
   38668     // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
   38669     if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
   38670         (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
   38671       SDValue Zero = DAG.getConstant(0, DL, ZVT);
   38672       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
   38673       SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
   38674       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   38675                          DAG.getConstant(X86::COND_B, DL, MVT::i8),
   38676                          SDValue(Neg.getNode(), 1));
   38677     }
   38678 
   38679     // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
   38680     // with fake operands:
   38681     //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
   38682     // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
   38683     if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
   38684         (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
   38685       SDValue One = DAG.getConstant(1, DL, ZVT);
   38686       SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
   38687       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
   38688                          DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
   38689     }
   38690   }
   38691 
   38692   // (cmp Z, 1) sets the carry flag if Z is 0.
   38693   SDValue One = DAG.getConstant(1, DL, ZVT);
   38694   SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
   38695 
   38696   // Add the flags type for ADC/SBB nodes.
   38697   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   38698 
   38699   // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
   38700   // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
   38701   if (CC == X86::COND_NE)
   38702     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
   38703                        DAG.getConstant(-1ULL, DL, VT), Cmp1);
   38704 
   38705   // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
   38706   // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
   38707   return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
   38708                      DAG.getConstant(0, DL, VT), Cmp1);
   38709 }
   38710 
   38711 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
   38712                                       const X86Subtarget &Subtarget) {
   38713   if (!Subtarget.hasSSE2())
   38714     return SDValue();
   38715 
   38716   SDValue MulOp = N->getOperand(0);
   38717   SDValue Phi = N->getOperand(1);
   38718 
   38719   if (MulOp.getOpcode() != ISD::MUL)
   38720     std::swap(MulOp, Phi);
   38721   if (MulOp.getOpcode() != ISD::MUL)
   38722     return SDValue();
   38723 
   38724   ShrinkMode Mode;
   38725   if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
   38726     return SDValue();
   38727 
   38728   EVT VT = N->getValueType(0);
   38729 
    38730   // Do not use PMADD if the result vector has fewer than 8 elements; wider
    38731   // vectors are split to the supported register size by SplitOpsAndApply.
   38732   if (VT.getVectorNumElements() < 8)
   38733     return SDValue();
   38734 
   38735   SDLoc DL(N);
   38736   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
   38737                                    VT.getVectorNumElements());
   38738   EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   38739                                 VT.getVectorNumElements() / 2);
   38740 
   38741   // Shrink the operands of mul.
   38742   SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
   38743   SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
   38744 
   38745   // Madd vector size is half of the original vector size
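            // VPMADDWD multiplies adjacent pairs of signed i16 elements and adds
            // each pair of products into a signed i32 result:
            //   Res[i] = A[2*i] * B[2*i] + A[2*i+1] * B[2*i+1]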
   38746   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   38747                            ArrayRef<SDValue> Ops) {
   38748     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
   38749     return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
   38750   };
   38751   SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
   38752                                   PMADDWDBuilder);
   38753   // Fill the rest of the output with 0
   38754   SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
   38755   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
   38756   return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
   38757 }
   38758 
   38759 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
   38760                                      const X86Subtarget &Subtarget) {
   38761   if (!Subtarget.hasSSE2())
   38762     return SDValue();
   38763 
   38764   SDLoc DL(N);
   38765   EVT VT = N->getValueType(0);
   38766   SDValue Op0 = N->getOperand(0);
   38767   SDValue Op1 = N->getOperand(1);
   38768 
   38769   // TODO: There's nothing special about i32, any integer type above i16 should
   38770   // work just as well.
   38771   if (!VT.isVector() || !VT.isSimple() ||
   38772       !(VT.getVectorElementType() == MVT::i32))
   38773     return SDValue();
   38774 
   38775   unsigned RegSize = 128;
   38776   if (Subtarget.useBWIRegs())
   38777     RegSize = 512;
   38778   else if (Subtarget.hasAVX())
   38779     RegSize = 256;
   38780 
   38781   // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
   38782   // TODO: We should be able to handle larger vectors by splitting them before
   38783   // feeding them into several SADs, and then reducing over those.
   38784   if (VT.getSizeInBits() / 4 > RegSize)
   38785     return SDValue();
   38786 
   38787   // We know N is a reduction add, which means one of its operands is a phi.
   38788   // To match SAD, we need the other operand to be a vector select.
   38789   SDValue SelectOp, Phi;
   38790   if (Op0.getOpcode() == ISD::VSELECT) {
   38791     SelectOp = Op0;
   38792     Phi = Op1;
   38793   } else if (Op1.getOpcode() == ISD::VSELECT) {
   38794     SelectOp = Op1;
   38795     Phi = Op0;
   38796   } else
   38797     return SDValue();
   38798 
   38799   // Check whether we have an abs-diff pattern feeding into the select.
    38800   if (!detectZextAbsDiff(SelectOp, Op0, Op1))
   38801     return SDValue();
   38802 
   38803   // SAD pattern detected. Now build a SAD instruction and an addition for
   38804   // reduction. Note that the number of elements of the result of SAD is less
    38805   // than the number of elements of its input. Therefore, we can only update
    38806   // part of the elements in the reduction vector.
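            // PSADBW computes, for each 8-byte group, the sum of absolute
            // differences of the unsigned bytes and produces it as an i64 element:
            //   Res[j] = |A[8j+0]-B[8j+0]| + ... + |A[8j+7]-B[8j+7]|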
   38807   SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
   38808 
   38809   // The output of PSADBW is a vector of i64.
   38810   // We need to turn the vector of i64 into a vector of i32.
   38811   // If the reduction vector is at least as wide as the psadbw result, just
   38812   // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
   38813   // anyway.
   38814   MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
   38815   if (VT.getSizeInBits() >= ResVT.getSizeInBits())
   38816     Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
   38817   else
   38818     Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
   38819 
   38820   if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
   38821     // Fill the upper elements with zero to match the add width.
   38822     SDValue Zero = DAG.getConstant(0, DL, VT);
   38823     Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
   38824                       DAG.getIntPtrConstant(0, DL));
   38825   }
   38826 
   38827   return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
   38828 }
   38829 
   38830 /// Convert vector increment or decrement to sub/add with an all-ones constant:
   38831 /// add X, <1, 1...> --> sub X, <-1, -1...>
   38832 /// sub X, <1, 1...> --> add X, <-1, -1...>
   38833 /// The all-ones vector constant can be materialized using a pcmpeq instruction
   38834 /// that is commonly recognized as an idiom (has no register dependency), so
   38835 /// that's better/smaller than loading a splat 1 constant.
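          /// For example, on SSE2:
          ///   (v4i32 add X, <1,1,1,1>)  -->  psubd X, (pcmpeqd %xmm, %xmm)
          /// since pcmpeqd of a register with itself materializes <-1,-1,-1,-1>.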
   38836 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
   38837   assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
   38838          "Unexpected opcode for increment/decrement transform");
   38839 
   38840   // Pseudo-legality check: getOnesVector() expects one of these types, so bail
   38841   // out and wait for legalization if we have an unsupported vector length.
   38842   EVT VT = N->getValueType(0);
   38843   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
   38844     return SDValue();
   38845 
   38846   SDNode *N1 = N->getOperand(1).getNode();
   38847   APInt SplatVal;
   38848   if (!ISD::isConstantSplatVector(N1, SplatVal) ||
   38849       !SplatVal.isOneValue())
   38850     return SDValue();
   38851 
   38852   SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
   38853   unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
   38854   return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
   38855 }
   38856 
   38857 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
   38858                             const SDLoc &DL, EVT VT,
   38859                             const X86Subtarget &Subtarget) {
   38860   // Example of pattern we try to detect:
   38861   // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
   38862   //(add (build_vector (extract_elt t, 0),
   38863   //                   (extract_elt t, 2),
   38864   //                   (extract_elt t, 4),
   38865   //                   (extract_elt t, 6)),
   38866   //     (build_vector (extract_elt t, 1),
   38867   //                   (extract_elt t, 3),
   38868   //                   (extract_elt t, 5),
   38869   //                   (extract_elt t, 7)))
   38870 
   38871   if (!Subtarget.hasSSE2())
   38872     return SDValue();
   38873 
   38874   if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
   38875       Op1.getOpcode() != ISD::BUILD_VECTOR)
   38876     return SDValue();
   38877 
   38878   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
   38879       VT.getVectorNumElements() < 4 ||
   38880       !isPowerOf2_32(VT.getVectorNumElements()))
   38881     return SDValue();
   38882 
   38883   // Check if one of Op0,Op1 is of the form:
   38884   // (build_vector (extract_elt Mul, 0),
   38885   //               (extract_elt Mul, 2),
   38886   //               (extract_elt Mul, 4),
   38887   //                   ...
   38888   // the other is of the form:
   38889   // (build_vector (extract_elt Mul, 1),
   38890   //               (extract_elt Mul, 3),
   38891   //               (extract_elt Mul, 5),
   38892   //                   ...
   38893   // and identify Mul.
   38894   SDValue Mul;
   38895   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
   38896     SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
   38897             Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
   38898     // TODO: Be more tolerant to undefs.
   38899     if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   38900         Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   38901         Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   38902         Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   38903       return SDValue();
   38904     auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
   38905     auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
   38906     auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
   38907     auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
   38908     if (!Const0L || !Const1L || !Const0H || !Const1H)
   38909       return SDValue();
   38910     unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
   38911              Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
   38912     // Commutativity of mul allows factors of a product to reorder.
   38913     if (Idx0L > Idx1L)
   38914       std::swap(Idx0L, Idx1L);
   38915     if (Idx0H > Idx1H)
   38916       std::swap(Idx0H, Idx1H);
   38917     // Commutativity of add allows pairs of factors to reorder.
   38918     if (Idx0L > Idx0H) {
   38919       std::swap(Idx0L, Idx0H);
   38920       std::swap(Idx1L, Idx1H);
   38921     }
   38922     if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
   38923         Idx1H != 2 * i + 3)
   38924       return SDValue();
   38925     if (!Mul) {
   38926       // First time an extract_elt's source vector is visited. Must be a MUL
    38927       // with twice as many vector elements as the BUILD_VECTOR.
    38928       // Both extracts must be from the same MUL.
   38929       Mul = Op0L->getOperand(0);
   38930       if (Mul->getOpcode() != ISD::MUL ||
   38931           Mul.getValueType().getVectorNumElements() != 2 * e)
   38932         return SDValue();
   38933     }
   38934     // Check that the extract is from the same MUL previously seen.
   38935     if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
   38936         Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
   38937       return SDValue();
   38938   }
   38939 
   38940   // Check if the Mul source can be safely shrunk.
   38941   ShrinkMode Mode;
   38942   if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
   38943     return SDValue();
   38944 
   38945   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   38946                          ArrayRef<SDValue> Ops) {
   38947     // Shrink by adding truncate nodes and let DAGCombine fold with the
   38948     // sources.
   38949     EVT InVT = Ops[0].getValueType();
   38950     assert(InVT.getScalarType() == MVT::i32 &&
   38951            "Unexpected scalar element type");
   38952     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
   38953     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   38954                                  InVT.getVectorNumElements() / 2);
   38955     EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
   38956                                    InVT.getVectorNumElements());
   38957     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
   38958                        DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
   38959                        DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
   38960   };
   38961   return SplitOpsAndApply(DAG, Subtarget, DL, VT,
   38962                           { Mul.getOperand(0), Mul.getOperand(1) },
   38963                           PMADDBuilder);
   38964 }
   38965 
   38966 // Attempt to turn this pattern into PMADDWD.
    38967 // (add (mul (sext (build_vector)), (sext (build_vector))),
    38968 //      (mul (sext (build_vector)), (sext (build_vector))))
   38969 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
   38970                               const SDLoc &DL, EVT VT,
   38971                               const X86Subtarget &Subtarget) {
   38972   if (!Subtarget.hasSSE2())
   38973     return SDValue();
   38974 
   38975   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
   38976     return SDValue();
   38977 
   38978   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
   38979       VT.getVectorNumElements() < 4 ||
   38980       !isPowerOf2_32(VT.getVectorNumElements()))
   38981     return SDValue();
   38982 
   38983   SDValue N00 = N0.getOperand(0);
   38984   SDValue N01 = N0.getOperand(1);
   38985   SDValue N10 = N1.getOperand(0);
   38986   SDValue N11 = N1.getOperand(1);
   38987 
   38988   // All inputs need to be sign extends.
   38989   // TODO: Support ZERO_EXTEND from known positive?
   38990   if (N00.getOpcode() != ISD::SIGN_EXTEND ||
   38991       N01.getOpcode() != ISD::SIGN_EXTEND ||
   38992       N10.getOpcode() != ISD::SIGN_EXTEND ||
   38993       N11.getOpcode() != ISD::SIGN_EXTEND)
   38994     return SDValue();
   38995 
   38996   // Peek through the extends.
   38997   N00 = N00.getOperand(0);
   38998   N01 = N01.getOperand(0);
   38999   N10 = N10.getOperand(0);
   39000   N11 = N11.getOperand(0);
   39001 
   39002   // Must be extending from vXi16.
   39003   EVT InVT = N00.getValueType();
   39004   if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
   39005       N10.getValueType() != InVT || N11.getValueType() != InVT)
   39006     return SDValue();
   39007 
   39008   // All inputs should be build_vectors.
   39009   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
   39010       N01.getOpcode() != ISD::BUILD_VECTOR ||
   39011       N10.getOpcode() != ISD::BUILD_VECTOR ||
   39012       N11.getOpcode() != ISD::BUILD_VECTOR)
   39013     return SDValue();
   39014 
    39015   // For each result element we need the even element of one input vector
    39016   // multiplied by the matching even element of the other vector, plus the
    39017   // following odd element of the first vector multiplied by the matching
    39018   // odd element of the other vector. That is, for each element i the
    39019   // following operation must be performed:
   39020   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
   39021   SDValue In0, In1;
   39022   for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
   39023     SDValue N00Elt = N00.getOperand(i);
   39024     SDValue N01Elt = N01.getOperand(i);
   39025     SDValue N10Elt = N10.getOperand(i);
   39026     SDValue N11Elt = N11.getOperand(i);
   39027     // TODO: Be more tolerant to undefs.
   39028     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   39029         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   39030         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
   39031         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
   39032       return SDValue();
   39033     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
   39034     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
   39035     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
   39036     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
   39037     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
   39038       return SDValue();
   39039     unsigned IdxN00 = ConstN00Elt->getZExtValue();
   39040     unsigned IdxN01 = ConstN01Elt->getZExtValue();
   39041     unsigned IdxN10 = ConstN10Elt->getZExtValue();
   39042     unsigned IdxN11 = ConstN11Elt->getZExtValue();
   39043     // Add is commutative so indices can be reordered.
   39044     if (IdxN00 > IdxN10) {
   39045       std::swap(IdxN00, IdxN10);
   39046       std::swap(IdxN01, IdxN11);
   39047     }
    39048     // N0 indices must be the even element. N1 indices must be the next odd element.
   39049     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
   39050         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
   39051       return SDValue();
   39052     SDValue N00In = N00Elt.getOperand(0);
   39053     SDValue N01In = N01Elt.getOperand(0);
   39054     SDValue N10In = N10Elt.getOperand(0);
   39055     SDValue N11In = N11Elt.getOperand(0);
    39056     // The first time we find an input, capture it.
   39057     if (!In0) {
   39058       In0 = N00In;
   39059       In1 = N01In;
   39060     }
   39061     // Mul is commutative so the input vectors can be in any order.
   39062     // Canonicalize to make the compares easier.
   39063     if (In0 != N00In)
   39064       std::swap(N00In, N01In);
   39065     if (In0 != N10In)
   39066       std::swap(N10In, N11In);
   39067     if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
   39068       return SDValue();
   39069   }
   39070 
   39071   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   39072                          ArrayRef<SDValue> Ops) {
   39073     // Shrink by adding truncate nodes and let DAGCombine fold with the
   39074     // sources.
   39075     EVT InVT = Ops[0].getValueType();
   39076     assert(InVT.getScalarType() == MVT::i16 &&
   39077            "Unexpected scalar element type");
   39078     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
   39079     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   39080                                  InVT.getVectorNumElements() / 2);
   39081     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
   39082   };
   39083   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
   39084                           PMADDBuilder);
   39085 }
   39086 
   39087 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   39088                           const X86Subtarget &Subtarget) {
   39089   const SDNodeFlags Flags = N->getFlags();
   39090   if (Flags.hasVectorReduction()) {
   39091     if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
   39092       return Sad;
   39093     if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
   39094       return MAdd;
   39095   }
   39096   EVT VT = N->getValueType(0);
   39097   SDValue Op0 = N->getOperand(0);
   39098   SDValue Op1 = N->getOperand(1);
   39099 
   39100   if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
   39101     return MAdd;
   39102   if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
   39103     return MAdd;
   39104 
   39105   // Try to synthesize horizontal adds from adds of shuffles.
   39106   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
   39107        VT == MVT::v8i32) &&
   39108       Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
   39109     auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   39110                           ArrayRef<SDValue> Ops) {
   39111       return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
   39112     };
   39113     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
   39114                             HADDBuilder);
   39115   }
   39116 
   39117   if (SDValue V = combineIncDecVector(N, DAG))
   39118     return V;
   39119 
   39120   return combineAddOrSubToADCOrSBB(N, DAG);
   39121 }
   39122 
   39123 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
   39124                                  const X86Subtarget &Subtarget) {
   39125   SDValue Op0 = N->getOperand(0);
   39126   SDValue Op1 = N->getOperand(1);
   39127   EVT VT = N->getValueType(0);
   39128 
    39129   // PSUBUS is supported, starting from SSE2, but truncation for v8i32/v8i64
    39130   // is only worth it with SSSE3 (PSHUFB).
   39131   if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
   39132       !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
   39133       !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
   39134       !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
   39135                                    VT == MVT::v16i32 || VT == MVT::v8i64)))
   39136     return SDValue();
   39137 
   39138   SDValue SubusLHS, SubusRHS;
    39139   // Try to find umax(a,b) - b or a - umin(a,b) patterns;
   39140   // they may be converted to subus(a,b).
   39141   // TODO: Need to add IR canonicalization for this code.
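            // subus(a,b) is the unsigned saturating subtract: umax(a,b) - b (and
            // likewise a - umin(a,b)) is a - b when a >= b and 0 otherwise, which
            // is exactly what PSUBUS computes.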
   39142   if (Op0.getOpcode() == ISD::UMAX) {
   39143     SubusRHS = Op1;
   39144     SDValue MaxLHS = Op0.getOperand(0);
   39145     SDValue MaxRHS = Op0.getOperand(1);
   39146     if (MaxLHS == Op1)
   39147       SubusLHS = MaxRHS;
   39148     else if (MaxRHS == Op1)
   39149       SubusLHS = MaxLHS;
   39150     else
   39151       return SDValue();
   39152   } else if (Op1.getOpcode() == ISD::UMIN) {
   39153     SubusLHS = Op0;
   39154     SDValue MinLHS = Op1.getOperand(0);
   39155     SDValue MinRHS = Op1.getOperand(1);
   39156     if (MinLHS == Op0)
   39157       SubusRHS = MinRHS;
   39158     else if (MinRHS == Op0)
   39159       SubusRHS = MinLHS;
   39160     else
   39161       return SDValue();
   39162   } else
   39163     return SDValue();
   39164 
   39165   auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   39166                          ArrayRef<SDValue> Ops) {
   39167     return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
   39168   };
   39169 
   39170   // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
   39171   // special preprocessing in some cases.
   39172   if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
   39173     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
   39174                             { SubusLHS, SubusRHS }, SUBUSBuilder);
   39175 
   39176   // The special preprocessing case can only be applied
   39177   // if the value was zero-extended from 16 bits,
   39178   // so we require the upper 16 bits to be zero for 32-bit
   39179   // values, or the upper 48 bits for 64-bit values.
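           // E.g. (illustrative, types assumed): a v8i32 input whose upper 16 bits are
           // known zero is narrowed to v8i16, the saturating subtract is done there, and
           // the result is zero-extended back to v8i32 below.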
   39180   KnownBits Known;
   39181   DAG.computeKnownBits(SubusLHS, Known);
   39182   unsigned NumZeros = Known.countMinLeadingZeros();
   39183   if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
   39184     return SDValue();
   39185 
   39186   EVT ExtType = SubusLHS.getValueType();
   39187   EVT ShrinkedType;
   39188   if (VT == MVT::v8i32 || VT == MVT::v8i64)
   39189     ShrinkedType = MVT::v8i16;
   39190   else
   39191     ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
   39192 
   39193   // Since SubusLHS is zero-extended, truncate SubusRHS to the same
   39194   // width by clamping it first: SubusRHS = umin(0xFFF.., SubusRHS).
   39195   SDValue SaturationConst =
   39196       DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
   39197                                            ShrinkedType.getScalarSizeInBits()),
   39198                       SDLoc(SubusLHS), ExtType);
   39199   SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
   39200                              SaturationConst);
   39201   SDValue NewSubusLHS =
   39202       DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
   39203   SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
   39204   SDValue Psubus =
   39205       SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
   39206                        { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
   39207   // Zero-extend the result; it may be used somewhere as a 32-bit value.
   39208   // If not, the zext and the following trunc will be combined away.
   39209   return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
   39210 }
   39211 
   39212 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   39213                           const X86Subtarget &Subtarget) {
   39214   SDValue Op0 = N->getOperand(0);
   39215   SDValue Op1 = N->getOperand(1);
   39216 
   39217   // X86 can't encode an immediate LHS of a sub. See if we can push the
   39218   // negation into a preceding instruction.
   39219   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
   39220     // If the RHS of the sub is an XOR with one use and a constant, invert the
   39221     // immediate. Then add one to the LHS of the sub so we can turn
   39222     // X-Y -> X+~Y+1, saving one register.
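             // Illustrative shape of the fold (C and XorC stand for the constants
             // matched here):
             //   (sub C, (xor X, XorC)) --> (add (xor X, ~XorC), C + 1)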
   39223     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
   39224         isa<ConstantSDNode>(Op1.getOperand(1))) {
   39225       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
   39226       EVT VT = Op0.getValueType();
   39227       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
   39228                                    Op1.getOperand(0),
   39229                                    DAG.getConstant(~XorC, SDLoc(Op1), VT));
   39230       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
   39231                          DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
   39232     }
   39233   }
   39234 
   39235   // Try to synthesize horizontal subs from subs of shuffles.
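           // As with HADD above, the rough shape (illustrative only) is:
           //   (sub (shuffle A, B, <0,2,..>), (shuffle A, B, <1,3,..>))
           //     --> (X86ISD::HSUB A, B)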
   39236   EVT VT = N->getValueType(0);
   39237   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
   39238        VT == MVT::v8i32) &&
   39239       Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
   39240     auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
   39241                           ArrayRef<SDValue> Ops) {
   39242       return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
   39243     };
   39244     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
   39245                             HSUBBuilder);
   39246   }
   39247 
   39248   if (SDValue V = combineIncDecVector(N, DAG))
   39249     return V;
   39250 
   39251   // Try to create PSUBUS if SUB's argument is max/min
   39252   if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
   39253     return V;
   39254 
   39255   return combineAddOrSubToADCOrSBB(N, DAG);
   39256 }
   39257 
   39258 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
   39259                              TargetLowering::DAGCombinerInfo &DCI,
   39260                              const X86Subtarget &Subtarget) {
   39261   if (DCI.isBeforeLegalize())
   39262     return SDValue();
   39263 
   39264   SDLoc DL(N);
   39265   unsigned Opcode = N->getOpcode();
   39266   MVT VT = N->getSimpleValueType(0);
   39267   MVT SVT = VT.getVectorElementType();
   39268   unsigned NumElts = VT.getVectorNumElements();
   39269   unsigned EltSizeInBits = SVT.getSizeInBits();
   39270 
   39271   SDValue Op = N->getOperand(0);
   39272   MVT OpVT = Op.getSimpleValueType();
   39273   MVT OpEltVT = OpVT.getVectorElementType();
   39274   unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
   39275   unsigned InputBits = OpEltSizeInBits * NumElts;
   39276 
   39277   // Perform any constant folding.
   39278   // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
   39279   APInt UndefElts;
   39280   SmallVector<APInt, 64> EltBits;
   39281   if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
   39282     APInt Undefs(NumElts, 0);
   39283     SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
   39284     bool IsZEXT =
   39285         (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
   39286     for (unsigned i = 0; i != NumElts; ++i) {
   39287       if (UndefElts[i]) {
   39288         Undefs.setBit(i);
   39289         continue;
   39290       }
   39291       Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
   39292                        : EltBits[i].sextOrTrunc(EltSizeInBits);
   39293     }
   39294     return getConstVector(Vals, Undefs, VT, DAG, DL);
   39295   }
   39296 
   39297   // (vzext (bitcast (vzext x))) -> (vzext x)
   39298   // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
   39299   SDValue V = peekThroughBitcasts(Op);
   39300   if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
   39301     MVT InnerVT = V.getSimpleValueType();
   39302     MVT InnerEltVT = InnerVT.getVectorElementType();
   39303 
   39304     // If the element sizes match exactly, we can just do one larger vzext. This
   39305     // is always an exact type match as vzext operates on integer types.
   39306     if (OpEltVT == InnerEltVT) {
   39307       assert(OpVT == InnerVT && "Types must match for vzext!");
   39308       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
   39309     }
   39310 
   39311     // The only other way we can combine them is if only a single element of the
   39312     // inner vzext is used in the input to the outer vzext.
   39313     if (InnerEltVT.getSizeInBits() < InputBits)
   39314       return SDValue();
   39315 
   39316     // In this case, the inner vzext is completely dead because we're going to
   39317     // only look at bits inside of the low element. Just do the outer vzext on
   39318     // a bitcast of the input to the inner.
   39319     return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
   39320   }
   39321 
   39322   // Check if we can bypass extracting and re-inserting an element of an input
   39323   // vector. Essentially:
   39324   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
   39325   // TODO: Add X86ISD::VSEXT support
   39326   if (Opcode == X86ISD::VZEXT &&
   39327       V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
   39328       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
   39329       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
   39330     SDValue ExtractedV = V.getOperand(0);
   39331     SDValue OrigV = ExtractedV.getOperand(0);
   39332     if (isNullConstant(ExtractedV.getOperand(1))) {
   39333         MVT OrigVT = OrigV.getSimpleValueType();
   39334         // Extract a subvector if necessary...
   39335         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
   39336           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
   39337           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
   39338                                     OrigVT.getVectorNumElements() / Ratio);
   39339           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
   39340                               DAG.getIntPtrConstant(0, DL));
   39341         }
   39342         Op = DAG.getBitcast(OpVT, OrigV);
   39343         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
   39344       }
   39345   }
   39346 
   39347   return SDValue();
   39348 }
   39349 
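         // Fold vector integer compares with identical operands: PCMPEQ(x,x) produces
         // all-ones and PCMPGT(x,x) (signed greater-than) produces all-zeros.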
   39350 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
   39351                                     const X86Subtarget &Subtarget) {
   39352   MVT VT = N->getSimpleValueType(0);
   39353   SDLoc DL(N);
   39354 
   39355   if (N->getOperand(0) == N->getOperand(1)) {
   39356     if (N->getOpcode() == X86ISD::PCMPEQ)
   39357       return getOnesVector(VT, DAG, DL);
   39358     if (N->getOpcode() == X86ISD::PCMPGT)
   39359       return getZeroVector(VT, Subtarget, DAG, DL);
   39360   }
   39361 
   39362   return SDValue();
   39363 }
   39364 
   39365 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
   39366                                       TargetLowering::DAGCombinerInfo &DCI,
   39367                                       const X86Subtarget &Subtarget) {
   39368   if (DCI.isBeforeLegalizeOps())
   39369     return SDValue();
   39370 
   39371   MVT OpVT = N->getSimpleValueType(0);
   39372 
   39373   bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
   39374 
   39375   SDLoc dl(N);
   39376   SDValue Vec = N->getOperand(0);
   39377   SDValue SubVec = N->getOperand(1);
   39378 
   39379   unsigned IdxVal = N->getConstantOperandVal(2);
   39380   MVT SubVecVT = SubVec.getSimpleValueType();
   39381 
   39382   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
   39383     // Inserting zeros into zeros is a nop.
   39384     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
   39385       return getZeroVector(OpVT, Subtarget, DAG, dl);
   39386 
   39387     // If we're inserting into a zero vector and then into a larger zero vector,
   39388     // just insert into the larger zero vector directly.
   39389     if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
   39390         ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
   39391       unsigned Idx2Val = SubVec.getConstantOperandVal(2);
   39392       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
   39393                          getZeroVector(OpVT, Subtarget, DAG, dl),
   39394                          SubVec.getOperand(1),
   39395                          DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
   39396     }
   39397 
   39398     // If we're inserting into a zero vector and our input was extracted from an
   39399     // insert into a zero vector of the same type, and the extraction was at
   39400     // least as large as the original insertion, just insert the original
   39401     // subvector into a zero vector.
   39402     if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
   39403         SubVec.getConstantOperandVal(1) == 0 &&
   39404         SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
   39405       SDValue Ins = SubVec.getOperand(0);
   39406       if (Ins.getConstantOperandVal(2) == 0 &&
   39407           ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
   39408           Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
   39409         return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
   39410                            getZeroVector(OpVT, Subtarget, DAG, dl),
   39411                            Ins.getOperand(1), N->getOperand(2));
   39412     }
   39413 
   39414     // If we're inserting a bitcast into zeros, rewrite the insert and move the
   39415     // bitcast to the other side. This helps with detecting zero extension
   39416     // during isel.
   39417     // TODO: Is this useful for other indices than 0?
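             // E.g. (illustrative, with assumed types):
             //   (v8i32 (insert_subvector zeros, (v4i32 bitcast (v2i64 X)), 0))
             //     --> (v8i32 bitcast (v4i64 (insert_subvector zeros, (v2i64 X), 0)))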
   39418     if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
   39419       MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
   39420       unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
   39421       MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
   39422       SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
   39423                                    DAG.getBitcast(NewVT, Vec),
   39424                                    SubVec.getOperand(0), N->getOperand(2));
   39425       return DAG.getBitcast(OpVT, Insert);
   39426     }
   39427   }
   39428 
   39429   // Stop here if this is an i1 vector.
   39430   if (IsI1Vector)
   39431     return SDValue();
   39432 
   39433   // If this is an insert of an extract, combine to a shuffle. Don't do this
   39434   // if the insert or extract can be represented with a subregister operation.
   39435   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
   39436       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
   39437       (IdxVal != 0 || !Vec.isUndef())) {
   39438     int ExtIdxVal = SubVec.getConstantOperandVal(1);
   39439     if (ExtIdxVal != 0) {
   39440       int VecNumElts = OpVT.getVectorNumElements();
   39441       int SubVecNumElts = SubVecVT.getVectorNumElements();
   39442       SmallVector<int, 64> Mask(VecNumElts);
   39443       // First create an identity shuffle mask.
   39444       for (int i = 0; i != VecNumElts; ++i)
   39445         Mask[i] = i;
   39446       // Now insert the extracted portion.
   39447       for (int i = 0; i != SubVecNumElts; ++i)
   39448         Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
   39449 
   39450       return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
   39451     }
   39452   }
   39453 
   39454   // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
   39455   // load:
   39456   // (insert_subvector (insert_subvector undef, (load16 addr), 0),
   39457   //                   (load16 addr + 16), Elts/2)
   39458   // --> load32 addr
   39459   // or:
   39460   // (insert_subvector (insert_subvector undef, (load32 addr), 0),
   39461   //                   (load32 addr + 32), Elts/2)
   39462   // --> load64 addr
   39463   // or a 16-byte or 32-byte broadcast:
   39464   // (insert_subvector (insert_subvector undef, (load16 addr), 0),
   39465   //                   (load16 addr), Elts/2)
   39466   // --> X86SubVBroadcast(load16 addr)
   39467   // or:
   39468   // (insert_subvector (insert_subvector undef, (load32 addr), 0),
   39469   //                   (load32 addr), Elts/2)
   39470   // --> X86SubVBroadcast(load32 addr)
   39471   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
   39472       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
   39473       OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
   39474     if (isNullConstant(Vec.getOperand(2))) {
   39475       SDValue SubVec2 = Vec.getOperand(1);
   39476       // If needed, look through bitcasts to get to the load.
   39477       if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
   39478         bool Fast;
   39479         unsigned Alignment = FirstLd->getAlignment();
   39480         unsigned AS = FirstLd->getAddressSpace();
   39481         const X86TargetLowering *TLI = Subtarget.getTargetLowering();
   39482         if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
   39483                                     OpVT, AS, Alignment, &Fast) && Fast) {
   39484           SDValue Ops[] = {SubVec2, SubVec};
   39485           if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
   39486                                                     Subtarget, false))
   39487             return Ld;
   39488         }
   39489       }
   39490       // If lower/upper loads are the same and the only users of the load, then
   39491       // lower to a VBROADCASTF128/VBROADCASTI128/etc.
   39492       if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
   39493         if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
   39494             SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
   39495           return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
   39496 
   39497       // If this is subv_broadcast insert into both halves, use a larger
   39498       // subv_broadcast.
   39499       if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
   39500         return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
   39501                            SubVec.getOperand(0));
   39502 
   39503       // If we're inserting all zeros into the upper half, change this to
   39504       // an insert into an all zeros vector. We will match this to a move
   39505       // with implicit upper bit zeroing during isel.
   39506       if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
   39507         return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
   39508                            getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
   39509                            Vec.getOperand(2));
   39510 
   39511       // If we are inserting into both halves of the vector, the starting
   39512       // vector should be undef. If it isn't, make it so. Only do this if
   39513       // the early insert has no other uses.
   39514       // TODO: Should this be a generic DAG combine?
   39515       if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
   39516         Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
   39517                           SubVec2, Vec.getOperand(2));
   39518         return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
   39519                            N->getOperand(2));
   39520 
   39521       }
   39522     }
   39523   }
   39524 
   39525   return SDValue();
   39526 }
   39527 
   39528 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
   39529                                        TargetLowering::DAGCombinerInfo &DCI,
   39530                                        const X86Subtarget &Subtarget) {
   39531   if (DCI.isBeforeLegalizeOps())
   39532     return SDValue();
   39533 
   39534   MVT OpVT = N->getSimpleValueType(0);
   39535   SDValue InVec = N->getOperand(0);
   39536   unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   39537 
   39538   if (ISD::isBuildVectorAllZeros(InVec.getNode()))
   39539     return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
   39540 
   39541   if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
   39542     if (OpVT.getScalarType() == MVT::i1)
   39543       return DAG.getConstant(1, SDLoc(N), OpVT);
   39544     return getOnesVector(OpVT, DAG, SDLoc(N));
   39545   }
   39546 
   39547   if (InVec.getOpcode() == ISD::BUILD_VECTOR)
   39548     return DAG.getBuildVector(
   39549         OpVT, SDLoc(N),
   39550         InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
   39551 
   39552   // If we're extracting the lowest subvector and the source has only one use,
   39553   // we may be able to perform this with a smaller vector width.
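           // E.g. (illustrative): (v2f64 (extract_subvector (v4f64 (sint_to_fp v4i32)), 0))
           //   becomes (v2f64 (X86ISD::CVTSI2P v4i32)) via the checks below.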
   39554   if (IdxVal == 0 && InVec.hasOneUse()) {
   39555     unsigned InOpcode = InVec.getOpcode();
   39556     if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
   39557       // v2f64 CVTDQ2PD(v4i32).
   39558       if (InOpcode == ISD::SINT_TO_FP &&
   39559           InVec.getOperand(0).getValueType() == MVT::v4i32) {
   39560         return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
   39561       }
   39562       // v2f64 CVTPS2PD(v4f32).
   39563       if (InOpcode == ISD::FP_EXTEND &&
   39564           InVec.getOperand(0).getValueType() == MVT::v4f32) {
   39565         return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
   39566       }
   39567     }
   39568     if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) &&
   39569         OpVT.is128BitVector() &&
   39570         InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
   39571       unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG
   39572                                                  : ISD::SIGN_EXTEND_VECTOR_INREG;
   39573       return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
   39574     }
   39575   }
   39576 
   39577   return SDValue();
   39578 }
   39579 
   39580 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
   39581   EVT VT = N->getValueType(0);
   39582   SDValue Src = N->getOperand(0);
   39583 
   39584   // If this is a scalar_to_vector to v1i1 fed by an AND with 1, bypass the AND.
   39585   // This occurs frequently in our masked scalar intrinsic code and our
   39586   // floating point select lowering with AVX512.
   39587   // TODO: SimplifyDemandedBits instead?
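           // Illustrative shape of the fold performed below:
           //   (v1i1 (scalar_to_vector (and X, 1))) --> (v1i1 (scalar_to_vector X))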
   39588   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
   39589     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
   39590       if (C->getAPIntValue().isOneValue())
   39591         return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
   39592                            Src.getOperand(0));
   39593 
   39594   return SDValue();
   39595 }
   39596 
   39597 // Simplify PMULDQ and PMULUDQ operations.
   39598 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   39599                              TargetLowering::DAGCombinerInfo &DCI) {
   39600   SDValue LHS = N->getOperand(0);
   39601   SDValue RHS = N->getOperand(1);
   39602 
   39603   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   39604   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   39605                                         !DCI.isBeforeLegalizeOps());
   39606   APInt DemandedMask(APInt::getLowBitsSet(64, 32));
   39607 
   39608   // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element.
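           // (Illustrative note: the demanded mask above keeps only bits [31:0] of each
           // 64-bit element, so known-irrelevant upper halves of either operand can be
           // simplified away.)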
   39609   KnownBits LHSKnown;
   39610   if (TLI.SimplifyDemandedBits(LHS, DemandedMask, LHSKnown, TLO)) {
   39611     DCI.CommitTargetLoweringOpt(TLO);
   39612     return SDValue(N, 0);
   39613   }
   39614 
   39615   KnownBits RHSKnown;
   39616   if (TLI.SimplifyDemandedBits(RHS, DemandedMask, RHSKnown, TLO)) {
   39617     DCI.CommitTargetLoweringOpt(TLO);
   39618     return SDValue(N, 0);
   39619   }
   39620 
   39621   return SDValue();
   39622 }
   39623 
   39624 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   39625                                              DAGCombinerInfo &DCI) const {
   39626   SelectionDAG &DAG = DCI.DAG;
   39627   switch (N->getOpcode()) {
   39628   default: break;
   39629   case ISD::SCALAR_TO_VECTOR:
   39630     return combineScalarToVector(N, DAG);
   39631   case ISD::EXTRACT_VECTOR_ELT:
   39632   case X86ISD::PEXTRW:
   39633   case X86ISD::PEXTRB:
   39634     return combineExtractVectorElt(N, DAG, DCI, Subtarget);
   39635   case ISD::INSERT_SUBVECTOR:
   39636     return combineInsertSubvector(N, DAG, DCI, Subtarget);
   39637   case ISD::EXTRACT_SUBVECTOR:
   39638     return combineExtractSubvector(N, DAG, DCI, Subtarget);
   39639   case ISD::VSELECT:
   39640   case ISD::SELECT:
   39641   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
   39642   case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
   39643   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
   39644   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
   39645   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
   39646   case X86ISD::SBB:         return combineSBB(N, DAG);
   39647   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
   39648   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
   39649   case ISD::SHL:
   39650   case ISD::SRA:
   39651   case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
   39652   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
   39653   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
   39654   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
   39655   case X86ISD::BEXTR:       return combineBEXTR(N, DAG, DCI, Subtarget);
   39656   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
   39657   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
   39658   case ISD::STORE:          return combineStore(N, DAG, Subtarget);
   39659   case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
   39660   case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
   39661   case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
   39662   case ISD::FADD:
   39663   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
   39664   case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
   39665   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
   39666   case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
   39667   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
   39668   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
   39669   case X86ISD::FXOR:
   39670   case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
   39671   case X86ISD::FMIN:
   39672   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
   39673   case ISD::FMINNUM:
   39674   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
   39675   case X86ISD::BT:          return combineBT(N, DAG, DCI);
   39676   case ISD::ANY_EXTEND:
   39677   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
   39678   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
   39679   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
   39680   case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
   39681   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
   39682   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
   39683   case X86ISD::PACKSS:
   39684   case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
   39685   case X86ISD::VSHLI:
   39686   case X86ISD::VSRAI:
   39687   case X86ISD::VSRLI:
   39688     return combineVectorShiftImm(N, DAG, DCI, Subtarget);
   39689   case ISD::SIGN_EXTEND_VECTOR_INREG:
   39690   case ISD::ZERO_EXTEND_VECTOR_INREG:
   39691   case X86ISD::VSEXT:
   39692   case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
   39693   case X86ISD::PINSRB:
   39694   case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
   39695   case X86ISD::SHUFP:       // Handle all target specific shuffles
   39696   case X86ISD::INSERTPS:
   39697   case X86ISD::EXTRQI:
   39698   case X86ISD::INSERTQI:
   39699   case X86ISD::PALIGNR:
   39700   case X86ISD::VSHLDQ:
   39701   case X86ISD::VSRLDQ:
   39702   case X86ISD::BLENDI:
   39703   case X86ISD::UNPCKH:
   39704   case X86ISD::UNPCKL:
   39705   case X86ISD::MOVHLPS:
   39706   case X86ISD::MOVLHPS:
   39707   case X86ISD::PSHUFB:
   39708   case X86ISD::PSHUFD:
   39709   case X86ISD::PSHUFHW:
   39710   case X86ISD::PSHUFLW:
   39711   case X86ISD::MOVSHDUP:
   39712   case X86ISD::MOVSLDUP:
   39713   case X86ISD::MOVDDUP:
   39714   case X86ISD::MOVSS:
   39715   case X86ISD::MOVSD:
   39716   case X86ISD::VBROADCAST:
   39717   case X86ISD::VPPERM:
   39718   case X86ISD::VPERMI:
   39719   case X86ISD::VPERMV:
   39720   case X86ISD::VPERMV3:
   39721   case X86ISD::VPERMIL2:
   39722   case X86ISD::VPERMILPI:
   39723   case X86ISD::VPERMILPV:
   39724   case X86ISD::VPERM2X128:
   39725   case X86ISD::SHUF128:
   39726   case X86ISD::VZEXT_MOVL:
   39727   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
   39728   case X86ISD::FMADD_RND:
   39729   case X86ISD::FMSUB:
   39730   case X86ISD::FMSUB_RND:
   39731   case X86ISD::FNMADD:
   39732   case X86ISD::FNMADD_RND:
   39733   case X86ISD::FNMSUB:
   39734   case X86ISD::FNMSUB_RND:
   39735   case ISD::FMA: return combineFMA(N, DAG, Subtarget);
   39736   case X86ISD::FMADDSUB_RND:
   39737   case X86ISD::FMSUBADD_RND:
   39738   case X86ISD::FMADDSUB:
   39739   case X86ISD::FMSUBADD:    return combineFMADDSUB(N, DAG, Subtarget);
   39740   case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI);
   39741   case X86ISD::MGATHER:
   39742   case X86ISD::MSCATTER:
   39743   case ISD::MGATHER:
   39744   case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI, Subtarget);
   39745   case X86ISD::PCMPEQ:
   39746   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
   39747   case X86ISD::PMULDQ:
   39748   case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI);
   39749   }
   39750 
   39751   return SDValue();
   39752 }
   39753 
   39754 /// Return true if the target has native support for the specified value type
   39755 /// and it is 'desirable' to use the type for the given node type. E.g., on x86,
   39756 /// i16 is legal but undesirable, since i16 instruction encodings are longer
   39757 /// and some i16 instructions are slow.
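         /// For example (an illustrative note, not from the original source), a 16-bit
         /// add is typically promoted to a 32-bit add to avoid the operand-size (0x66)
         /// prefix in the encoding.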
   39758 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   39759   if (!isTypeLegal(VT))
   39760     return false;
   39761 
   39762   // There are no vXi8 shifts.
   39763   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
   39764     return false;
   39765 
   39766   if (VT != MVT::i16)
   39767     return true;
   39768 
   39769   switch (Opc) {
   39770   default:
   39771     return true;
   39772   case ISD::LOAD:
   39773   case ISD::SIGN_EXTEND:
   39774   case ISD::ZERO_EXTEND:
   39775   case ISD::ANY_EXTEND:
   39776   case ISD::SHL:
   39777   case ISD::SRL:
   39778   case ISD::SUB:
   39779   case ISD::ADD:
   39780   case ISD::MUL:
   39781   case ISD::AND:
   39782   case ISD::OR:
   39783   case ISD::XOR:
   39784     return false;
   39785   }
   39786 }
   39787 
   39788 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
   39789                                                   SDValue Value, SDValue Addr,
   39790                                                   SelectionDAG &DAG) const {
   39791   const Module *M = DAG.getMachineFunction().getMMI().getModule();
   39792   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
   39793   if (IsCFProtectionSupported) {
   39794     // When control-flow branch protection is enabled, we need to add a
   39795     // notrack prefix to the indirect branch.
   39796     // In order to do that we create an NT_BRIND SDNode.
   39797     // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
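             // (Illustrative: the final instruction is a NOTRACK-prefixed indirect jump,
             // e.g. "notrack jmp *%rax" in AT&T syntax.)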
   39798     return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
   39799   }
   39800 
   39801   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
   39802 }
   39803 
   39804 /// This method queries the target whether it is beneficial for the DAG
   39805 /// combiner to promote the specified node. If true, it should return the
   39806 /// desired promotion type by reference.
   39807 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
   39808   EVT VT = Op.getValueType();
   39809   if (VT != MVT::i16)
   39810     return false;
   39811 
   39812   auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
   39813     if (!Op.hasOneUse())
   39814       return false;
   39815     SDNode *User = *Op->use_begin();
   39816     if (!ISD::isNormalStore(User))
   39817       return false;
   39818     auto *Ld = cast<LoadSDNode>(Load);
   39819     auto *St = cast<StoreSDNode>(User);
   39820     return Ld->getBasePtr() == St->getBasePtr();
   39821   };
   39822 
   39823   bool Commute = false;
   39824   switch (Op.getOpcode()) {
   39825   default: return false;
   39826   case ISD::SIGN_EXTEND:
   39827   case ISD::ZERO_EXTEND:
   39828   case ISD::ANY_EXTEND:
   39829     break;
   39830   case ISD::SHL:
   39831   case ISD::SRL: {
   39832     SDValue N0 = Op.getOperand(0);
   39833     // Look out for (store (shl (load), x)).
   39834     if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
   39835       return false;
   39836     break;
   39837   }
   39838   case ISD::ADD:
   39839   case ISD::MUL:
   39840   case ISD::AND:
   39841   case ISD::OR:
   39842   case ISD::XOR:
   39843     Commute = true;
   39844     LLVM_FALLTHROUGH;
   39845   case ISD::SUB: {
   39846     SDValue N0 = Op.getOperand(0);
   39847     SDValue N1 = Op.getOperand(1);
   39848     // Avoid disabling potential load folding opportunities.
   39849     if (MayFoldLoad(N1) &&
   39850         (!Commute || !isa<ConstantSDNode>(N0) ||
   39851          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
   39852       return false;
   39853     if (MayFoldLoad(N0) &&
   39854         ((Commute && !isa<ConstantSDNode>(N1)) ||
   39855          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
   39856       return false;
   39857   }
   39858   }
   39859 
   39860   PVT = MVT::i32;
   39861   return true;
   39862 }
   39863 
   39864 bool X86TargetLowering::
   39865     isDesirableToCombineBuildVectorToShuffleTruncate(
   39866         ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
   39867 
   39868   assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
   39869          "Element count mismatch");
   39870   assert(
   39871       Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
   39872       "Shuffle Mask expected to be legal");
   39873 
   39874   // For 32-bit elements, VPERMD is better than shuffle+truncate.
   39875   // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
   39876   if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
   39877     return false;
   39878 
   39879   if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
   39880     return false;
   39881 
   39882   return true;
   39883 }
   39884 
   39885 //===----------------------------------------------------------------------===//
   39886 //                           X86 Inline Assembly Support
   39887 //===----------------------------------------------------------------------===//
   39888 
   39889 // Helper to match a string against a sequence of pieces separated by whitespace.
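         // E.g. (illustrative): matchAsm("bswap $0", {"bswap", "$0"}) succeeds, while
         // extra non-whitespace text before, between, or after the pieces makes it fail.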
   39890 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
   39891   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
   39892 
   39893   for (StringRef Piece : Pieces) {
   39894     if (!S.startswith(Piece)) // Check if the piece matches.
   39895       return false;
   39896 
   39897     S = S.substr(Piece.size());
   39898     StringRef::size_type Pos = S.find_first_not_of(" \t");
   39899     if (Pos == 0) // We matched a prefix.
   39900       return false;
   39901 
   39902     S = S.substr(Pos);
   39903   }
   39904 
   39905   return S.empty();
   39906 }
   39907 
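         // Returns true when the split constraint string indicates that the asm clobbers
         // the usual x86 flag registers (~{cc}, ~{flags}, ~{fpsr}, and, when four pieces
         // are present, ~{dirflag} as well).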
   39908 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
   39909 
   39910   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
   39911     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
   39912         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
   39913         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
   39914 
   39915       if (AsmPieces.size() == 3)
   39916         return true;
   39917       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
   39918         return true;
   39919     }
   39920   }
   39921   return false;
   39922 }
   39923 
   39924 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   39925   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
   39926 
   39927   const std::string &AsmStr = IA->getAsmString();
   39928 
   39929   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
   39930   if (!Ty || Ty->getBitWidth() % 16 != 0)
   39931     return false;
   39932 
   39933   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
   39934   SmallVector<StringRef, 4> AsmPieces;
   39935   SplitString(AsmStr, AsmPieces, ";\n");
   39936 
   39937   switch (AsmPieces.size()) {
   39938   default: return false;
   39939   case 1:
   39940     // FIXME: this should verify that we are targeting a 486 or better.  If not,
   39941     // we will turn this bswap into something that will be lowered to logical
   39942     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
   39943     // lower so don't worry about this.
   39944     // bswap $0
   39945     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
   39946         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
   39947         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
   39948         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
   39949         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
   39950         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
   39951       // No need to check constraints; nothing other than the equivalent of
   39952       // "=r,0" would be valid here.
   39953       return IntrinsicLowering::LowerToByteSwap(CI);
   39954     }
   39955 
   39956     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
   39957     if (CI->getType()->isIntegerTy(16) &&
   39958         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   39959         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
   39960          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
   39961       AsmPieces.clear();
   39962       StringRef ConstraintsStr = IA->getConstraintString();
   39963       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   39964       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   39965       if (clobbersFlagRegisters(AsmPieces))
   39966         return IntrinsicLowering::LowerToByteSwap(CI);
   39967     }
   39968     break;
   39969   case 3:
   39970     if (CI->getType()->isIntegerTy(32) &&
   39971         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
   39972         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
   39973         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
   39974         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
   39975       AsmPieces.clear();
   39976       StringRef ConstraintsStr = IA->getConstraintString();
   39977       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
   39978       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
   39979       if (clobbersFlagRegisters(AsmPieces))
   39980         return IntrinsicLowering::LowerToByteSwap(CI);
   39981     }
   39982 
   39983     if (CI->getType()->isIntegerTy(64)) {
   39984       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
   39985       if (Constraints.size() >= 2 &&
   39986           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
   39987           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
   39988         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
   39989         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
   39990             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
   39991             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
   39992           return IntrinsicLowering::LowerToByteSwap(CI);
   39993       }
   39994     }
   39995     break;
   39996   }
   39997   return false;
   39998 }
   39999 
   40000 /// Given a constraint letter, return the type of constraint for this target.
   40001 X86TargetLowering::ConstraintType
   40002 X86TargetLowering::getConstraintType(StringRef Constraint) const {
   40003   if (Constraint.size() == 1) {
   40004     switch (Constraint[0]) {
   40005     case 'R':
   40006     case 'q':
   40007     case 'Q':
   40008     case 'f':
   40009     case 't':
   40010     case 'u':
   40011     case 'y':
   40012     case 'x':
   40013     case 'v':
   40014     case 'Y':
   40015     case 'l':
   40016     case 'k': // AVX512 masking registers.
   40017       return C_RegisterClass;
   40018     case 'a':
   40019     case 'b':
   40020     case 'c':
   40021     case 'd':
   40022     case 'S':
   40023     case 'D':
   40024     case 'A':
   40025       return C_Register;
   40026     case 'I':
   40027     case 'J':
   40028     case 'K':
   40029     case 'L':
   40030     case 'M':
   40031     case 'N':
   40032     case 'G':
   40033     case 'C':
   40034     case 'e':
   40035     case 'Z':
   40036       return C_Other;
   40037     default:
   40038       break;
   40039     }
   40040   }
   40041   else if (Constraint.size() == 2) {
   40042     switch (Constraint[0]) {
   40043     default:
   40044       break;
   40045     case 'Y':
   40046       switch (Constraint[1]) {
   40047       default:
   40048         break;
   40049       case 'z':
   40050       case '0':
   40051         return C_Register;
   40052       case 'i':
   40053       case 'm':
   40054       case 'k':
   40055       case 't':
   40056       case '2':
   40057         return C_RegisterClass;
   40058       }
   40059     }
   40060   }
   40061   return TargetLowering::getConstraintType(Constraint);
   40062 }
   40063 
   40064 /// Examine constraint type and operand type and determine a weight value.
   40065 /// This object must already have been set up with the operand type
   40066 /// and the current alternative constraint selected.
   40067 TargetLowering::ConstraintWeight
   40068   X86TargetLowering::getSingleConstraintMatchWeight(
   40069     AsmOperandInfo &info, const char *constraint) const {
   40070   ConstraintWeight weight = CW_Invalid;
   40071   Value *CallOperandVal = info.CallOperandVal;
   40072   // If we don't have a value, we can't do a match,
   40073   // but allow it at the lowest weight.
   40074   if (!CallOperandVal)
   40075     return CW_Default;
   40076   Type *type = CallOperandVal->getType();
   40077   // Look at the constraint type.
   40078   switch (*constraint) {
   40079   default:
   40080     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
   40081     LLVM_FALLTHROUGH;
   40082   case 'R':
   40083   case 'q':
   40084   case 'Q':
   40085   case 'a':
   40086   case 'b':
   40087   case 'c':
   40088   case 'd':
   40089   case 'S':
   40090   case 'D':
   40091   case 'A':
   40092     if (CallOperandVal->getType()->isIntegerTy())
   40093       weight = CW_SpecificReg;
   40094     break;
   40095   case 'f':
   40096   case 't':
   40097   case 'u':
   40098     if (type->isFloatingPointTy())
   40099       weight = CW_SpecificReg;
   40100     break;
   40101   case 'y':
   40102     if (type->isX86_MMXTy() && Subtarget.hasMMX())
   40103       weight = CW_SpecificReg;
   40104     break;
   40105   case 'Y': {
   40106     unsigned Size = StringRef(constraint).size();
   40107     // Pick 'i' as the next char, as 'Y' and 'Yi' are synonymous when matching 'Y'.
   40108     char NextChar = Size == 2 ? constraint[1] : 'i';
   40109     if (Size > 2)
   40110       break;
   40111     switch (NextChar) {
   40112       default:
   40113         return CW_Invalid;
   40114       // XMM0
   40115       case 'z':
   40116       case '0':
   40117         if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
   40118           return CW_SpecificReg;
   40119         return CW_Invalid;
   40120       // Conditional OpMask regs (AVX512)
   40121       case 'k':
   40122         if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
   40123           return CW_Register;
   40124         return CW_Invalid;
   40125       // Any MMX reg
   40126       case 'm':
   40127         if (type->isX86_MMXTy() && Subtarget.hasMMX())
   40128           return weight;
   40129         return CW_Invalid;
   40130       // Any SSE reg when ISA >= SSE2, same as 'Y'
   40131       case 'i':
   40132       case 't':
   40133       case '2':
   40134         if (!Subtarget.hasSSE2())
   40135           return CW_Invalid;
   40136         break;
   40137     }
   40138     // Fall through (handle "Y" constraint).
   40139     LLVM_FALLTHROUGH;
   40140   }
   40141   case 'v':
   40142     if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
   40143       weight = CW_Register;
   40144     LLVM_FALLTHROUGH;
   40145   case 'x':
   40146     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
   40147         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
   40148       weight = CW_Register;
   40149     break;
   40150   case 'k':
   40151     // Enable conditional vector operations using %k<#> registers.
   40152     if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
   40153       weight = CW_Register;
   40154     break;
   40155   case 'I':
   40156     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
   40157       if (C->getZExtValue() <= 31)
   40158         weight = CW_Constant;
   40159     }
   40160     break;
   40161   case 'J':
   40162     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   40163       if (C->getZExtValue() <= 63)
   40164         weight = CW_Constant;
   40165     }
   40166     break;
   40167   case 'K':
   40168     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   40169       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
   40170         weight = CW_Constant;
   40171     }
   40172     break;
   40173   case 'L':
   40174     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   40175       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
   40176         weight = CW_Constant;
   40177     }
   40178     break;
   40179   case 'M':
   40180     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   40181       if (C->getZExtValue() <= 3)
   40182         weight = CW_Constant;
   40183     }
   40184     break;
   40185   case 'N':
   40186     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   40187       if (C->getZExtValue() <= 0xff)
   40188         weight = CW_Constant;
   40189     }
   40190     break;
   40191   case 'G':
   40192   case 'C':
   40193     if (isa<ConstantFP>(CallOperandVal)) {
   40194       weight = CW_Constant;
   40195     }
   40196     break;
   40197   case 'e':
   40198     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   40199       if ((C->getSExtValue() >= -0x80000000LL) &&
   40200           (C->getSExtValue() <= 0x7fffffffLL))
   40201         weight = CW_Constant;
   40202     }
   40203     break;
   40204   case 'Z':
   40205     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
   40206       if (C->getZExtValue() <= 0xffffffff)
   40207         weight = CW_Constant;
   40208     }
   40209     break;
   40210   }
   40211   return weight;
   40212 }
   40213 
   40214 /// Try to replace an X constraint, which matches anything, with another that
   40215 /// has more specific requirements based on the type of the corresponding
   40216 /// operand.
   40217 const char *X86TargetLowering::
   40218 LowerXConstraint(EVT ConstraintVT) const {
   40219   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   40220   // 'f' like normal targets.
   40221   if (ConstraintVT.isFloatingPoint()) {
   40222     if (Subtarget.hasSSE2())
   40223       return "Y";
   40224     if (Subtarget.hasSSE1())
   40225       return "x";
   40226   }
   40227 
   40228   return TargetLowering::LowerXConstraint(ConstraintVT);
   40229 }
   40230 
   40231 /// Lower the specified operand into the Ops vector.
   40232 /// If it is invalid, don't add anything to Ops.
   40233 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   40234                                                      std::string &Constraint,
   40235                                                      std::vector<SDValue>&Ops,
   40236                                                      SelectionDAG &DAG) const {
   40237   SDValue Result;
   40238 
   40239   // Only support length 1 constraints for now.
   40240   if (Constraint.length() > 1) return;
   40241 
   40242   char ConstraintLetter = Constraint[0];
   40243   switch (ConstraintLetter) {
   40244   default: break;
   40245   case 'I':
   40246     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   40247       if (C->getZExtValue() <= 31) {
   40248         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   40249                                        Op.getValueType());
   40250         break;
   40251       }
   40252     }
   40253     return;
   40254   case 'J':
   40255     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   40256       if (C->getZExtValue() <= 63) {
   40257         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   40258                                        Op.getValueType());
   40259         break;
   40260       }
   40261     }
   40262     return;
   40263   case 'K':
   40264     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   40265       if (isInt<8>(C->getSExtValue())) {
   40266         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   40267                                        Op.getValueType());
   40268         break;
   40269       }
   40270     }
   40271     return;
   40272   case 'L':
   40273     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   40274       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
   40275           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
   40276         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
   40277                                        Op.getValueType());
   40278         break;
   40279       }
   40280     }
   40281     return;
   40282   case 'M':
   40283     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   40284       if (C->getZExtValue() <= 3) {
   40285         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   40286                                        Op.getValueType());
   40287         break;
   40288       }
   40289     }
   40290     return;
   40291   case 'N':
   40292     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   40293       if (C->getZExtValue() <= 255) {
   40294         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   40295                                        Op.getValueType());
   40296         break;
   40297       }
   40298     }
   40299     return;
   40300   case 'O':
   40301     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   40302       if (C->getZExtValue() <= 127) {
   40303         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   40304                                        Op.getValueType());
   40305         break;
   40306       }
   40307     }
   40308     return;
   40309   case 'e': {
   40310     // 32-bit signed value
   40311     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   40312       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   40313                                            C->getSExtValue())) {
   40314         // Widen to 64 bits here to get it sign extended.
   40315         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
   40316         break;
   40317       }
   40318     // FIXME gcc accepts some relocatable values here too, but only in certain
   40319     // memory models; it's complicated.
   40320     }
   40321     return;
   40322   }
   40323   case 'Z': {
   40324     // 32-bit unsigned value
   40325     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   40326       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
   40327                                            C->getZExtValue())) {
   40328         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
   40329                                        Op.getValueType());
   40330         break;
   40331       }
   40332     }
   40333     // FIXME gcc accepts some relocatable values here too, but only in certain
   40334     // memory models; it's complicated.
   40335     return;
   40336   }
   40337   case 'i': {
   40338     // Literal immediates are always ok.
   40339     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
   40340       // Widen to 64 bits here to get it sign extended.
   40341       Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
   40342       break;
   40343     }
   40344 
   40345     // In any sort of PIC mode, addresses need to be computed at runtime by
   40346     // adding in a register or some sort of table lookup.  These can't
   40347     // be used as immediates.
   40348     if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
   40349       return;
   40350 
   40351     // If we are in non-pic codegen mode, we allow the address of a global (with
   40352     // an optional displacement) to be used with 'i'.
   40353     GlobalAddressSDNode *GA = nullptr;
   40354     int64_t Offset = 0;
   40355 
   40356     // Match either (GA), (GA+C), (GA+C1+C2), etc.
   40357     while (1) {
   40358       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
   40359         Offset += GA->getOffset();
   40360         break;
   40361       } else if (Op.getOpcode() == ISD::ADD) {
   40362         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   40363           Offset += C->getZExtValue();
   40364           Op = Op.getOperand(0);
   40365           continue;
   40366         }
   40367       } else if (Op.getOpcode() == ISD::SUB) {
   40368         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
   40369           Offset += -C->getZExtValue();
   40370           Op = Op.getOperand(0);
   40371           continue;
   40372         }
   40373       }
   40374 
   40375       // Otherwise, this isn't something we can handle, reject it.
   40376       return;
   40377     }
   40378 
   40379     const GlobalValue *GV = GA->getGlobal();
   40380     // If we require an extra load to get this address, as in PIC mode, we
   40381     // can't accept it.
   40382     if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
   40383       return;
   40384 
   40385     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
   40386                                         GA->getValueType(0), Offset);
   40387     break;
   40388   }
   40389   }
   40390 
   40391   if (Result.getNode()) {
   40392     Ops.push_back(Result);
   40393     return;
   40394   }
   40395   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   40396 }
   40397 
   40398 /// Check if \p RC is a general purpose register class.
   40399 /// I.e., GR* or one of their variant.
   40400 static bool isGRClass(const TargetRegisterClass &RC) {
   40401   return RC.hasSuperClassEq(&X86::GR8RegClass) ||
   40402          RC.hasSuperClassEq(&X86::GR16RegClass) ||
   40403          RC.hasSuperClassEq(&X86::GR32RegClass) ||
   40404          RC.hasSuperClassEq(&X86::GR64RegClass) ||
   40405          RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
   40406 }
   40407 
   40408 /// Check if \p RC is a vector register class.
   40409 /// I.e., FR* / VR* or one of their variant.
   40410 static bool isFRClass(const TargetRegisterClass &RC) {
   40411   return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
   40412          RC.hasSuperClassEq(&X86::FR64XRegClass) ||
   40413          RC.hasSuperClassEq(&X86::VR128XRegClass) ||
   40414          RC.hasSuperClassEq(&X86::VR256XRegClass) ||
   40415          RC.hasSuperClassEq(&X86::VR512RegClass);
   40416 }
   40417 
   40418 std::pair<unsigned, const TargetRegisterClass *>
   40419 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   40420                                                 StringRef Constraint,
   40421                                                 MVT VT) const {
   40422   // First, see if this is a constraint that directly corresponds to an LLVM
   40423   // register class.
   40424   if (Constraint.size() == 1) {
   40425     // GCC Constraint Letters
   40426     switch (Constraint[0]) {
   40427     default: break;
   40428       // TODO: Slight differences here in allocation order and leaving
   40429       // RIP in the class. Do they matter any more here than they do
   40430       // in the normal allocation?
   40431     case 'k':
   40432       if (Subtarget.hasAVX512()) {
   40433         // Only supported in AVX512 or later.
   40434         switch (VT.SimpleTy) {
   40435         default: break;
   40436         case MVT::i32:
   40437           return std::make_pair(0U, &X86::VK32RegClass);
   40438         case MVT::i16:
   40439           return std::make_pair(0U, &X86::VK16RegClass);
   40440         case MVT::i8:
   40441           return std::make_pair(0U, &X86::VK8RegClass);
   40442         case MVT::i1:
   40443           return std::make_pair(0U, &X86::VK1RegClass);
   40444         case MVT::i64:
   40445           return std::make_pair(0U, &X86::VK64RegClass);
   40446         }
   40447       }
   40448       break;
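             // For illustration (hedged): with AVX512, a GCC-style "=k" output
             // constraint such as
             //   __mmask16 m; asm("kmovw %1, %0" : "=k"(m) : "r"(bits));
             // is satisfied from the VK16 class chosen in the 'k' case above
             // ('bits' is a hypothetical unsigned variable used only here).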
   40449     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
   40450       if (Subtarget.is64Bit()) {
   40451         if (VT == MVT::i32 || VT == MVT::f32)
   40452           return std::make_pair(0U, &X86::GR32RegClass);
   40453         if (VT == MVT::i16)
   40454           return std::make_pair(0U, &X86::GR16RegClass);
   40455         if (VT == MVT::i8 || VT == MVT::i1)
   40456           return std::make_pair(0U, &X86::GR8RegClass);
   40457         if (VT == MVT::i64 || VT == MVT::f64)
   40458           return std::make_pair(0U, &X86::GR64RegClass);
   40459         break;
   40460       }
   40461       LLVM_FALLTHROUGH;
   40462       // 32-bit fallthrough
   40463     case 'Q':   // Q_REGS
   40464       if (VT == MVT::i32 || VT == MVT::f32)
   40465         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
   40466       if (VT == MVT::i16)
   40467         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
   40468       if (VT == MVT::i8 || VT == MVT::i1)
   40469         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
   40470       if (VT == MVT::i64)
   40471         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
   40472       break;
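             // For illustration (hedged): "Q" restricts the operand to a/b/c/d
             // so the asm body can use the high-byte form, e.g.
             //   asm("movb %h1, %0" : "=r"(lo) : "Q"(v));
             // where the %h modifier prints %ah/%bh/%ch/%dh for whichever of
             // those registers is allocated ('lo' and 'v' are example names).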
   40473     case 'r':   // GENERAL_REGS
   40474     case 'l':   // INDEX_REGS
   40475       if (VT == MVT::i8 || VT == MVT::i1)
   40476         return std::make_pair(0U, &X86::GR8RegClass);
   40477       if (VT == MVT::i16)
   40478         return std::make_pair(0U, &X86::GR16RegClass);
   40479       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
   40480         return std::make_pair(0U, &X86::GR32RegClass);
   40481       return std::make_pair(0U, &X86::GR64RegClass);
   40482     case 'R':   // LEGACY_REGS
   40483       if (VT == MVT::i8 || VT == MVT::i1)
   40484         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
   40485       if (VT == MVT::i16)
   40486         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
   40487       if (VT == MVT::i32 || !Subtarget.is64Bit())
   40488         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
   40489       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
   40490     case 'f':  // FP Stack registers.
   40491       // If SSE is enabled for this VT, use the f80 class so that isel moves
   40492       // the value into the correct FP-stack register class.
   40493       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
   40494         return std::make_pair(0U, &X86::RFP32RegClass);
   40495       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
   40496         return std::make_pair(0U, &X86::RFP64RegClass);
   40497       return std::make_pair(0U, &X86::RFP80RegClass);
   40498     case 'y':   // MMX_REGS if MMX allowed.
   40499       if (!Subtarget.hasMMX()) break;
   40500       return std::make_pair(0U, &X86::VR64RegClass);
   40501     case 'Y':   // SSE_REGS if SSE2 allowed
   40502       if (!Subtarget.hasSSE2()) break;
   40503       LLVM_FALLTHROUGH;
   40504     case 'v':
   40505     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
   40506       if (!Subtarget.hasSSE1()) break;
   40507       bool VConstraint = (Constraint[0] == 'v');
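               // For illustration (hedged): a 128-bit operand constrained with
               // "x", e.g.
               //   __m128 a, b; asm("addps %1, %0" : "+x"(a) : "x"(b));
               // lands in VR128 below, while "v" on an AVX512VL target selects
               // VR128X so that %xmm16-%xmm31 may also be used.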
   40508 
   40509       switch (VT.SimpleTy) {
   40510       default: break;
   40511       // Scalar SSE types.
   40512       case MVT::f32:
   40513       case MVT::i32:
   40514         if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
   40515           return std::make_pair(0U, &X86::FR32XRegClass);
   40516         return std::make_pair(0U, &X86::FR32RegClass);
   40517       case MVT::f64:
   40518       case MVT::i64:
   40519         if (VConstraint && Subtarget.hasVLX())
   40520           return std::make_pair(0U, &X86::FR64XRegClass);
   40521         return std::make_pair(0U, &X86::FR64RegClass);
   40522       // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
   40523       // Vector types.
   40524       case MVT::v16i8:
   40525       case MVT::v8i16:
   40526       case MVT::v4i32:
   40527       case MVT::v2i64:
   40528       case MVT::v4f32:
   40529       case MVT::v2f64:
   40530         if (VConstraint && Subtarget.hasVLX())
   40531           return std::make_pair(0U, &X86::VR128XRegClass);
   40532         return std::make_pair(0U, &X86::VR128RegClass);
   40533       // AVX types.
   40534       case MVT::v32i8:
   40535       case MVT::v16i16:
   40536       case MVT::v8i32:
   40537       case MVT::v4i64:
   40538       case MVT::v8f32:
   40539       case MVT::v4f64:
   40540         if (VConstraint && Subtarget.hasVLX())
   40541           return std::make_pair(0U, &X86::VR256XRegClass);
   40542         return std::make_pair(0U, &X86::VR256RegClass);
   40543       case MVT::v8f64:
   40544       case MVT::v16f32:
   40545       case MVT::v16i32:
   40546       case MVT::v8i64:
   40547         return std::make_pair(0U, &X86::VR512RegClass);
   40548       }
   40549       break;
   40550     }
   40551   } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
   40552     switch (Constraint[1]) {
   40553     default:
   40554       break;
   40555     case 'i':
   40556     case 't':
   40557     case '2':
   40558       return getRegForInlineAsmConstraint(TRI, "Y", VT);
   40559     case 'm':
   40560       if (!Subtarget.hasMMX()) break;
   40561       return std::make_pair(0U, &X86::VR64RegClass);
   40562     case 'z':
   40563     case '0':
   40564       if (!Subtarget.hasSSE1()) break;
   40565       return std::make_pair(X86::XMM0, &X86::VR128RegClass);
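             // For illustration (hedged): "Yz" pins the operand to %xmm0, which
             // is what instructions with an implicit XMM0 operand (such as the
             // legacy, non-VEX BLENDV forms) need from inline asm.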
   40566     case 'k':
   40567       // This register class doesn't allocate k0 for masked vector operations.
   40568       if (Subtarget.hasAVX512()) { // Only supported in AVX512.
   40569         switch (VT.SimpleTy) {
   40570         default: break;
   40571         case MVT::i32:
   40572           return std::make_pair(0U, &X86::VK32WMRegClass);
   40573         case MVT::i16:
   40574           return std::make_pair(0U, &X86::VK16WMRegClass);
   40575         case MVT::i8:
   40576           return std::make_pair(0U, &X86::VK8WMRegClass);
   40577         case MVT::i1:
   40578           return std::make_pair(0U, &X86::VK1WMRegClass);
   40579         case MVT::i64:
   40580           return std::make_pair(0U, &X86::VK64WMRegClass);
   40581         }
   40582       }
   40583       break;
   40584     }
   40585   }
   40586 
   40587   // Use the default implementation in TargetLowering to convert the register
   40588   // constraint into a member of a register class.
   40589   std::pair<unsigned, const TargetRegisterClass*> Res;
   40590   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
   40591 
   40592   // Not found as a standard register?
   40593   if (!Res.second) {
   40594     // Map "{st(0)}" through "{st(7)}" to the corresponding FP register.
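             // For illustration (hedged): an IR-level constraint such as
             // "{st(3)}" arrives here as a 7-character string (braces included)
             // and is mapped by the check below to X86::FP3 in the RFP80 class.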
   40595     if (Constraint.size() == 7 && Constraint[0] == '{' &&
   40596         tolower(Constraint[1]) == 's' &&
   40597         tolower(Constraint[2]) == 't' &&
   40598         Constraint[3] == '(' &&
   40599         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
   40600         Constraint[5] == ')' &&
   40601         Constraint[6] == '}') {
   40602 
   40603       Res.first = X86::FP0+Constraint[4]-'0';
   40604       Res.second = &X86::RFP80RegClass;
   40605       return Res;
   40606     }
   40607 
   40608     // GCC allows "st(0)" to be called just plain "st".
   40609     if (StringRef("{st}").equals_lower(Constraint)) {
   40610       Res.first = X86::FP0;
   40611       Res.second = &X86::RFP80RegClass;
   40612       return Res;
   40613     }
   40614 
   40615     // flags -> EFLAGS
   40616     if (StringRef("{flags}").equals_lower(Constraint)) {
   40617       Res.first = X86::EFLAGS;
   40618       Res.second = &X86::CCRRegClass;
   40619       return Res;
   40620     }
   40621 
   40622     // 'A' means [ER]AX + [ER]DX.
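             // For illustration (hedged): the classic use is reading a 64-bit
             // result on a 32-bit target, e.g.
             //   unsigned long long t; asm volatile("rdtsc" : "=A"(t));
             // which binds t to the EDX:EAX pair.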
   40623     if (Constraint == "A") {
   40624       if (Subtarget.is64Bit()) {
   40625         Res.first = X86::RAX;
   40626         Res.second = &X86::GR64_ADRegClass;
   40627       } else {
   40628         assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
   40629                "Expecting 64, 32 or 16 bit subtarget");
   40630         Res.first = X86::EAX;
   40631         Res.second = &X86::GR32_ADRegClass;
   40632       }
   40633       return Res;
   40634     }
   40635     return Res;
   40636   }
   40637 
   40638   // Make sure it isn't a register that requires 64-bit mode.
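           // For illustration (hedged): in 32-bit mode a constraint such as
           // "{r8d}" or "{xmm8}" names a register whose encoding is >= 8 and
           // therefore needs a REX prefix, so it is rejected here.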
   40639   if (!Subtarget.is64Bit() &&
   40640       (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
   40641       TRI->getEncodingValue(Res.first) >= 8) {
   40642     // Register requires REX prefix, but we're in 32-bit mode.
   40643     Res.first = 0;
   40644     Res.second = nullptr;
   40645     return Res;
   40646   }
   40647 
   40648   // Make sure it isn't a register that requires AVX512.
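           // For illustration (hedged): "{xmm16}" through "{xmm31}" (and their
           // YMM/ZMM counterparts) have encodings with bit 4 set and are only
           // encodable with an EVEX prefix, so without AVX512 they are rejected.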
   40649   if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
   40650       TRI->getEncodingValue(Res.first) & 0x10) {
   40651     // Register requires EVEX prefix.
   40652     Res.first = 0;
   40653     Res.second = nullptr;
   40654     return Res;
   40655   }
   40656 
   40657   // Otherwise, check to see if this is a register class of the wrong value
   40658   // type. For example, we want to map "{ax},i32" -> {eax}, but we don't want
   40659   // it to turn into {ax},{dx}.
   40660   // MVT::Other is used to specify clobber names.
   40661   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
   40662     return Res;   // Correct type already, nothing to do.
   40663 
   40664   // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
   40665   // return "eax". This should even work for things like getting 64-bit integer
   40666   // registers when given an f64 type.
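           // For illustration (hedged): "{ax}" with an i64 operand on a 64-bit
           // target is rewritten below to RAX in GR64, while with an i8 operand
           // it becomes AL in GR8.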
   40667   const TargetRegisterClass *Class = Res.second;
   40668   // The generic code will match the first register class that contains the
   40669   // given register. Thus, based on the ordering of the tablegened file,
   40670   // the "plain" GR classes might not come first.
   40671   // Therefore, use a helper method.
   40672   if (isGRClass(*Class)) {
   40673     unsigned Size = VT.getSizeInBits();
   40674     if (Size == 1) Size = 8;
   40675     unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
   40676     if (DestReg > 0) {
   40677       bool is64Bit = Subtarget.is64Bit();
   40678       const TargetRegisterClass *RC =
   40679           Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
   40680         : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
   40681         : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
   40682         : &X86::GR64RegClass;
   40683       if (RC->contains(DestReg))
   40684         Res = std::make_pair(DestReg, RC);
   40685     } else {
   40686       // No register found/type mismatch.
   40687       Res.first = 0;
   40688       Res.second = nullptr;
   40689     }
   40690   } else if (isFRClass(*Class)) {
   40691     // Handle references to XMM physical registers that got mapped into the
   40692     // wrong class.  This can happen with constraints like {xmm0} where the
   40693     // target independent register mapper will just pick the first match it can
   40694     // find, ignoring the required type.
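             // For illustration (hedged): "{xmm0}" with a <4 x float> operand
             // may first be matched to XMM0 in a scalar class; the code below
             // rewrites the class to VR128 so the vector type is legal for it.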
   40695 
   40696     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
   40697     if (VT == MVT::f32 || VT == MVT::i32)
   40698       Res.second = &X86::FR32RegClass;
   40699     else if (VT == MVT::f64 || VT == MVT::i64)
   40700       Res.second = &X86::FR64RegClass;
   40701     else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
   40702       Res.second = &X86::VR128RegClass;
   40703     else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
   40704       Res.second = &X86::VR256RegClass;
   40705     else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
   40706       Res.second = &X86::VR512RegClass;
   40707     else {
   40708       // Type mismatch and not a clobber: return an error.
   40709       Res.first = 0;
   40710       Res.second = nullptr;
   40711     }
   40712   }
   40713 
   40714   return Res;
   40715 }
   40716 
   40717 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
   40718                                             const AddrMode &AM, Type *Ty,
   40719                                             unsigned AS) const {
   40720   // Scaling factors are not free at all.
   40721   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
   40722   // will take 2 allocations in the out-of-order engine instead of 1
   40723   // for a plain addressing mode, i.e. inst (reg1).
   40724   // E.g.,
   40725   // vaddps (%rsi,%rdx), %ymm0, %ymm1
   40726   // Requires two allocations (one for the load, one for the computation)
   40727   // whereas:
   40728   // vaddps (%rsi), %ymm0, %ymm1
   40729   // Requires just 1 allocation, i.e., freeing allocations for other operations
   40730   // and having fewer micro-operations to execute.
   40731   //
   40732   // For some X86 architectures, this is even worse because for instance for
   40733   // stores, the complex addressing mode forces the instruction to use the
   40734   // "load" ports instead of the dedicated "store" port.
   40735   // E.g., on Haswell:
   40736   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
   40737   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
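           // For illustration (hedged): given the checks below, a legal mode
           // using only a base register (AM.Scale == 0) reports a cost of 0, a
           // legal scaled-index mode reports 1, and an illegal mode reports -1.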
   40738   if (isLegalAddressingMode(DL, AM, Ty, AS))
   40739     // Scale represents reg2 * scale, thus account for 1
   40740     // as soon as we use a second register.
   40741     return AM.Scale != 0;
   40742   return -1;
   40743 }
   40744 
   40745 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
   40746   // Integer division on x86 is expensive. However, when aggressively optimizing
   40747   // for code size, we prefer to use a div instruction, as it is usually smaller
   40748   // than the alternative sequence.
   40749   // The exception to this is vector division. Since x86 doesn't have vector
   40750   // integer division, leaving the division as-is is a loss even in terms of
   40751   // size, because it will have to be scalarized, while the alternative code
   40752   // sequence can be performed in vector form.
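           // For illustration (hedged): under minsize a scalar i32 sdiv by a
           // constant is kept as a single div, while a <4 x i32> sdiv still goes
           // through the usual expansion because scalarized divs would be larger.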
   40753   bool OptSize =
   40754       Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
   40755   return OptSize && !VT.isVector();
   40756 }
   40757 
   40758 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
   40759   if (!Subtarget.is64Bit())
   40760     return;
   40761 
   40762   // Update IsSplitCSR in X86MachineFunctionInfo.
   40763   X86MachineFunctionInfo *AFI =
   40764     Entry->getParent()->getInfo<X86MachineFunctionInfo>();
   40765   AFI->setIsSplitCSR(true);
   40766 }
   40767 
   40768 void X86TargetLowering::insertCopiesSplitCSR(
   40769     MachineBasicBlock *Entry,
   40770     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
   40771   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
   40772   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
   40773   if (!IStart)
   40774     return;
   40775 
   40776   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   40777   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
   40778   MachineBasicBlock::iterator MBBI = Entry->begin();
   40779   for (const MCPhysReg *I = IStart; *I; ++I) {
   40780     const TargetRegisterClass *RC = nullptr;
   40781     if (X86::GR64RegClass.contains(*I))
   40782       RC = &X86::GR64RegClass;
   40783     else
   40784       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
   40785 
   40786     unsigned NewVR = MRI->createVirtualRegister(RC);
   40787     // Create copy from CSR to a virtual register.
   40788     // FIXME: this currently does not emit CFI pseudo-instructions, it works
   40789     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
   40790     // nounwind. If we want to generalize this later, we may need to emit
   40791     // CFI pseudo-instructions.
   40792     assert(Entry->getParent()->getFunction().hasFnAttribute(
   40793                Attribute::NoUnwind) &&
   40794            "Function should be nounwind in insertCopiesSplitCSR!");
   40795     Entry->addLiveIn(*I);
   40796     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
   40797         .addReg(*I);
   40798 
   40799     // Insert the copy-back instructions right before the terminator.
   40800     for (auto *Exit : Exits)
   40801       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
   40802               TII->get(TargetOpcode::COPY), *I)
   40803           .addReg(NewVR);
   40804   }
   40805 }
   40806 
   40807 bool X86TargetLowering::supportSwiftError() const {
   40808   return Subtarget.is64Bit();
   40809 }
   40810 
   40811 /// Returns the name of the symbol used to emit stack probes or the empty
   40812 /// string if not applicable.
   40813 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
   40814   // If the function specifically requests stack probes, emit them.
   40815   if (MF.getFunction().hasFnAttribute("probe-stack"))
   40816     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
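           // For illustration (hedged): a function carrying the IR attribute
           //   "probe-stack"="__hypothetical_probe"
           // makes this hook return __hypothetical_probe regardless of the
           // target OS; the symbol name is made up for this example.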
   40817 
   40818   // Generally, if we aren't on Windows, the platform ABI does not include
   40819   // support for stack probes, so don't emit them.
   40820   if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
   40821       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
   40822     return "";
   40823 
   40824   // We need a stack probe to conform to the Windows ABI. Choose the right
   40825   // symbol.
   40826   if (Subtarget.is64Bit())
   40827     return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
   40828   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
   40829 }
   40830